Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit

Permalink
fix: dropout tests are no longer broken
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Sep 12, 2024
1 parent f65dd15 commit 7162f43
Showing 1 changed file with 2 additions and 6 deletions.
8 changes: 2 additions & 6 deletions test/common_ops/dropout_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,7 @@ end
x -> sum(first(dropout(rng, x, mask, p, Val(true), Val(true), invp, :)))
end
test_gradients(__f, x; atol=1.0f-3, rtol=1.0f-3,
- soft_fail=(T == Float16 ? [AutoFiniteDiff()] : []),
- broken_backends=(T == Float16 && Sys.iswindows() ? [AutoEnzyme()] : []))
+ soft_fail=(T == Float16 ? [AutoFiniteDiff()] : []))

@jet sum(first(dropout(
rng, x, mask, T(0.5), Val(true), Val(true), T(2), :)))
Expand Down Expand Up @@ -105,11 +104,8 @@ end

soft_fail = T == Float16 ? Any[AutoFiniteDiff()] : []
skip_backends = length(x_shape) == 5 ? [AutoEnzyme()] : []
- broken_backends = T == Float16 && Sys.iswindows() && length(x_shape) != 5 ?
- [AutoEnzyme()] : []

- test_gradients(__f, x; atol=1.0f-3, rtol=1.0f-3, soft_fail, skip_backends,
- broken_backends)
+ test_gradients(__f, x; atol=1.0f-3, rtol=1.0f-3, soft_fail, skip_backends)

@jet sum(first(dropout(
rng, x, mask, T(0.5), Val(true), Val(false), T(2), :)))
Expand Down

3 comments on commit 7162f43

@avik-pal
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/115077

Tip: Release Notes

Did you know you can add release notes too? Just add Markdown-formatted text underneath the comment, after the text
"Release notes:", and it will be added to the registry PR; if TagBot is installed, it will also be added to the
release that TagBot creates. For example:

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here, just re-invoke the command and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v1.2.1 -m "<description of version>" 7162f4316c6d5067096fbc1e8008405f405cab43
git push origin v1.2.1

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LuxLib Benchmarks

Benchmark suite Current: 7162f43 Previous: 897d842 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5958.5 ns 5312.5 ns 1.12
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6875 ns 7792 ns 0.88
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 8292 ns 8000 ns 1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5583 ns 6958.5 ns 0.80
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 119005 ns 119033 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 912250 ns 825375 ns 1.11
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 404275 ns 401934 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9979 ns 9583 ns 1.04
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9917 ns 9875 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10458 ns 9875 ns 1.06
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9833.5 ns 9979 ns 0.99
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 553048 ns 554263 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 5278042 ns 2713291 ns 1.95
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 666908 ns 671997 ns 0.99
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1416 ns 7645.5 ns 0.19
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3000 ns 7500 ns 0.40
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 2083 ns 9750 ns 0.21
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 2916 ns 8521 ns 0.34
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 21729 ns 23694 ns 0.92
bias_activation(32, act=relu)(32 x 128)/forward/GPU/Metal 209229.5 ns 222062.5 ns 0.94
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU 28980 ns 31840 ns 0.91
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4250 ns 4770.5 ns 0.89
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4083.5 ns 5041 ns 0.81
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4209 ns 5583.5 ns 0.75
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 3958 ns 5062 ns 0.78
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 147955 ns 145766 ns 1.02
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/Metal 1637541 ns 1568604.5 ns 1.04
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU 144052 ns 146901 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58125 ns 56917 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46334 ns 47083 ns 0.98
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46875 ns 47375 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82000 ns 82792 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 37545 ns 39154 ns 0.96
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1091270.5 ns 1060708 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 78176 ns 81970 ns 0.95
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2039500 ns 2023687.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2086417 ns 2084333.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2090625 ns 2097166.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2001458 ns 1996667 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 236094 ns 220055 ns 1.07
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 5524250 ns 5389292 ns 1.03
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 981882 ns 1353254 ns 0.73
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 177916.5 ns 147146 ns 1.21
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 149750 ns 149750 ns 1
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 151999.5 ns 146270.5 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 155625 ns 150667 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 166025 ns 165828.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1652208 ns 1542042 ns 1.07
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 208252 ns 204932 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1118959 ns 1114916 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1115500 ns 1110250 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1118583 ns 1120437.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1120729.5 ns 1114709 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 702528 ns 688383 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6243791 ns 6685792 ns 0.93
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 921401 ns 1030010.5 ns 0.89
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5375 ns 4479 ns 1.20
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5333.5 ns 5417 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5979.5 ns 5583 ns 1.07
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4166 ns 4334 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 92951.5 ns 91302 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 464791 ns 449229 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 66471 ns 69581 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8666 ns 8458 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8709 ns 8625 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8917 ns 9292 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8667 ns 8375 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 609555 ns 588432.5 ns 1.04
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 6613792 ns 6040187 ns 1.09
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 389844 ns 387324 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17500 ns 17146 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 18312.5 ns 18438 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22041 ns 21500 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 17999.5 ns 17229.5 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 66046 ns 66199 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1312542 ns 1266312.5 ns 1.04
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 78481 ns 76211 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 221708.5 ns 215958 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 213209 ns 219125 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221208 ns 215188 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 220125 ns 221708 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 351410 ns 351090 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 5781271 ns 5667541.5 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 467066 ns 469564 ns 0.99
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 583 ns 7584 ns 0.07687236286919831
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 625 ns 8166.5 ns 0.07653217412600258
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 895.5 ns 11750 ns 0.0762127659574468
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 625 ns 8562.5 ns 0.072992700729927
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 20127 ns 22778 ns 0.88
bias_activation(2, act=relu)(2 x 128)/forward/GPU/Metal 299208 ns 301791 ns 0.99
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU 32561 ns 32530 ns 1.00
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1417 ns 2209 ns 0.64
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1417 ns 2417 ns 0.59
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1709 ns 2916.5 ns 0.59
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1417 ns 2375 ns 0.60
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 123446 ns 126097.5 ns 0.98
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/Metal 1707625 ns 1533792 ns 1.11
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU 134852 ns 135982 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7458 ns 14041 ns 0.53
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6125 ns 14167 ns 0.43
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6167 ns 14458 ns 0.43
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10250 ns 16709 ns 0.61
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23451 ns 32839 ns 0.71
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 641500 ns 609208 ns 1.05
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 47640 ns 56260 ns 0.85
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 234125 ns 227375 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 238167 ns 275292 ns 0.87
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 235875 ns 275000 ns 0.86
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 256229 ns 261458 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 190753.5 ns 202099.5 ns 0.94
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8815979 ns 8740042 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 641363 ns 655201 ns 0.98
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4084 ns 4125 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4125 ns 4125 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4125 ns 4125 ns 1
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4125 ns 4083 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23283 ns 22662 ns 1.03
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/Metal 223916 ns 219958 ns 1.02
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU 46130 ns 46610 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16500 ns 21041 ns 0.78
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 17000 ns 21791 ns 0.78
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 17208 ns 22250 ns 0.77
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16916 ns 20917 ns 0.81
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 192389 ns 205015 ns 0.94
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/Metal 1461979.5 ns 975584 ns 1.50
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU 171782 ns 182977 ns 0.94
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 511041.5 ns 509167 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 404542 ns 404417 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 405083 ns 405000 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 865583 ns 864791 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113186.5 ns 113334.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/Metal 432792 ns 421604.5 ns 1.03
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU 241413 ns 240942 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2283979.5 ns 2318229.5 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 2032375 ns 2030833 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2028375 ns 2041375 ns 0.99
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3275708 ns 3280292 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 238715 ns 250973.5 ns 0.95
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/Metal 2068312 ns 1903125 ns 1.09
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 738234 ns 725307 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6875 ns 5375 ns 1.28
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6562.5 ns 7604 ns 0.86
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7791.5 ns 8500 ns 0.92
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 7021 ns 6458.5 ns 1.09
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 89026 ns 89376.5 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 835166 ns 762334 ns 1.10
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 64940 ns 64761 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11395.5 ns 10583 ns 1.08
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10875 ns 11958 ns 0.91
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11813 ns 11958 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12250 ns 10792 ns 1.14
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 626379 ns 632512 ns 0.99
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 5900375 ns 5666041 ns 1.04
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 405390 ns 401324 ns 1.01
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 2625 ns 0.19
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 500 ns 2958 ns 0.17
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 3250 ns 0.17
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 2792 ns 0.18
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 22768 ns 30482.5 ns 0.75
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/Metal 329250 ns 340083 ns 0.97
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU 46571 ns 54341 ns 0.86
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2084 ns 10750 ns 0.19
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2125 ns 11833 ns 0.18
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2167 ns 13000 ns 0.17
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2125 ns 10625 ns 0.20
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 222641.5 ns 252151 ns 0.88
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/Metal 2041875 ns 1962708.5 ns 1.04
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU 177642 ns 189561.5 ns 0.94
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8625 ns 26500 ns 0.33
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8625 ns 31771 ns 0.27
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 10375 ns 35000 ns 0.30
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 8625 ns 28479 ns 0.30
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 103937.5 ns 121854.5 ns 0.85
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 808791.5 ns 730917 ns 1.11
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 72021 ns 80315.5 ns 0.90
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 18000 ns 22791.5 ns 0.79
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 18145.5 ns 25542 ns 0.71
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18583 ns 25334 ns 0.73
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 18416.5 ns 23000 ns 0.80
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 571815.5 ns 616060 ns 0.93
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 5404208 ns 5306187.5 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 375389.5 ns 388424 ns 0.97
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 500 ns 1667 ns 0.30
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 500 ns 2000 ns 0.25
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 584 ns 2167 ns 0.27
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 583 ns 1834 ns 0.32
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 34719 ns 40493 ns 0.86
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 470542 ns 296417 ns 1.59
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 45830 ns 48340 ns 0.95
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9208 ns 10000 ns 0.92
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9667 ns 11187.5 ns 0.86
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9458 ns 11958 ns 0.79
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9166.5 ns 10583 ns 0.87
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 247304 ns 266372 ns 0.93
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 5213666.5 ns 4716875 ns 1.11
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 365724 ns 379563.5 ns 0.96
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 398667 ns 396417 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288000 ns 287875 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 287959 ns 288125 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 755708 ns 756000 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 111672 ns 111465 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/Metal 399458 ns 367958 ns 1.09
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU 74445.5 ns 75531 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1404125 ns 1453958.5 ns 0.97
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1133833 ns 1136125 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1130625 ns 1142437.5 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2440291.5 ns 2444854 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 204531 ns 219029 ns 0.93
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/Metal 1656875 ns 1657083 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU 322048.5 ns 327328 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7458 ns 7042 ns 1.06
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7729.5 ns 8250 ns 0.94
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8645.5 ns 8833 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7104 ns 7209 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 140658.5 ns 141318 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 446584 ns 440833 ns 1.01
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 65181 ns 65171 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14854 ns 12000 ns 1.24
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14041.5 ns 14812 ns 0.95
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14812.5 ns 14750 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14500 ns 11917 ns 1.22
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 931410 ns 936057 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 6503021 ns 5924541.5 ns 1.10
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 419645 ns 423354 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 25750 ns 23584 ns 1.09
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 30229 ns 29312.5 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 27250 ns 31187 ns 0.87
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24749.5 ns 24833 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 197427 ns 197551 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1129250 ns 605479 ns 1.87
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 113061 ns 114941 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 114729 ns 108084 ns 1.06
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 151542 ns 124167 ns 1.22
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 149125 ns 154542 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 144292 ns 151166.5 ns 0.95
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1069359 ns 1062517 ns 1.01
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6073166.5 ns 6076604 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 585507 ns 587946 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 76500 ns 73750 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 75375 ns 82958 ns 0.91
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 82062.5 ns 78916 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 76375 ns 77042 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 205087.5 ns 204012 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 534500 ns 530625 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 126942 ns 129202 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 222708 ns 209875 ns 1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 218938 ns 218333 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 301625 ns 286250 ns 1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 211000 ns 224708 ns 0.94
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1112213 ns 1104104 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6842291.5 ns 6448250 ns 1.06
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 687778 ns 693286 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 16625 ns 15687.5 ns 1.06
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 17084 ns 17458 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 18250 ns 18250 ns 1
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 16979.5 ns 16812.5 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 145569.5 ns 144830 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 470166 ns 448250 ns 1.05
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 231123 ns 231562 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 27000.5 ns 24667 ns 1.09
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 26583.5 ns 26229.5 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 28167 ns 27083 ns 1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 28104 ns 24833 ns 1.13
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 973282.5 ns 963572.5 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 6243292 ns 6046333 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 686787 ns 687187 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 10708 ns 32208.5 ns 0.33
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 11833 ns 38208 ns 0.31
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13146 ns 43375 ns 0.30
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 11083 ns 31459 ns 0.35
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 122917 ns 138477.5 ns 0.89
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 909979.5 ns 880000 ns 1.03
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 234753 ns 243662 ns 0.96
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 22125 ns 23270.5 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 22208 ns 23917 ns 0.93
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 22833 ns 25145.5 ns 0.91
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21458 ns 22645.5 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 698911 ns 705404 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 5533791.5 ns 5486750 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 667848 ns 671427 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 63250 ns 63271 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 63291 ns 64396 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 68792 ns 66666 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 66896 ns 63375.5 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 104795 ns 106695.5 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1335750 ns 1328458 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 233243 ns 236317 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 465750 ns 437479.5 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 485500 ns 464312.5 ns 1.05
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 479729 ns 451499.5 ns 1.06
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 439584 ns 485145.5 ns 0.91
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 513938 ns 511151 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6244250 ns 6149750 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 700442.5 ns 716967 ns 0.98
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7896 ns 7542 ns 1.05
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7625 ns 7375 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8437.5 ns 8500 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7791.5 ns 7125 ns 1.09
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 143630.5 ns 142876.5 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 485125 ns 463208.5 ns 1.05
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 64771 ns 64690 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16271 ns 12958 ns 1.26
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14042 ns 13812 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15395.5 ns 14417 ns 1.07
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 12291 ns 15458 ns 0.80
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 937305 ns 934056 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5742562.5 ns 5680771 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 395454 ns 396764 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 6159333 ns 6145625 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 6381500 ns 6375834 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 6372812.5 ns 6379875 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 11913291 ns 11908958 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 301808 ns 348241 ns 0.87
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU 300133 ns 302192.5 ns 0.99
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 19143020.5 ns 19047770.5 ns 1.01
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 20001479.5 ns 19961208.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 19942104 ns 19978625 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 36496041.5 ns 36632228.5 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1192939 ns 1017536 ns 1.17
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU 1151383 ns 1157817 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 917 ns 3208 ns 0.29
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 958 ns 3541 ns 0.27
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 959 ns 4084 ns 0.23
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 958 ns 3250 ns 0.29
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 22767.5 ns 30273 ns 0.75
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/Metal 329667 ns 335958 ns 0.98
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU 206473 ns 212322 ns 0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3667 ns 11417 ns 0.32
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3667 ns 12291 ns 0.30
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3750 ns 15000 ns 0.25
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3667 ns 11459 ns 0.32
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 278806 ns 300887 ns 0.93
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/Metal 2160750 ns 2150875 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 619497 ns 613366 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8770.5 ns 32583 ns 0.27
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 7542 ns 39625 ns 0.19
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9396 ns 42125 ns 0.22
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8416.5 ns 31291 ns 0.27
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 119876.5 ns 134275.5 ns 0.89
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 873333.5 ns 782479 ns 1.12
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 72170 ns 81161 ns 0.89
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 11417 ns 17959 ns 0.64
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 11875 ns 19937.5 ns 0.60
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 12750 ns 20375 ns 0.63
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 12354.5 ns 18291 ns 0.68
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 634380 ns 653629 ns 0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 5505625 ns 4601291 ns 1.20
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 353023 ns 371429 ns 0.95
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 292 ns 291 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 250 ns 291 ns 0.86
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22115 ns 22327 ns 0.99
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/Metal 323000 ns 324479 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU 45911 ns 46841 ns 0.98
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2875 ns 6791 ns 0.42
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 3083 ns 7208 ns 0.43
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3375 ns 9375 ns 0.36
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2958 ns 6667 ns 0.44
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 201959 ns 215371.5 ns 0.94
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/Metal 1698500 ns 1703500 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU 162226.5 ns 166071 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10625 ns 10167 ns 1.05
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11791 ns 12875.5 ns 0.92
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 13292 ns 13125 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 12792 ns 11083 ns 1.15
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 120724 ns 120797.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 1021125 ns 935500 ns 1.09
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 237033 ns 233122 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 20104.5 ns 20709 ns 0.97
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 20416.5 ns 21875 ns 0.93
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 21667 ns 21750 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 22499.5 ns 22625 ns 0.99
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 591155 ns 590585 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 4924396 ns 4822000 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 647057 ns 648361 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4375 ns 6833 ns 0.64
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4458 ns 7041 ns 0.63
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4417 ns 7833 ns 0.56
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4458 ns 6875 ns 0.65
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 23658 ns 31284 ns 0.76
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/Metal 224687.5 ns 229313 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU 47091 ns 52301 ns 0.90
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16208 ns 26125 ns 0.62
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16583 ns 27209 ns 0.61
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16625 ns 30000 ns 0.55
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16541 ns 25834 ns 0.64
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 328446.5 ns 347032.5 ns 0.95
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/Metal 1736125 ns 1080292 ns 1.61
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU 206977.5 ns 216482.5 ns 0.96
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 2000 ns 3334 ns 0.60
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 2041 ns 3458 ns 0.59
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 2167 ns 3875 ns 0.56
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 2166 ns 3417 ns 0.63
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 35584 ns 41491.5 ns 0.86
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 496542 ns 397958 ns 1.25
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 202993 ns 206202 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 16625 ns 16917 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 17083 ns 20833 ns 0.82
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 19375 ns 21208 ns 0.91
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 17292 ns 17687.5 ns 0.98
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 291555 ns 288016 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5729000 ns 5201083 ns 1.10
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 685537 ns 696531.5 ns 0.98
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 58875 ns 55459 ns 1.06
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 64917 ns 64896 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 65979.5 ns 65583.5 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 51500 ns 51541.5 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66564 ns 66456 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU 114441 ns 113921 ns 1.00
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 135604 ns 132500 ns 1.02
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 160729.5 ns 166374.5 ns 0.97
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 161958.5 ns 111500 ns 1.45
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 220333 ns 316833 ns 0.70
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 213911 ns 217912 ns 0.98
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU 608437 ns 613066 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 109333.5 ns 80625 ns 1.36
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 125708 ns 125645.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 107083 ns 86146 ns 1.24
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84000 ns 82959 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 193351 ns 193130 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1827812 ns 1989666.5 ns 0.92
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 202822.5 ns 216662.5 ns 0.94
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1924187 ns 1912792 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1898125 ns 1921187.5 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1884187.5 ns 1912375 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1880291.5 ns 1908917 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 530663 ns 526124 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9424833.5 ns 8680374.5 ns 1.09
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1067912 ns 1069560 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 291 ns 2375 ns 0.12
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 291 ns 2792 ns 0.10
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 3459 ns 0.08441746169413125
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 2375 ns 0.12
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 21196 ns 28282 ns 0.75
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/Metal 359916.5 ns 355667 ns 1.01
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU 41330 ns 46231 ns 0.89
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1833 ns 9625 ns 0.19
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1792 ns 13459 ns 0.13
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1834 ns 14166 ns 0.13
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1792 ns 9625 ns 0.19
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 249929 ns 270635.5 ns 0.92
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/Metal 1139125 ns 1067437.5 ns 1.07
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU 181082 ns 195496.5 ns 0.93
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 9916 ns 7958 ns 1.25
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9625 ns 9854 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 11687 ns 11000 ns 1.06
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6958 ns 7521 ns 0.93
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 117950.5 ns 116502.5 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 889354 ns 901250 ns 0.99
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 233722 ns 233553 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8500 ns 8500 ns 1
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8917 ns 10042 ns 0.89
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9583 ns 10208 ns 0.94
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10417 ns 8708 ns 1.20
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 519707.5 ns 518097.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 4836042 ns 4329250 ns 1.12
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 623196 ns 626366 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58000 ns 63291 ns 0.92
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46250 ns 58084 ns 0.80
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46125 ns 57292 ns 0.81
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 81708 ns 89791 ns 0.91
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 39849.5 ns 50283 ns 0.79
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1140667 ns 1167250 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 73531 ns 84641 ns 0.87
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1931958.5 ns 1912667 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1988875 ns 1975417 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1971000 ns 1966250 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1886041 ns 1870792 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 219155.5 ns 232632 ns 0.94
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11236021 ns 11151146 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1172433 ns 1177471 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 420583 ns 415042 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 421937.5 ns 419333.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 421541.5 ns 422000 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 418396 ns 417187.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 208248.5 ns 207638.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 544854.5 ns 542958.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 281798 ns 282777.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 791042 ns 667541 ns 1.19
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 776334 ns 748979 ns 1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 760667 ns 673708 ns 1.13
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 745854.5 ns 675250 ns 1.10
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1047869 ns 1040445 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6618917 ns 6673666.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 906724.5 ns 908713 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 3475167 ns 3514375 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 3435791.5 ns 3451021 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 3445666 ns 3440750 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 3354000 ns 3449792 ns 0.97
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 170188 ns 184364 ns 0.92
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1444500.5 ns 1385709 ns 1.04
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 419994 ns 425164 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 6226750 ns 6177354 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 6217084 ns 6248791 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 6216479 ns 6199834 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 6171167 ns 6163750 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 989017 ns 983055 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 8277583.5 ns 8007792 ns 1.03
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1541036 ns 1641310.5 ns 0.94
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 473104.5 ns 474292 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 341916.5 ns 345209 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 342625 ns 346500 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 902958 ns 905500 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 45561 ns 54357 ns 0.84
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/Metal 434709 ns 404541 ns 1.07
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU 242062 ns 247753 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2288334 ns 2334208 ns 0.98
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 2037291 ns 2038708.5 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 2030792 ns 2043542 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3276500 ns 3293041.5 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 265294 ns 278906.5 ns 0.95
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/Metal 2215417 ns 2088084 ns 1.06
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 765908 ns 756552 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57833 ns 62291 ns 0.93
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 45833 ns 58500 ns 0.78
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46083 ns 56958 ns 0.81
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82333 ns 89250 ns 0.92
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 27901.5 ns 38171 ns 0.73
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1140354.5 ns 1166417 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 77831 ns 86475.5 ns 0.90
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2037292 ns 2035708 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2096667 ns 2102479 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2095750 ns 2080937.5 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1972770.5 ns 2008875 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 232003 ns 245090 ns 0.95
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11459667 ns 11967000 ns 0.96
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1054811 ns 1207986.5 ns 0.87
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58333 ns 57709 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46333 ns 48084 ns 0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 46583 ns 49000 ns 0.95
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82125 ns 83667 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 48836.5 ns 56367 ns 0.87
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1107208 ns 1087374.5 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 71481 ns 80301 ns 0.89
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1939584 ns 1917209 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1977833.5 ns 1944583 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1921500 ns 1961125 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1901541 ns 1894375 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 237483 ns 246493.5 ns 0.96
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9889292 ns 9855333.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 914639.5 ns 1034015 ns 0.88
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 291 ns 1375 ns 0.21
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 291 ns 1667 ns 0.17
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 1959 ns 0.19
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 1375 ns 0.21
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 34044.5 ns 39676 ns 0.86
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 479792 ns 286209 ns 1.68
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 45711 ns 48840 ns 0.94
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6375 ns 7375 ns 0.86
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7166.5 ns 9083 ns 0.79
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7042 ns 9479.5 ns 0.74
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6875 ns 7583 ns 0.91
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 208323.5 ns 213194.5 ns 0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 5275958 ns 4692500 ns 1.12
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 365689 ns 380024 ns 0.96
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 250 ns 291 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 291 ns 291 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32143 ns 32533 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/Metal 257083 ns 254979.5 ns 1.01
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU 38641 ns 37420 ns 1.03
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 3083 ns 6042 ns 0.51
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2834 ns 7083 ns 0.40
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 3375 ns 9333 ns 0.36
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 3334 ns 6083 ns 0.55
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 186317 ns 199543 ns 0.93
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/Metal 955000 ns 950520.5 ns 1.00
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU 153876.5 ns 164831 ns 0.93
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 456103.5 ns 437749.5 ns 1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 457959 ns 487292 ns 0.94
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 449833.5 ns 466021 ns 0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 446792 ns 442021 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 137834 ns 143480 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2047375 ns 2179687.5 ns 0.94
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 367534 ns 370168.5 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3827084 ns 3794500 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3779125 ns 3803417 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3815354.5 ns 3791458 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3805250 ns 3801709 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 704868 ns 707529.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10646500 ns 10857667 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1467966 ns 1463934 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 49950250 ns 49798563 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 35502667 ns 35524209 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 35525542 ns 35534958 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 96946146 ns 97214791.5 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1591119 ns 1600126 ns 0.99
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU 1048170.5 ns 1047610 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 154686729 ns 153739771 ns 1.01
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 112459458.5 ns 112306958.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 112502167 ns 112388250 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 294796688 ns 294975583 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6513322 ns 6485489.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU 5586938 ns 5559847 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 18083.5 ns 21209 ns 0.85
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 18875 ns 20792 ns 0.91
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 17666 ns 20667 ns 0.85
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 15333 ns 23334 ns 0.66
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 20058 ns 23699 ns 0.85
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/Metal 263167 ns 222416.5 ns 1.18
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU 27460 ns 28521 ns 0.96
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 11000 ns 11500 ns 0.96
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 9062.5 ns 10000 ns 0.91
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 9375 ns 10375 ns 0.90
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 17208 ns 18416 ns 0.93
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 258100 ns 259109.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/Metal 1636271 ns 1578917 ns 1.04
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU 148221 ns 147201.5 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8375 ns 24625 ns 0.34
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8500 ns 27667 ns 0.31
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9709 ns 30500 ns 0.32
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8667 ns 23937.5 ns 0.36
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 114814.5 ns 137987.5 ns 0.83
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 871583 ns 670209 ns 1.30
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 237832 ns 243177.5 ns 0.98
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10083 ns 10917 ns 0.92
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9458 ns 11500 ns 0.82
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10917 ns 11750 ns 0.93
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10000 ns 10792 ns 0.93
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 616125 ns 621051 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 5530209 ns 4704896 ns 1.18
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 649716 ns 650846 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9104 ns 8041 ns 1.13
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9729 ns 10271 ns 0.95
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11875 ns 11125 ns 1.07
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 10146 ns 9292 ns 1.09
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 119182.5 ns 119985.5 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 925000 ns 895208 ns 1.03
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 71721 ns 71901 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 14000 ns 13354 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13291.5 ns 13667 ns 0.97
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 16125 ns 13917 ns 1.16
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 16375 ns 13708 ns 1.19
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 587210 ns 585616 ns 1.00
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 4884791.5 ns 4221708 ns 1.16
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 342378 ns 351904 ns 0.97
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 459 ns 1542 ns 0.30
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 500 ns 1750 ns 0.29
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 583 ns 1792 ns 0.33
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 541 ns 1583 ns 0.34
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 34108 ns 40136 ns 0.85
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal 463166 ns 273959 ns 1.69
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 204103 ns 207332 ns 0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8500 ns 8750 ns 0.97
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7208 ns 9250 ns 0.78
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10250 ns 9291 ns 1.10
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8875 ns 8875 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 231051 ns 227150.5 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal 5606395.5 ns 4712916 ns 1.19
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 656736.5 ns 674086 ns 0.97
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 16708 ns 17875 ns 0.93
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 17709 ns 19167 ns 0.92
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 15459 ns 18896 ns 0.82
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 11812.5 ns 18125 ns 0.65
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 21417 ns 24199.5 ns 0.89
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/Metal 326333 ns 208625.5 ns 1.56
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU 187682 ns 187926.5 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 32083 ns 32417 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 32083 ns 32958 ns 0.97
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 32583 ns 33458 ns 0.97
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 32125 ns 32625 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 271002 ns 275193 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/Metal 1817666.5 ns 1674271 ns 1.09
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 588436 ns 588556 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 442583 ns 455833.5 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 484500 ns 470416.5 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 466916.5 ns 445500 ns 1.05
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 474875 ns 442125 ns 1.07
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 195143 ns 194972.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1992645.5 ns 2002875 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 366114 ns 368743 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3836708 ns 3826416.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 3833541 ns 3821625 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3838416.5 ns 3805291.5 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3820125 ns 3828770.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 541312.5 ns 539774 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 8971125 ns 9665250 ns 0.93
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1356384 ns 1360323 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 831437333 ns 787624562.5 ns 1.06
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 543898584 ns 541996916 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 543162208 ns 539785459 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 1507470958 ns 1557728417 ns 0.97
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22556380.5 ns 22543125 ns 1.00
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU 14756806 ns 14726018 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 2992149250 ns 2518400750 ns 1.19
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 2941572500 ns 1785169708 ns 1.65
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 1796840333 ns 1784676208 ns 1.01
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 4805069125 ns 5268664750 ns 0.91
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 365112590 ns 367578104 ns 0.99
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU 88955268 ns 88737971 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 76312 ns 75084 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 87000 ns 76541.5 ns 1.14
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 80792 ns 78958 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 77791.5 ns 75625 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 206518 ns 206590 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 538646 ns 947916 ns 0.57
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 106801 ns 120271 ns 0.89
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 299625 ns 193042 ns 1.55
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 288125 ns 278584 ns 1.03
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 206708.5 ns 194458 ns 1.06
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 252125 ns 249250 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1045048 ns 1038440 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6344792 ns 6277083 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 629272 ns 658001 ns 0.96
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 199941103.5 ns 199276312.5 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 139210416.5 ns 139271583 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 139343334 ns 139246333 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 389333417 ns 388477666 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5834950 ns 5836579.5 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU 3575806.5 ns 3573103 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 620327354.5 ns 619375645.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 439885000 ns 439498458 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 440156062.5 ns 439699604.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 1177316333 ns 1187020083 ns 0.99
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 26374289 ns 26508453 ns 0.99
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU 22085394.5 ns 22071416 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7208 ns 13833 ns 0.52
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6292 ns 13292 ns 0.47
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5709 ns 13625 ns 0.42
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10125 ns 16334 ns 0.62
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 27684 ns 37105 ns 0.75
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 612583 ns 682166 ns 0.90
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 48080 ns 56160 ns 0.86
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 215042 ns 219750 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 220416.5 ns 228708 ns 0.96
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221542 ns 229666.5 ns 0.96
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 207541.5 ns 213125 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 221316 ns 233596 ns 0.95
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9129500 ns 9102583 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 527145.5 ns 556036 ns 0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 9666.5 ns 8625 ns 1.12
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 8291 ns 9083.5 ns 0.91
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 9979 ns 10416 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 8416.5 ns 8125 ns 1.04
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 115804.5 ns 116194 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 910250 ns 900041.5 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 69921 ns 73561 ns 0.95
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8271 ns 7583 ns 1.09
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7250 ns 8084 ns 0.90
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10417 ns 8208 ns 1.27
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10416.5 ns 7708 ns 1.35
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 516584 ns 515823 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 4789687 ns 4141083 ns 1.16
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 315163 ns 319483 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 6709 ns 0.07452675510508272
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 416 ns 7000 ns 0.05942857142857143
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 667 ns 7250 ns 0.092
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 500 ns 6833 ns 0.07317430118542367
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 25989 ns 35385 ns 0.73
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 482354 ns 317896 ns 1.52
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 46350 ns 58250 ns 0.80
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 12250 ns 15083 ns 0.81
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 8708 ns 16500 ns 0.53
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 13250 ns 17375 ns 0.76
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14583 ns 15583 ns 0.94
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 251956 ns 263078 ns 0.96
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5957666 ns 5365146 ns 1.11
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 390104 ns 398084 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 107083 ns 111604 ns 0.96
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 99458.5 ns 106999.5 ns 0.93
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 100667 ns 111125 ns 0.91
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 146396 ns 158562.5 ns 0.92
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 24308 ns 27181 ns 0.89
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/Metal 269875 ns 268208 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU 190462 ns 193052 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 513042 ns 479520.5 ns 1.07
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 501687 ns 510437.5 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 490687 ns 480729 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 494500 ns 479354.5 ns 1.03
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 229321 ns 233277 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/Metal 2440291 ns 2209500 ns 1.10
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 605607 ns 604431 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 5167 ns 5021 ns 1.03
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 5020.5 ns 5708.5 ns 0.88
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 6333 ns 6333.5 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 5166.5 ns 6625 ns 0.78
batchedmm(16, Bsize=32)/forward/GPU/CUDA 15753 ns 16031 ns 0.98
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU 83851 ns 84920 ns 0.99
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 12625 ns 12729.5 ns 0.99
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 10562.5 ns 11646 ns 0.91
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 11916 ns 12146 ns 0.98
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 17313 ns 17375 ns 1.00
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 211388 ns 216325.5 ns 0.98
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU 376734 ns 366004 ns 1.03
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 38250 ns 35312.5 ns 1.08
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 51833 ns 51479 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 52709 ns 53042 ns 0.99
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 13750 ns 13667 ns 1.01
batchedmm(16, Bsize=128)/forward/GPU/CUDA 21653 ns 21712 ns 1.00
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU 87391 ns 91931 ns 0.95
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 37167 ns 37354.5 ns 0.99
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 30854 ns 44104 ns 0.70
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 31854 ns 32958 ns 0.97
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 58271 ns 57917 ns 1.01
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 189565 ns 194626.5 ns 0.97
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU 418544 ns 399414 ns 1.05
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 1667 ns 8542 ns 0.20
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 1917 ns 9791.5 ns 0.20
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 2167 ns 11625 ns 0.19
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 1666.5 ns 9750 ns 0.17
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 20444 ns 23397 ns 0.87
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/Metal 314396.5 ns 305375 ns 1.03
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU 34190 ns 34271 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 2292 ns 3041 ns 0.75
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 2083 ns 3271 ns 0.64
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 2500 ns 3792 ns 0.66
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 2083 ns 3208 ns 0.65
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 202251.5 ns 206318 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/Metal 1617458 ns 1504750.5 ns 1.07
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU 140196.5 ns 141011.5 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5395.5 ns 4792 ns 1.13
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4875 ns 4708.5 ns 1.04
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6167 ns 6834 ns 0.90
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4834 ns 4667 ns 1.04
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 143952.5 ns 141147.5 ns 1.02
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 450959 ns 457167 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 69101 ns 68731 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9083 ns 8458.5 ns 1.07
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8417 ns 8459 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9417 ns 8750 ns 1.08
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8041.5 ns 8333 ns 0.97
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 869463 ns 861183 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 5940500 ns 5555937.5 ns 1.07
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 397364 ns 385044 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 56875 ns 58083 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 57750 ns 59084 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 57666 ns 59416 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 58042 ns 59416 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 37216 ns 43710 ns 0.85
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 666708 ns 532666 ns 1.25
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 210572.5 ns 207252 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 459395.5 ns 449104.5 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 469645.5 ns 465666.5 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 472604 ns 467437 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 434437.5 ns 435520.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 265753 ns 264164 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8430333 ns 8246875 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 835999 ns 831528 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 3320312.5 ns 3290708 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 2332854.5 ns 2334854.5 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 2338770.5 ns 2339729 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 6300103.5 ns 6308458 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA 204444 ns 204167 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU 218932.5 ns 218552 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 11527604 ns 11346209 ns 1.02
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 8330834 ns 8328312.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 8347500 ns 8321834 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 21053229.5 ns 21080084 ns 1.00
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 739381 ns 728462 ns 1.01
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU 1071796 ns 1058000 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4750 ns 5083.5 ns 0.93
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5145.5 ns 6875 ns 0.75
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7104 ns 7083 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5500 ns 6604 ns 0.83
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 137284.5 ns 136287.5 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 846417 ns 783520.5 ns 1.08
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 56935.5 ns 55800 ns 1.02
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10167 ns 7000 ns 1.45
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7250 ns 7417 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8875 ns 7375 ns 1.20
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7166 ns 7291.5 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 754765 ns 747674.5 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 5619792 ns 5585312.5 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 363983.5 ns 365323 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 120333 ns 110750 ns 1.09
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 123500 ns 127458.5 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 104166 ns 122542 ns 0.85
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 121959 ns 117167 ns 1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 151640 ns 156753 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2035604 ns 2136000 ns 0.95
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 203932 ns 226292.5 ns 0.90
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2022667 ns 2021500 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2031562.5 ns 2022042 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2025666.5 ns 2031021 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1707375 ns 2023917 ns 0.84
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 702589 ns 706711 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11089875 ns 10690542 ns 1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1248902 ns 1254492 ns 1.00
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 33458 ns 28833.5 ns 1.16
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 36625 ns 36542 ns 1.00
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 35291 ns 34917 ns 1.01
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 584 ns 708.5 ns 0.82
batchedmm(2, Bsize=4)/forward/GPU/CUDA 15174 ns 15392 ns 0.99
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU 79811 ns 79601 ns 1.00
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 2604.5 ns 3250 ns 0.80
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 2708 ns 3833 ns 0.71
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 3667 ns 3917 ns 0.94
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2167 ns 2834 ns 0.76
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 138127.5 ns 139825 ns 0.99
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU 339374 ns 340743 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7334 ns 8416 ns 0.87
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6083 ns 7333 ns 0.83
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6000 ns 7541 ns 0.80
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10167 ns 11208 ns 0.91
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 36513 ns 42506 ns 0.86
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 649854 ns 420187.5 ns 1.55
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 47760 ns 50571 ns 0.94
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 214542 ns 213521 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 226417 ns 229958 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 226979 ns 222791.5 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 205334 ns 215375 ns 0.95
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 244233 ns 251022 ns 0.97
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8096000 ns 7930584 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 576056 ns 574850 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 6209 ns 0.63
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3916 ns 6375 ns 0.61
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 6459 ns 0.61
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3917 ns 6125 ns 0.64
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 21418 ns 28584 ns 0.75
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/Metal 247729.5 ns 251125 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU 42160 ns 47090 ns 0.90
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14625 ns 23167 ns 0.63
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14917 ns 24166 ns 0.62
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14959 ns 24375 ns 0.61
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14875 ns 23292 ns 0.64
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 308560 ns 333019.5 ns 0.93
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/Metal 1007917 ns 1014854.5 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU 197152 ns 208872 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 127625 ns 110000.5 ns 1.16
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 102583.5 ns 148604 ns 0.69
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 108542 ns 126750 ns 0.86
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 144833 ns 133125 ns 1.09
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 140996 ns 148515 ns 0.95
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 2049875 ns 2080104 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 205082 ns 217122 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1924833 ns 1912521 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1933417 ns 1906583 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1927375 ns 1884312.5 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1690041 ns 1920187.5 ns 0.88
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 688959 ns 696456 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10643666 ns 10487959 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1213162 ns 1218296 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 16916 ns 17542 ns 0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 22833 ns 22458 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22083.5 ns 20771 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18791 ns 18771 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 108917 ns 112142.5 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1343250 ns 1340625 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 80395.5 ns 80871 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 217750 ns 215875 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 216291.5 ns 253583 ns 0.85
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 219333 ns 217667 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 217312.5 ns 216417 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 519232 ns 525953.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6202583.5 ns 6121020.5 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 476885 ns 476639.5 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 25542 ns 23979.5 ns 1.07
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 30750 ns 32625 ns 0.94
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 28270.5 ns 28250 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 1291 ns 1666.5 ns 0.77
batchedmm(16, Bsize=4)/forward/GPU/CUDA 15852 ns 16428 ns 0.96
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU 87461 ns 81141 ns 1.08
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 4646 ns 5271 ns 0.88
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 4667 ns 5854.5 ns 0.80
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 5437.5 ns 6437.5 ns 0.84
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 4791 ns 5646 ns 0.85
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 205245 ns 215206.5 ns 0.95
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU 379294 ns 379243.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 305167 ns 303000 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 305500 ns 305416.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 307958.5 ns 308771 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 307625 ns 305083 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 227913 ns 231043 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1203958 ns 1184000 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 273823 ns 272543 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 577625 ns 529833.5 ns 1.09
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 541708.5 ns 567729.5 ns 0.95
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 538625 ns 533292 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 531000 ns 536958.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1079009 ns 1091736 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6297000 ns 6208000 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 857439 ns 868528 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 19583 ns 36042 ns 0.54
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 21020.5 ns 39083 ns 0.54
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 23146 ns 42458 ns 0.55
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 20125 ns 37041 ns 0.54
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 113331 ns 131591 ns 0.86
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1478062.5 ns 1464375 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 79341 ns 87560 ns 0.91
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 219375 ns 215250.5 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 223208 ns 215104.5 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221333 ns 215917 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213250 ns 219083.5 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 742408.5 ns 768516 ns 0.97
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7389833.5 ns 7384104 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 536040.5 ns 532425 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6459 ns 5500 ns 1.17
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6167 ns 7000 ns 0.88
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8271 ns 8542 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6709 ns 6334 ns 1.06
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 140459.5 ns 140673 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 825583 ns 772083 ns 1.07
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 64880 ns 67510 ns 0.96
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10917 ns 10271 ns 1.06
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10291.5 ns 10292 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11000 ns 10833.5 ns 1.02
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9042 ns 10875 ns 0.83
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 822871.5 ns 833045.5 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 5686479 ns 5336083 ns 1.07
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 386913.5 ns 390258.5 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4666 ns 4709 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4729 ns 5500 ns 0.86
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6166.5 ns 6709 ns 0.92
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4729 ns 6542 ns 0.72
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 142533.5 ns 144256 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 842166 ns 792979 ns 1.06
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 66671 ns 66931 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7750 ns 7125.5 ns 1.09
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7520.5 ns 7917 ns 0.95
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7792 ns 7917 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7334 ns 7604.5 ns 0.96
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 781913 ns 790107 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 6165208 ns 5547667 ns 1.11
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 390103 ns 391578.5 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 14542834 ns 14365625 ns 1.01
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 10133792 ns 10109792 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 10143250 ns 10132375 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 27710708 ns 27659333 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/CUDA 529129.5 ns 534508 ns 0.99
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU 397218.5 ns 392324 ns 1.01
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 46516250 ns 45855833 ns 1.01
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 33510145.5 ns 33506395.5 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 33544583 ns 33525958 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 85322958 ns 85233208 ns 1.00
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2673514 ns 2804828.5 ns 0.95
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU 3288493 ns 3316671 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 68541 ns 83646 ns 0.82
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 66354 ns 87875 ns 0.76
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 69208 ns 90333 ns 0.77
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 66250 ns 85687.5 ns 0.77
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 120674 ns 124763.5 ns 0.97
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1454437.5 ns 1478042 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 228432 ns 248002.5 ns 0.92
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 450875 ns 442062 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 484020.5 ns 451167 ns 1.07
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 448417 ns 444167 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 455791 ns 441479 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 730668 ns 747087 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7716375 ns 7697145.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 781067 ns 784227 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 542 ns 1750 ns 0.31
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 541 ns 1875 ns 0.29
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 584 ns 2000 ns 0.29
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 584 ns 1750 ns 0.33
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 32007 ns 38564 ns 0.83
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal 466021 ns 469896 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 48911 ns 50030 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9375 ns 10250 ns 0.91
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8125 ns 10938 ns 0.74
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9562.5 ns 11042 ns 0.87
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8583 ns 10625 ns 0.81
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 284659 ns 286642 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal 5684229 ns 4815583.5 ns 1.18
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 375694 ns 389993 ns 0.96
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9791 ns 9833 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 9833 ns 9834 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 9792 ns 9833 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 9875 ns 9792 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 22881 ns 23280 ns 0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/Metal 221417 ns 228792 ns 0.97
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU 210902 ns 204342 ns 1.03
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 45708 ns 49584 ns 0.92
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 45875 ns 50542 ns 0.91
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 46292 ns 50708 ns 0.91
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 45875 ns 49917 ns 0.92
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 288887.5 ns 308151 ns 0.94
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/Metal 1407145.5 ns 1545500 ns 0.91
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 605076 ns 603836 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 56709 ns 62834 ns 0.90
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 57166 ns 64292 ns 0.89
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 57041 ns 64333 ns 0.89
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 57875 ns 64250 ns 0.90
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 28717 ns 39886 ns 0.72
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 606417 ns 638041.5 ns 0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 202332 ns 213412 ns 0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 458583 ns 456084 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 511562 ns 488791.5 ns 1.05
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 468541 ns 476146 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 444250 ns 491750 ns 0.90
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 246546 ns 263616 ns 0.94
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 9398625 ns 9629125 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 886918.5 ns 891718 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 622666.5 ns 638875 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 650042 ns 657062.5 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 626375 ns 647917 ns 0.97
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 609145.5 ns 637833 ns 0.96
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 206439.5 ns 209655 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1405042 ns 1377917 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 302253 ns 308858 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2246916.5 ns 2231042 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2234750 ns 2234709 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2232959 ns 2231770.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2238478.5 ns 2224542 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 974225.5 ns 969019 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 7985125 ns 7164667 ns 1.11
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1218663 ns 1319082 ns 0.92
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 23292 ns 36750.5 ns 0.63
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 19541 ns 40083 ns 0.49
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 23708.5 ns 42416 ns 0.56
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 20416 ns 36146 ns 0.56
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 113048.5 ns 131167.5 ns 0.86
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal 1461666 ns 1489541.5 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 79571 ns 89901 ns 0.89
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 229583 ns 221125 ns 1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 219083 ns 231999.5 ns 0.94
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 225166 ns 223062.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 260792 ns 220250 ns 1.18
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 729682 ns 745440.5 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal 7602084 ns 7764958 ns 0.98
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 553456 ns 549685 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 542 ns 6833 ns 0.07932094248499927
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 500 ns 7208 ns 0.06936736958934517
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 667 ns 7375 ns 0.0904406779661017
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 583 ns 6834 ns 0.08530875036581798
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 22763 ns 33512 ns 0.68
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal 508625 ns 444542 ns 1.14
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 47710 ns 57271 ns 0.83
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9104.5 ns 15833.5 ns 0.58
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9584 ns 17146 ns 0.56
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9750 ns 17041 ns 0.57
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9666 ns 16687.5 ns 0.58
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 266997 ns 282963.5 ns 0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal 6194875 ns 5994417 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 405084 ns 408498.5 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 9167 ns 9458.5 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 8354 ns 9167 ns 0.91
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10083 ns 10416.5 ns 0.97
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 9333 ns 8541.5 ns 1.09
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 119160 ns 120739.5 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal 906395.5 ns 888833.5 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 71140 ns 70321 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7437.5 ns 7500 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7833 ns 7667 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8042 ns 7895.5 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7834 ns 7417 ns 1.06
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 505714 ns 513454.5 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal 4336416.5 ns 3973625 ns 1.09
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 318483 ns 319713 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1542 ns 9354 ns 0.16
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1500 ns 9542 ns 0.16
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2000 ns 10583 ns 0.19
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1459 ns 9229 ns 0.16
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 20988 ns 24142 ns 0.87
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/Metal 308500 ns 305208 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU 188631.5 ns 190361 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3208 ns 4145.5 ns 0.77
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3375 ns 4208 ns 0.80
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3750 ns 4750 ns 0.79
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3292 ns 4167 ns 0.79
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 215843 ns 226431.5 ns 0.95
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/Metal 1789834 ns 1679312.5 ns 1.07
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 578526 ns 577155 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 149521 ns 155083 ns 0.96
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 131625 ns 136375 ns 0.97
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 129375 ns 140958 ns 0.92
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 225125 ns 232833.5 ns 0.97
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 23704 ns 26998 ns 0.88
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/Metal 277145.5 ns 297875 ns 0.93
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU 41105.5 ns 42431 ns 0.97
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 160500 ns 144458 ns 1.11
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 127875 ns 127291 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 111292 ns 112104.5 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 285104 ns 252250 ns 1.13
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 215659.5 ns 219049 ns 0.98
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/Metal 2088792 ns 2074312 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU 218702 ns 265923 ns 0.82
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7291 ns 8583 ns 0.85
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6042 ns 7292 ns 0.83
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 6041 ns 7292 ns 0.83
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10125 ns 11333 ns 0.89
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 32743 ns 38422 ns 0.85
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 684042 ns 374313 ns 1.83
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 50791 ns 51281 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 258458 ns 221417 ns 1.17
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 269000 ns 229791 ns 1.17
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 232916 ns 230458.5 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213583 ns 214041.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 264603 ns 259283 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 8332437.5 ns 8241896 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 596726 ns 592306 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 15208 ns 15479 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 14750 ns 15375 ns 0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 16812.5 ns 17458 ns 0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 15500 ns 15542 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 139047.5 ns 137835 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal 826750 ns 778728.5 ns 1.06
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 231322 ns 231852 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24000 ns 23417 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 23459 ns 23791 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 24833 ns 24000 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 23729.5 ns 23937 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 863000.5 ns 858271 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal 5971562.5 ns 5635500 ns 1.06
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 676426 ns 677086 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 9125 ns 26604 ns 0.34
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 9792 ns 28250 ns 0.35
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 11625 ns 31333 ns 0.37
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 9959 ns 26812.5 ns 0.37
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 123210 ns 137010 ns 0.90
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal 822750 ns 925417 ns 0.89
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 73491 ns 82411 ns 0.89
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13792 ns 14792 ns 0.93
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13834 ns 15708 ns 0.88
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14708 ns 16000 ns 0.92
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13959 ns 15645.5 ns 0.89
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 662447.5 ns 668142 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal 5427958 ns 5325770.5 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 363743 ns 366524 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9500 ns 9312.5 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9917 ns 9416 ns 1.05
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10917 ns 10583 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9500 ns 9542 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 121865.5 ns 121280 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal 947416 ns 932375 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 72530 ns 72561 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12625 ns 12354 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12791 ns 13000 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13500 ns 13042 ns 1.04
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12792 ns 12458.5 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 548713 ns 545614 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal 4635584 ns 4752396 ns 0.98
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 338153 ns 340553.5 ns 0.99
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 31166.5 ns 26958 ns 1.16
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 34604.5 ns 34792 ns 0.99
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 32000.5 ns 32041.5 ns 1.00
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 2041 ns 1958.5 ns 1.04
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16060 ns 16169 ns 0.99
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU 80571 ns 80481 ns 1.00
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 5459 ns 6042 ns 0.90
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 5229.5 ns 6208 ns 0.84
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 5500 ns 6520.5 ns 0.84
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 6375 ns 6834 ns 0.93
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 137726 ns 141884.5 ns 0.97
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU 367863 ns 371004 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 292 ns 6458 ns 0.0452152369154537
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 292 ns 6834 ns 0.04272753877670471
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 6875 ns 0.05454545454545454
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 6375 ns 0.04580392156862745
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 25579 ns 34623 ns 0.74
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 497667 ns 457312.5 ns 1.09
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 46710 ns 56171 ns 0.83
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6250 ns 12916 ns 0.48
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6333 ns 13791 ns 0.46
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6875 ns 14084 ns 0.49
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6417 ns 13042 ns 0.49
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 186593 ns 198569 ns 0.94
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 6148312.5 ns 5453125 ns 1.13
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 386914 ns 396759 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 2000 ns 8292 ns 0.24
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 2042 ns 8625 ns 0.24
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 2042 ns 8833 ns 0.23
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 2042 ns 8333 ns 0.25
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 25958 ns 35748 ns 0.73
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal 499875 ns 324084 ns 1.54
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 204552 ns 215022 ns 0.95
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16250 ns 22770.5 ns 0.71
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 16375 ns 23812.5 ns 0.69
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 17208 ns 24291.5 ns 0.71
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17208 ns 22458 ns 0.77
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 273322 ns 284982 ns 0.96
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal 6425583 ns 5718333 ns 1.12
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 703277 ns 709637 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 151375 ns 149125 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 175375 ns 155917 ns 1.12
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 151625 ns 152500 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 148542 ns 148250 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 201338.5 ns 200827 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1421833 ns 1424250.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 235723 ns 214342 ns 1.10
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1323958 ns 1322854.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1324646 ns 1324334 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1329917 ns 1306187.5 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1331958 ns 1319750 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 910604 ns 894838 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 6755792 ns 6451042 ns 1.05
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1099850.5 ns 1104625 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 26000 ns 25541.5 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 26917 ns 25166 ns 1.07
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 27959 ns 27666 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 26333 ns 24084 ns 1.09
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 236208.5 ns 236708 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal 1183958.5 ns 1207792 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 115741 ns 114481 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 131958.5 ns 117291.5 ns 1.13
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 174417 ns 119125.5 ns 1.46
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 118937.5 ns 119021 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 134229.5 ns 129000 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1078567 ns 1066520 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal 6416541.5 ns 6154750 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 608010.5 ns 614935 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 250 ns 6417 ns 0.03895901511609787
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 292 ns 6750 ns 0.04325925925925926
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 6875 ns 0.05454545454545454
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 334 ns 6458 ns 0.051718798389594305
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 22647 ns 32046 ns 0.71
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal 486354 ns 304791.5 ns 1.60
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 46890 ns 56421 ns 0.83
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6354 ns 12958 ns 0.49
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6625 ns 13958 ns 0.47
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6917 ns 14104 ns 0.49
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6770.5 ns 12979.5 ns 0.52
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 203495.5 ns 219681.5 ns 0.93
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal 6258459 ns 5367125 ns 1.17
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 385664 ns 404804 ns 0.95
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7041 ns 6042 ns 1.17
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6125 ns 6958 ns 0.88
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7500 ns 8000 ns 0.94
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6792 ns 5812.5 ns 1.17
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 145023.5 ns 143745 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 543645.5 ns 721833 ns 0.75
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 232412 ns 232722 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9916.5 ns 9875 ns 1.00
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9917 ns 10417 ns 0.95
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10459 ns 10375 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9917 ns 10083.5 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 899150 ns 893962 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 6366229 ns 6022625 ns 1.06
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 670016 ns 667866 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 625 ns 666 ns 0.94
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 667 ns 666 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 667 ns 666 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 667 ns 667 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 22252 ns 22221.5 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/Metal 328833 ns 253958 ns 1.29
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU 206112 ns 206192 ns 1.00
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4583 ns 7958 ns 0.58
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4666 ns 8833 ns 0.53
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4916 ns 8875 ns 0.55
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4583 ns 8041 ns 0.57
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 226579.5 ns 238671.5 ns 0.95
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/Metal 1758521 ns 1611250 ns 1.09
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 576735 ns 575715 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7792 ns 24208 ns 0.32
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8459 ns 26562.5 ns 0.32
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 9666 ns 29458 ns 0.33
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8749.5 ns 25313 ns 0.35
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 121871.5 ns 134686.5 ns 0.90
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal 904542 ns 819479.5 ns 1.10
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 72230 ns 82871 ns 0.87
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8084 ns 9833 ns 0.82
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8917 ns 10750 ns 0.83
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8750 ns 10584 ns 0.83
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8771 ns 9959 ns 0.88
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 586988 ns 592874 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal 5366104 ns 4586229.5 ns 1.17
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 343713 ns 342583 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 126750 ns 125959 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 130333 ns 129958 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 129959 ns 130021 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 183083 ns 181187.5 ns 1.01
batchedmm(128, Bsize=4)/forward/GPU/CUDA 45749 ns 45830 ns 1.00
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU 102711 ns 105671 ns 0.97
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 334625 ns 325125 ns 1.03
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 345916.5 ns 323667 ns 1.07
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 313708 ns 316417 ns 0.99
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 607375 ns 616792 ns 0.98
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 190009.5 ns 194713 ns 0.98
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU 509425 ns 508449.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 399042 ns 400583 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288041 ns 290666 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 288083 ns 291292 ns 0.99
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756334 ns 759541 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 43243 ns 51490 ns 0.84
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/Metal 421334 ns 458875 ns 0.92
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU 81861 ns 84931 ns 0.96
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1414229.5 ns 1458459 ns 0.97
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1137312.5 ns 1140687.5 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 1123812 ns 1149666.5 ns 0.98
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2440958 ns 2451791 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 246507.5 ns 274619 ns 0.90
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/Metal 1867604 ns 1914208 ns 0.98
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU 350093 ns 358283 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 658792 ns 633666 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 649229 ns 663666.5 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 645250 ns 645687.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 613562.5 ns 632541 ns 0.97
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 199471 ns 200663 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1393834 ns 1352979.5 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 313773 ns 307532.5 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2462459 ns 2467667 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2455083 ns 2454750 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2436959 ns 2454500 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2440000 ns 2451167 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 993992.5 ns 984218.5 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10375437 ns 7766292 ns 1.34
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1301638 ns 1380642 ns 0.94
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 33187 ns 32292 ns 1.03
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 36020.5 ns 36875 ns 0.98
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 34437 ns 34000 ns 1.01
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 833 ns 958.5 ns 0.87
batchedmm(2, Bsize=32)/forward/GPU/CUDA 15321 ns 15278.5 ns 1.00
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU 76730 ns 78690.5 ns 0.98
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 3083.5 ns 3792 ns 0.81
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 3208 ns 4333 ns 0.74
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 3459 ns 4583.5 ns 0.75
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 3042 ns 4124.5 ns 0.74
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 136645 ns 140987 ns 0.97
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU 338843.5 ns 336043 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 408667 ns 413209 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 408167 ns 415792 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 408208 ns 416395.5 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 419208 ns 427145.5 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 42599 ns 54475 ns 0.78
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal 1163958 ns 1198125 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 238593 ns 250702 ns 0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3876875 ns 3877833 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4000249.5 ns 3995771 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 3985416.5 ns 3886792 ns 1.03
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3759708.5 ns 3754728.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 242677 ns 255856 ns 0.95
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal 11849584 ns 11943833 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1429804 ns 1432843 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3958 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 33734 ns 33843 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/Metal 245792 ns 177584 ns 1.38
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU 37910 ns 37790.5 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15500 ns 19500 ns 0.79
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15750 ns 20083 ns 0.78
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 16000 ns 20375 ns 0.79
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15750 ns 19708 ns 0.80
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 253719.5 ns 265715 ns 0.95
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/Metal 877916 ns 870334 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU 168172 ns 178112 ns 0.94
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 404041 ns 404500 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 295791 ns 296167 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 295750 ns 295667 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 760750 ns 760375 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 112938 ns 112966 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/Metal 472396 ns 439666 ns 1.07
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU 87831 ns 87800 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1425834 ns 1477375 ns 0.97
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1164145.5 ns 1147125 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1154500 ns 1158208 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2463334 ns 2470770.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 237064 ns 253070 ns 0.94
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/Metal 1932604 ns 1857708 ns 1.04
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU 351653 ns 354333 ns 0.99
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 500 ns 6708 ns 0.07453786523553965
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 583 ns 7125 ns 0.08182456140350877
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 583 ns 7125 ns 0.08182456140350877
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 584 ns 6666 ns 0.08760876087608761
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 25248 ns 34849 ns 0.72
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal 487000 ns 444292 ns 1.10
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 206212 ns 216212 ns 0.95
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7208 ns 14000 ns 0.51
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 7792 ns 15125 ns 0.52
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8000 ns 15584 ns 0.51
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7958 ns 14250 ns 0.56
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 210355 ns 223556.5 ns 0.94
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal 5898958 ns 5345375.5 ns 1.10
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 681746 ns 696921.5 ns 0.98
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 832291.5 ns 832708 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 620334 ns 618166 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 620375 ns 611542 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 1554083 ns 1540812.5 ns 1.01
batchedmm(128, Bsize=32)/forward/GPU/CUDA 129543 ns 130337.5 ns 0.99
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU 230863 ns 224742 ns 1.03
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 2682917 ns 2662417 ns 1.01
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 2002166.5 ns 2007708 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 2003625 ns 2003084 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 4890125 ns 4932771 ns 0.99
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 259430 ns 261909.5 ns 0.99
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU 768547 ns 835813 ns 0.92
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 291 ns 1375 ns 0.21
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 292 ns 1542 ns 0.19
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 1583 ns 0.24
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 375 ns 1375 ns 0.27
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 31540 ns 36888 ns 0.86
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal 476750 ns 366667 ns 1.30
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 46720 ns 49661 ns 0.94
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6083 ns 7687.5 ns 0.79
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6333 ns 8542 ns 0.74
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6667 ns 8291 ns 0.80
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6625 ns 7958 ns 0.83
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 229293 ns 219381 ns 1.05
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal 5740833 ns 4917833 ns 1.17
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 358943 ns 375813 ns 0.96
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2382542 ns 2401916.5 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2420709 ns 2401583 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2392542 ns 2379416 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2375083 ns 2371833 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 201753.5 ns 198341.5 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1502667 ns 2274958 ns 0.66
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 374933.5 ns 374084 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4665833 ns 4636458 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4657978.5 ns 4653166.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4661000 ns 4641125 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4630625 ns 4652750 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 899813 ns 889968 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 7005729 ns 6404438 ns 1.09
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1237162 ns 1356447.5 ns 0.91
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6896 ns 17208.5 ns 0.40
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 7000 ns 14583 ns 0.48
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7667 ns 16313 ns 0.47
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6916.5 ns 21229 ns 0.33
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 23032 ns 25470 ns 0.90
bias_activation(512, act=relu)(512 x 128)/forward/GPU/Metal 249750 ns 267750 ns 0.93
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU 37160.5 ns 42811 ns 0.87
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 45292 ns 45146 ns 1.00
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 33000 ns 49833 ns 0.66
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 45875 ns 34417 ns 1.33
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 32937.5 ns 73000.5 ns 0.45
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 215380 ns 218060 ns 0.99
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/Metal 2165104 ns 2129250 ns 1.02
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU 263512 ns 268402 ns 0.98
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 21604 ns 20459 ns 1.06
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 25667 ns 26208 ns 0.98
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 25291.5 ns 25292 ns 1.00
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 5250 ns 5333.5 ns 0.98
batchedmm(2, Bsize=512)/forward/GPU/CUDA 16144 ns 16594 ns 0.97
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU 84401 ns 83491 ns 1.01
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 11917 ns 12541 ns 0.95
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 10125 ns 11375 ns 0.89
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 10833 ns 11625 ns 0.93
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 19021 ns 19084 ns 1.00
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 225599 ns 227944.5 ns 0.99
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU 371243.5 ns 370203 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 406334 ns 409416 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 296833 ns 299958 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 297292 ns 300250 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 762458 ns 765750 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 45629.5 ns 53976 ns 0.85
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/Metal 508917 ns 442125 ns 1.15
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU 88701 ns 94470.5 ns 0.94
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1436625 ns 1489667 ns 0.96
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1165250 ns 1171812 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 1164416.5 ns 1175459 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2470667 ns 2480500 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 290172 ns 311892 ns 0.93
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/Metal 2138521 ns 2072208.5 ns 1.03
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU 373543.5 ns 370933 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 434312.5 ns 435250 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 436375 ns 438084 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 437145.5 ns 437333 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 446208 ns 448333 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 54620 ns 61295 ns 0.89
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 1097771 ns 1135104 ns 0.97
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 234342 ns 237222 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 3914542 ns 3895917 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4034250 ns 4001312.5 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4027166.5 ns 3913375.5 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 3797250 ns 3807916.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 262204 ns 261286 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 10527209 ns 9972333 ns 1.06
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1358593 ns 1208741 ns 1.12
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 8709 ns 11000 ns 0.79
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 7667 ns 10292 ns 0.74
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 7666 ns 10334 ns 0.74
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 12417 ns 14625 ns 0.85
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 23305 ns 30723 ns 0.76
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/Metal 229958 ns 233208.5 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU 209822 ns 215396.5 ns 0.97
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 44791 ns 52791 ns 0.85
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 45375 ns 53583 ns 0.85
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 45125 ns 54083.5 ns 0.83
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 45125 ns 53125 ns 0.85
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 345825.5 ns 366013 ns 0.94
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/Metal 1263834 ns 1891437.5 ns 0.67
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 655157 ns 643336 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 122229 ns 94209 ns 1.30
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 124458 ns 90833 ns 1.37
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 86958 ns 85958 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 86333.5 ns 126167 ns 0.68
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 190067.5 ns 190399.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal 2021459 ns 1996458 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 225357.5 ns 221047 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2020250 ns 2017500 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2026812.5 ns 2011417 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2019709 ns 1801333.5 ns 1.12
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2000187.5 ns 1978875 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 534191 ns 531205 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal 9193375 ns 9357625 ns 0.98
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 953884 ns 1089565 ns 0.88

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.