Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit

Permalink
ci: run tests only on 1.10 for now (#172)
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal authored Oct 8, 2024
1 parent ba739d3 commit 2d7533c
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 34 deletions.
12 changes: 6 additions & 6 deletions .buildkite/benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ steps:
- "8"
plugins:
- JuliaCI/julia#v1:
version: "1"
version: "1.10"
command: |
julia --project=benchmarks -e 'println("--- :julia: Instantiating project")
using Pkg
Expand All @@ -34,7 +34,7 @@ steps:
soft_fail: true
plugins:
- JuliaCI/julia#v1:
version: "1"
version: "1.10"
command: |
julia --project=benchmarks -e 'println("--- :julia: Instantiating project")
using Pkg
Expand All @@ -58,7 +58,7 @@ steps:
- label: "CUDA: Run Benchmarks"
plugins:
- JuliaCI/julia#v1:
version: "1"
version: "1.10"
command: |
julia --project=benchmarks -e 'println("--- :julia: Instantiating project")
using Pkg
Expand All @@ -84,7 +84,7 @@ steps:
soft_fail: true
plugins:
- JuliaCI/julia#v1:
version: "1"
version: "1.10"
command: |
julia --project=benchmarks -e 'println("--- :julia: Instantiating project")
using Pkg
Expand All @@ -110,7 +110,7 @@ steps:
soft_fail: true
plugins:
- JuliaCI/julia#v1:
version: "1"
version: "1.10"
command: |
julia --project=benchmarks -e 'println("--- :julia: Instantiating project")
using Pkg
Expand All @@ -137,7 +137,7 @@ steps:
- label: "Combine benchmarks"
plugins:
- JuliaCI/julia#v1:
version: "1"
version: "1.10"
command: |
buildkite-agent artifact download "benchmarks/results/*" .
Expand Down
13 changes: 6 additions & 7 deletions .buildkite/testing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ steps:
matrix:
setup:
julia:
- "1"
- "1.10"

- group: ":julia: AMD GPU"
steps:
Expand All @@ -49,7 +49,7 @@ steps:
matrix:
setup:
julia:
- "1"
- "1.10"

# - group: ":julia: Metal GPU"
# steps:
Expand All @@ -76,7 +76,7 @@ steps:
# matrix:
# setup:
# julia:
# - "1"
# - "1.10"

# - group: ":julia: oneAPI GPU"
# steps:
Expand All @@ -102,14 +102,14 @@ steps:
# matrix:
# setup:
# julia:
# - "1"
# - "1.10"

- group: ":telescope: Downstream CUDA"
steps:
- label: ":julia: {{matrix.repo}} (Julia 1 + CUDA GPU)"
plugins:
- JuliaCI/julia#v1:
version: "1"
version: "1.10"
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
Expand All @@ -132,7 +132,7 @@ steps:
- label: ":julia: {{matrix.repo}} (Julia 1 + AMD GPU)"
plugins:
- JuliaCI/julia#v1:
version: "1"
version: "1.10"
- JuliaCI/julia-coverage#v1:
codecov: true
dirs:
Expand All @@ -154,6 +154,5 @@ steps:
- "Lux"

env:
RETESTITEMS_TESTITEM_TIMEOUT: 3600
JULIA_PKG_SERVER: ""
SECRET_CODECOV_TOKEN: "wMpDLaAVEHe6EJAc+LZBl4jF3wADVN6F+15vr/ONJHOv/XXbtYovuc1PCQwhz0AzZjWpSO12IDTyKfwVgYvqaGYfQ9yGyplJtSu2MiL2k44B/IY+wEZhsfkBIhXlG89si5A/I+/f8T8QuwxBqBLh8fYq7oxC+gNzKhbj8vIT4n5hCusvYYGufgKRC2U9P4ij0Sf40egQ5B+StaTykqJNq1163UARjNBypHIVDbYE0HUHiF7WB4eI5LxBBzlcHmsUkuGp6ZlqAu/8C83k65lwDnyHDfjvBM24q9GQTDFA5r7RUfYKHElQEBPk3GhoJn7XGIfD2pC0VNcw5jYCwsX2mw==;U2FsdGVkX1+euKMib66zno5Kkw7OxXo6v4RnkAA/HElJM46qfX17VgZ9iVLg45jOOWRgghmyYuy2WQ8RcVbuOg=="
38 changes: 17 additions & 21 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,13 @@ jobs:
name: Julia ${{ matrix.version }} - ${{ matrix.test_group }} - ${{ matrix.os }} - ${{ matrix.blas_backend }}
if: ${{ !contains(github.event.head_commit.message, '[skip tests]') }}
runs-on: ${{ matrix.os }}
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
version:
- "1"
- "1.10"
os:
- ubuntu-latest
- macos-latest
- windows-latest
test_group:
- "conv"
- "dense"
Expand All @@ -46,22 +43,27 @@ jobs:
- "others"
blas_backend:
- "default"
exclude:
- os: macos-latest
test_group: "conv" # Never terminates
include:
- os: ubuntu-latest
test_group: "dense"
blas_backend: "blis"
version: "1"
version: "1.10"
- os: ubuntu-latest
test_group: "dense"
blas_backend: "mkl"
version: "1"
version: "1.10"
- os: macos-latest
test_group: "dense"
blas_backend: "appleaccelerate"
version: "1"
version: "1.10"
- os: macos-latest
test_group: "all"
blas_backend: "default"
version: "1.10"
- os: windows-latest
test_group: "all"
blas_backend: "default"
version: "1.10"
steps:
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
Expand Down Expand Up @@ -95,16 +97,13 @@ jobs:
downstream:
name: Downstream ${{ matrix.package.repo }}/${{ matrix.package.group }}
if: ${{ !contains(github.event.head_commit.message, '[skip tests]') && github.base_ref == github.event.repository.default_branch }}
runs-on: ${{ matrix.os }}
timeout-minutes: 60
runs-on: ubuntu-latest
env:
GROUP: ${{ matrix.package.group }}
LUX_TEST_GROUP: ${{ matrix.package.group }}
strategy:
fail-fast: false
matrix:
julia-version: ["1"]
os: [ubuntu-latest]
package:
- { user: LuxDL, repo: Lux.jl, group: "core_layers" }
- { user: LuxDL, repo: Lux.jl, group: "contrib" }
Expand All @@ -116,12 +115,12 @@ jobs:
- { user: LuxDL, repo: Lux.jl, group: "recurrent_layers" }
- { user: LuxDL, repo: Lux.jl, group: "eltype_match" }
- { user: LuxDL, repo: Lux.jl, group: "fluxcompat" }
- { user: LuxDL, repo: Boltz.jl, group: All }
- { user: LuxDL, repo: Boltz.jl, group: "all" }
steps:
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
with:
version: ${{ matrix.julia-version }}
version: "1.10"
arch: x64
- uses: julia-actions/julia-buildpkg@v1
- name: Clone Downstream
Expand Down Expand Up @@ -156,14 +155,11 @@ jobs:

downgrade:
if: ${{ !contains(github.event.head_commit.message, '[skip tests]') && github.base_ref == github.event.repository.default_branch }}
name: Downgrade Julia ${{ matrix.version }} - ${{ matrix.test_group }}
name: Downgrade Julia - ${{ matrix.test_group }}
runs-on: ubuntu-latest
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
version:
- "1"
test_group:
- "conv"
- "dense"
Expand All @@ -178,7 +174,7 @@ jobs:
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
with:
version: ${{ matrix.version }}
version: "1.10"
- uses: julia-actions/julia-downgrade-compat@v1
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
Expand Down

1 comment on commit 2d7533c

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LuxLib Benchmarks

Benchmark suite Current: 2d7533c Previous: ba739d3 Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5209 ns 5541 ns 0.94
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5208 ns 5208.5 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7291 ns 6834 ns 1.07
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6208 ns 4917 ns 1.26
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 115729 ns 102997 ns 1.12
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 2692776 ns
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 408504 ns 422395 ns 0.97
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10083 ns 10125 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10208 ns 10167 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10375 ns 9917 ns 1.05
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9833 ns 10020.5 ns 0.98
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 496762 ns 530333 ns 0.94
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 17703724 ns
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 10961843 ns 11174375 ns 0.98
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1312 ns 2854 ns 0.46
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1500 ns 1375 ns 1.09
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1875 ns 3750 ns 0.50
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1479.5 ns 2792 ns 0.53
bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA 20353.5 ns 19948 ns 1.02
bias_activation(32, act=relu)(32 x 128)/forward/GPU/oneAPI 1346068.5 ns
bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU 31961 ns 33501 ns 0.95
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4000 ns 3834 ns 1.04
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 4416 ns 4250 ns 1.04
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4500 ns 4208 ns 1.07
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4333 ns 4416 ns 0.98
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA 133606 ns 131207.5 ns 1.02
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/oneAPI 9495102 ns
bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU 147546.5 ns 146692 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57500 ns 58167 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46333 ns 39792 ns 1.16
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39750 ns 38209 ns 1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82562.5 ns 83208 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 36967.5 ns 36515 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 548600 ns
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 80581 ns 80481 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2024000 ns 2038875 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2088104 ns 2083750 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2081875 ns 2035541 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1983520.5 ns 2003250 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 218972 ns 217066 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 7891968 ns
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 973560 ns 1203774 ns 0.81
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 145834 ns 146333.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 172583 ns 147458 ns 1.17
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 151875.5 ns 174542 ns 0.87
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 176250 ns 150167 ns 1.17
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 167986 ns 167907.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 7801350.5 ns
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 197777 ns 171622 ns 1.15
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1108729.5 ns 1119853.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1105292 ns 1129187.5 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1119062.5 ns 1072541 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1108749.5 ns 1117229.5 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 642887 ns 620063 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 33405409 ns
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1027070 ns 1023002 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6083 ns 5021.5 ns 1.21
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4937.5 ns 5083 ns 0.97
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5896 ns 6417 ns 0.92
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5750 ns 4584 ns 1.25
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 83848 ns 79500 ns 1.05
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 5356951.5 ns
layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 69841 ns 59431 ns 1.18
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9000 ns 8833 ns 1.02
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9042 ns 8458 ns 1.07
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9042 ns 9083 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8542 ns 8958 ns 0.95
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 556012 ns 540188.5 ns 1.03
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 37949872 ns
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 395964 ns 390145 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18791 ns 17750 ns 1.06
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 16875 ns 17000 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 20917 ns 22125 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 22791.5 ns 18146 ns 1.26
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 61826 ns 61981.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3296125 ns
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 76391 ns 78051 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 211083 ns 212750 ns 0.99
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 218583.5 ns 257833 ns 0.85
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221999.5 ns 221375 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 211500 ns 221750 ns 0.95
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 328054 ns 323096 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 14617604.5 ns
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 468680 ns 463260 ns 1.01
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 750 ns 666 ns 1.13
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 666.5 ns 625 ns 1.07
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 917 ns 875 ns 1.05
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 625 ns 625 ns 1
bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA 19270 ns 18860 ns 1.02
bias_activation(2, act=relu)(2 x 128)/forward/GPU/oneAPI 1164614.5 ns
bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU 31200 ns 30120 ns 1.04
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1459 ns 1458 ns 1.00
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1417 ns 1375 ns 1.03
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1500 ns 1625 ns 0.92
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1459 ns 1375 ns 1.06
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA 115345.5 ns 114822.5 ns 1.00
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/oneAPI 8786881.5 ns
bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU 136362 ns 123847 ns 1.10
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7333 ns 7500 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5958 ns 5333 ns 1.12
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5458 ns 5333 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10167 ns 10459 ns 0.97
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 23777 ns 23715.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1195053 ns
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 49421 ns 46501 ns 1.06
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 228791 ns 227792 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 262833 ns 241750 ns 1.09
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 244208 ns 241584 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 227438 ns 227125 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 188310 ns 188481.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 30683195 ns
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 646667 ns 591832 ns 1.09
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4125 ns 4084 ns 1.01
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3916 ns 4125 ns 0.95
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4125 ns 3958 ns 1.04
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4083 ns 4125 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA 23548.5 ns 23784 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/oneAPI 2046712.5 ns
dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU 49050 ns 45550 ns 1.08
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16750 ns 16750 ns 1
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16833 ns 16792 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16833 ns 16791 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 17000 ns 16500 ns 1.03
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA 184716.5 ns 184666.5 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/oneAPI 10810606 ns
dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU 178062 ns 171442 ns 1.04
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 491291 ns 493292 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 385708 ns 312833 ns 1.23
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 313250 ns 310584 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 846667 ns 847917 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA 113504.5 ns 113490 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/oneAPI 400320 ns
dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU 243402 ns 243193 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2157041.5 ns 2121291 ns 1.02
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1860000 ns 1584833 ns 1.17
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1596917 ns 1574875 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3118291.5 ns 3034896 ns 1.03
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA 228877.5 ns 228348 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/oneAPI 9523997.5 ns
dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 743298 ns 739108 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 6541.5 ns 7021 ns 0.93
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6167 ns 6792 ns 0.91
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7145.5 ns 7958 ns 0.90
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6416 ns 6875 ns 0.93
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 82766.5 ns 82934 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 5786455 ns
layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 67260 ns 57300 ns 1.17
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 11708.5 ns 11520.5 ns 1.02
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10333 ns 11708 ns 0.88
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12417 ns 12062.5 ns 1.03
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10375 ns 10896 ns 0.95
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 599572.5 ns 598177.5 ns 1.00
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 36065836.5 ns
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 415124 ns 401725 ns 1.03
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 541 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 541 ns 542 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 500 ns 1
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA 23681.5 ns 23280.5 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/oneAPI 2157030 ns
dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU 49180 ns 48351 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2125 ns 2083 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2167 ns 2166 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2208 ns 2209 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2125 ns 2084 ns 1.02
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA 230420 ns 217524 ns 1.06
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/oneAPI 10946869 ns
dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU 182202 ns 178702 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 9208 ns 8542 ns 1.08
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8666.5 ns 9229.5 ns 0.94
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 9917 ns 11042 ns 0.90
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 8792 ns 8042 ns 1.09
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 100396.5 ns 92171 ns 1.09
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 3318002.5 ns
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 75271 ns 76060.5 ns 0.99
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 17229.5 ns 19125 ns 0.90
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 18479.5 ns 18895.5 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18625 ns 19375 ns 0.96
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 18000 ns 18458 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 575393.5 ns 534402.5 ns 1.08
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 16729549.5 ns
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 385864 ns 379154 ns 1.02
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 583 ns 583 ns 1
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 583 ns 542 ns 1.08
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 583 ns 500 ns 1.17
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 34044 ns 33745.5 ns 1.01
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 1236371 ns
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 48691 ns 45241 ns 1.08
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9625.5 ns 9104 ns 1.06
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9541.5 ns 9583 ns 1.00
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9709 ns 9187.5 ns 1.06
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 8833.5 ns 10042 ns 0.88
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 254859 ns 242113 ns 1.05
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 19246352.5 ns
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 375034 ns 367124 ns 1.02
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397208 ns 398958 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 287667 ns 215291 ns 1.34
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 215291 ns 213750 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 755625 ns 756041 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA 112458 ns 111898 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/oneAPI 340204 ns
dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU 76851 ns 77281 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1468271 ns 1396458 ns 1.05
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1130458 ns 859875 ns 1.31
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 858125 ns 847958 ns 1.01
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2440187.5 ns 2356833.5 ns 1.04
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA 199457 ns 199002 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/oneAPI 9886202 ns
dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU 322043 ns 322423 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 8021.5 ns 7250 ns 1.11
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 7875 ns 7625.5 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8750 ns 9062.5 ns 0.97
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7125 ns 7229 ns 0.99
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 134916.5 ns 126183.5 ns 1.07
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 5780710 ns
layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 70255.5 ns 57821 ns 1.22
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16917 ns 16959 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 15042 ns 14354.5 ns 1.05
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15979 ns 14792 ns 1.08
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 16000 ns 15042 ns 1.06
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 878404 ns 851673 ns 1.03
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 41935612.5 ns
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 433994 ns 420849.5 ns 1.03
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 28792 ns 32959 ns 0.87
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 25792 ns 29083.5 ns 0.89
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 28833.5 ns 30875 ns 0.93
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 30354.5 ns 25770.5 ns 1.18
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 183000.5 ns 184566 ns 0.99
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7959277.5 ns
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 115401 ns 110921 ns 1.04
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 112375 ns 160875 ns 0.70
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 144438 ns 124458 ns 1.16
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 105854.5 ns 145396 ns 0.73
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 150875 ns 157729 ns 0.96
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 977911 ns 1005586 ns 0.97
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 41813067 ns
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 589736 ns 576731 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 74166 ns 75875 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 74604 ns 75042 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 77333 ns 80959 ns 0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 76334 ns 74437.5 ns 1.03
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 189045 ns 190691 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7503392 ns
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 128881 ns 124242 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 295667 ns 300833 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 307166 ns 322542 ns 0.95
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 300000 ns 298292 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 276875.5 ns 219396 ns 1.26
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 986480 ns 1023572 ns 0.96
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 40933470 ns
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 697017.5 ns 692382 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 13166.5 ns 13000 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 13229 ns 13500 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 14833.5 ns 14833 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 13667 ns 13208 ns 1.03
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 133538.5 ns 136120 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 5773755.5 ns
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 236113 ns 234302 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 27000 ns 27083.5 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 27500 ns 26395.5 ns 1.04
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 27187.5 ns 27146 ns 1.00
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 27438 ns 27770.5 ns 0.99
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 917467.5 ns 907766 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 39999839 ns
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 698258 ns 693402 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11209 ns 11500 ns 0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 11292 ns 10875 ns 1.04
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13375 ns 13249.5 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 11083 ns 11666 ns 0.95
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 119722.5 ns 119510.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 3349179 ns
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 240142 ns 240667.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 23333 ns 23021 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 23084 ns 23312.5 ns 0.99
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 24000 ns 23917 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21958 ns 22708 ns 0.97
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 678230.5 ns 664160.5 ns 1.02
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 22343314.5 ns
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 678857 ns 675107 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 65021 ns 66750 ns 0.97
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 62875 ns 63542 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 68667 ns 68709 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 66417 ns 65000 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 101393 ns 101310 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3400903 ns
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 236963 ns 234673 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 477895.5 ns 466062.5 ns 1.03
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 476959 ns 478625 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 468750 ns 472875 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 495833 ns 518125 ns 0.96
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 488817 ns 484379 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 20464230 ns
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 715823 ns 712597 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7146 ns 7479 ns 0.96
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 8375 ns 7687.5 ns 1.09
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8500 ns 9958 ns 0.85
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 7021 ns 7667 ns 0.92
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 136539.5 ns 134386 ns 1.02
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 5535345 ns
layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 69291 ns 57600 ns 1.20
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 11458 ns 15750 ns 0.73
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14500 ns 16333 ns 0.89
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16125 ns 15250 ns 1.06
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13416 ns 15291 ns 0.88
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 886518 ns 880162.5 ns 1.01
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 37792827 ns
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 407319.5 ns 398914 ns 1.02
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 6154209 ns 6151875 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 6370021 ns 3226750 ns 1.97
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 3225542 ns 3223292 ns 1.00
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 11912875 ns 11913583 ns 1.00
batchedmm(512, Bsize=4)/forward/GPU/CUDA 345647 ns 350966 ns 0.98
batchedmm(512, Bsize=4)/forward/GPU/oneAPI 49342806 ns
batchedmm(512, Bsize=4)/forward/GPU/AMDGPU 305758 ns 302008 ns 1.01
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 19108188 ns 19126979 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 19939624.5 ns 11161229.5 ns 1.79
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 11149250 ns 11077916 ns 1.01
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 36445875 ns 36533646 ns 1.00
batchedmm(512, Bsize=4)/zygote/GPU/CUDA 1059965 ns 1006948.5 ns 1.05
batchedmm(512, Bsize=4)/zygote/GPU/oneAPI 79558988 ns
batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU 1166672 ns 1127082 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1000 ns 1042 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1000 ns 1042 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1083 ns 1042 ns 1.04
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 959 ns 1000 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA 23689 ns 23502 ns 1.01
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/oneAPI 2151476.5 ns
dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU 209622 ns 209393 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3917 ns 3958 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4041 ns 4083 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4000 ns 4041 ns 0.99
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3916 ns 3917 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA 274634 ns 270232 ns 1.02
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10742838 ns
dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 625596 ns 623846 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7292 ns 7833 ns 0.93
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 9000 ns 8042 ns 1.12
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10250 ns 9750 ns 1.05
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 9062.5 ns 7625 ns 1.19
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 116615 ns 116542 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 3546009 ns
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 69341 ns 69700 ns 0.99
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 12000 ns 12375 ns 0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 12667 ns 12458 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 12437.5 ns 12917 ns 0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 12417 ns 12292 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 605595 ns 604932 ns 1.00
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 22519876 ns
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 363803 ns 357073.5 ns 1.02
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA 22597.5 ns 22511.5 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/oneAPI 2178291 ns
dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU 48315.5 ns 46531 ns 1.04
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2834 ns 3167 ns 0.89
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2916 ns 3166 ns 0.92
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 3167 ns 3333 ns 0.95
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 3083 ns 2875 ns 1.07
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA 194557 ns 194011 ns 1.00
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/oneAPI 9614403 ns
dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU 170192 ns 158126.5 ns 1.08
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 11333 ns 12125 ns 0.93
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11459 ns 12333 ns 0.93
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 13708 ns 13708 ns 1
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 12333 ns 11937.5 ns 1.03
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 115903.5 ns 115429.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 3311083 ns
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 239372.5 ns 237322 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 20792 ns 22000 ns 0.95
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 23500 ns 24459 ns 0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 22395.5 ns 23396 ns 0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 21458.5 ns 21792 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 558538 ns 554065.5 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 19541146 ns
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 657037 ns 651546.5 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4375 ns 4375 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4167 ns 4416 ns 0.94
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4417 ns 4291 ns 1.03
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4375 ns 4375 ns 1
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA 24750 ns 24232 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/oneAPI 2038545 ns
dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU 49870 ns 48651 ns 1.03
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16708 ns 16208 ns 1.03
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16167 ns 16500 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16500 ns 16042 ns 1.03
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16667 ns 16250 ns 1.03
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA 317514 ns 316149 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/oneAPI 12292699 ns
dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU 212047.5 ns 208227 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 2083 ns 2083 ns 1
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 2125 ns 2083 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 2125 ns 2083 ns 1.02
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 2083 ns 2000 ns 1.04
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 35083 ns 34761 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 1184726 ns
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 206953 ns 205252 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 17250 ns 17937.5 ns 0.96
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 18667 ns 19271 ns 0.97
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 19584 ns 18584 ns 1.05
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 20125 ns 18375 ns 1.10
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 284678 ns 283100 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 20274746 ns
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 691617 ns 682562.5 ns 1.01
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 60292 ns 59229.5 ns 1.02
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 66792 ns 60896 ns 1.10
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 62000 ns 60959 ns 1.02
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 51125 ns 53792 ns 0.95
batchedmm(16, Bsize=512)/forward/GPU/CUDA 66448 ns 66317 ns 1.00
batchedmm(16, Bsize=512)/forward/GPU/oneAPI 87696389 ns
batchedmm(16, Bsize=512)/forward/GPU/AMDGPU 117412 ns 100931 ns 1.16
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 198916 ns 195625 ns 1.02
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 167229 ns 149417 ns 1.12
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 141417 ns 138292 ns 1.02
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 300125 ns 219291 ns 1.37
batchedmm(16, Bsize=512)/zygote/GPU/CUDA 209004 ns 208292.5 ns 1.00
batchedmm(16, Bsize=512)/zygote/GPU/oneAPI 147263909.5 ns
batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU 620696.5 ns 554746 ns 1.12
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 82583 ns 85062 ns 0.97
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 140250 ns 127458 ns 1.10
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 86417 ns 86104 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 116583 ns 86812.5 ns 1.34
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 191982.5 ns 192707 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5863118 ns
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 203942 ns 169152 ns 1.21
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1921771 ns 1926791.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1908917 ns 1918312.5 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1919708 ns 1895083 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1924521 ns 1862750 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 504208.5 ns 503729 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 26294676.5 ns
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1070976 ns 915670 ns 1.17
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 291 ns 291 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA 21855 ns 21463.5 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/oneAPI 2006228 ns
dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU 41700 ns 41990 ns 0.99
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1792 ns 1792 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1875 ns 1833 ns 1.02
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1833 ns 1833 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1792 ns 1834 ns 0.98
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA 242053 ns 244422 ns 0.99
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/oneAPI 10350039 ns
dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU 183192 ns 183082 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 9833 ns 11375 ns 0.86
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 9833 ns 10292 ns 0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 11709 ns 12166 ns 0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 10583 ns 9084 ns 1.17
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 116639.5 ns 113574.5 ns 1.03
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 3403003.5 ns
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 238567.5 ns 237182 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8875 ns 9583 ns 0.93
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10875 ns 12396 ns 0.88
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10125 ns 10750 ns 0.94
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9500 ns 9458 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 488952 ns 489512 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 20132943 ns
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 630866 ns 632057 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57875 ns 57959 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46958 ns 39208 ns 1.20
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39625 ns 38708 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82250 ns 83375 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 38551 ns 38522 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1316937 ns
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 79411 ns 78311 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1922646 ns 1724708.5 ns 1.11
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1979292 ns 1941208 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1942292 ns 1947834 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1900917 ns 1891208.5 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 210456 ns 210148.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 33978774 ns
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1015680 ns 998640 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 267333 ns 269083 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 269625 ns 268833 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 270729.5 ns 275875 ns 0.98
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 269645.5 ns 269729.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 192987.5 ns 193164 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7844239 ns
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 285143 ns 282737.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 698604 ns 587166.5 ns 1.19
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 671916.5 ns 614875 ns 1.09
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 667416 ns 651500 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 626771 ns 652062 ns 0.96
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 985897 ns 993619.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 45574369 ns
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 913670 ns 899480 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2218667 ns 2202416 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2215687 ns 2216125 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2220312.5 ns 2192812.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2213250 ns 2220500 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 157769 ns 179761.5 ns 0.88
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8237698 ns
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 425304 ns 415294 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5486562 ns 5520708 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5529917 ns 5537000 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5524333.5 ns 5449958.5 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5488625 ns 5515167 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 927722 ns 930917 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 53249072 ns
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1555466 ns 1711728 ns 0.91
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 478042 ns 477542 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 346167 ns 257375 ns 1.34
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 257167 ns 255375 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 909250 ns 908666 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA 46497 ns 46830 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/oneAPI 825183 ns
dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU 245473 ns 245313 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 2167292 ns 2116979 ns 1.02
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1862208 ns 1589770.5 ns 1.17
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1591771 ns 1579645.5 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 3122542 ns 3037833.5 ns 1.03
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA 255431 ns 274670.5 ns 0.93
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/oneAPI 12961347 ns
dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 773598 ns 769148 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57520.5 ns 57875 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46708 ns 39000 ns 1.20
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39292 ns 38458 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82500 ns 83333 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 28213 ns 28067 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1370930 ns
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 76011 ns 75041 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2032125 ns 2047334 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2090250 ns 2049854.5 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2068583 ns 2059333 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1997000 ns 1987666.5 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 223132 ns 227893 ns 0.98
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 35910018 ns
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1194083 ns 1038901 ns 1.15
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57812.5 ns 58000 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46708 ns 39333 ns 1.19
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39583 ns 38333 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82375 ns 83125 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 48361 ns 48807.5 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 762273.5 ns
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 80795.5 ns 67171 ns 1.20
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1928084 ns 1934875 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1964958 ns 1962667 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1966541.5 ns 1938167 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1886625 ns 1827396 ns 1.03
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 230366 ns 233324 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 16959659 ns
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 920174 ns 914834.5 ns 1.01
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 333 ns 292 ns 1.14
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 292 ns 333 ns 0.88
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 33705 ns 34314.5 ns 0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 1253501.5 ns
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 45940 ns 45171 ns 1.02
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6646 ns 6542 ns 1.02
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7395.5 ns 7083 ns 1.04
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7292 ns 7000 ns 1.04
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6417 ns 6958 ns 0.92
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 201838.5 ns 202653 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 21257580 ns
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 371664 ns 366114 ns 1.02
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 250 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA 32336 ns 32763 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/oneAPI 1213220 ns
dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU 37120 ns 38131 ns 0.97
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 3292 ns 2792 ns 1.18
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 3000 ns 3000 ns 1
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 3125 ns 3459 ns 0.90
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2666 ns 2875 ns 0.93
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA 182468 ns 184852 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/oneAPI 7479362 ns
dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU 151261 ns 151962 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 502687.5 ns 494188 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 491916.5 ns 500333.5 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 465083.5 ns 470041.5 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 498417 ns 489437 ns 1.02
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 134412 ns 134801.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5713043 ns
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 367259 ns 322243 ns 1.14
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4072041 ns 4053479 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4093021 ns 4072375 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4069979 ns 4033500 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4043667 ns 4070625 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 669547 ns 680027 ns 0.98
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 34596141 ns
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1474565 ns 1463545 ns 1.01
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 49859062 ns 49933854 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 35504667 ns 26023000 ns 1.36
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 26029000 ns 25982541.5 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 96942959 ns 97045646 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/CUDA 1621240 ns 1626445 ns 1.00
batchedmm(512, Bsize=32)/forward/GPU/oneAPI 55961032 ns
batchedmm(512, Bsize=32)/forward/GPU/AMDGPU 1046111 ns 1047410 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 154467896 ns 155000104.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 112182625 ns 89050542 ns 1.26
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 89208292 ns 88666916.5 ns 1.01
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 294884062.5 ns 295479666.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/CUDA 6486949 ns 6477658 ns 1.00
batchedmm(512, Bsize=32)/zygote/GPU/oneAPI 128111295 ns
batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU 5579662.5 ns 5560101.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 19541 ns 20062.5 ns 0.97
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 18625 ns 15500 ns 1.20
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 13917 ns 13833.5 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 15458.5 ns 15708.5 ns 0.98
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA 20271 ns 20427 ns 0.99
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/oneAPI 1104775.5 ns
bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU 26071 ns 25781 ns 1.01
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 10729.5 ns 11063 ns 0.97
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 9000 ns 7895.5 ns 1.14
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 8125 ns 7937.5 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 17291 ns 17375 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA 244379 ns 248558 ns 0.98
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/oneAPI 10081500 ns
bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU 148582 ns 143922 ns 1.03
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 8374.5 ns 8417 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8750 ns 10229 ns 0.86
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10833 ns 10375 ns 1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 9104.5 ns 8646 ns 1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 120247 ns 119635 ns 1.01
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 3746738 ns
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 239122.5 ns 239173 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9437.5 ns 10041.5 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 9708 ns 10667 ns 0.91
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11792 ns 10750 ns 1.10
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9500 ns 10145.5 ns 0.94
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 585732.5 ns 591757 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 22572008 ns
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 659212 ns 654107 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9083.5 ns 10375 ns 0.88
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9833.5 ns 9770.5 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 10375 ns 11312.5 ns 0.92
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9438 ns 9500 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 116564 ns 117527.5 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 3425324 ns
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 75361 ns 72401 ns 1.04
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13958 ns 14292 ns 0.98
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 13291.5 ns 17708 ns 0.75
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 16625 ns 14834 ns 1.12
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 13750 ns 14750 ns 0.93
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 556648.5 ns 562161 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 19935565.5 ns
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 351184 ns 345113 ns 1.02
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 625 ns 625 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 500 ns 500 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA 33504 ns 34287 ns 0.98
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/oneAPI 1200134 ns
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU 207882 ns 207072 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7542 ns 8625 ns 0.87
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7958 ns 9667 ns 0.82
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9542 ns 8667 ns 1.10
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7625 ns 8687.5 ns 0.88
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA 223084.5 ns 224465.5 ns 0.99
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/oneAPI 21568038 ns
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 665587 ns 658996 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 17958 ns 17292 ns 1.04
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 17584 ns 13771 ns 1.28
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 13334 ns 12458.5 ns 1.07
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 10833.5 ns 10770.5 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA 20393 ns 20290 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/oneAPI 1168335 ns
bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU 191442 ns 186982 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 35542 ns 35625 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 35583 ns 35625 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 36208 ns 35834 ns 1.01
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 35500 ns 35666 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA 258577 ns 261247.5 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/oneAPI 11381817 ns
bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 591656 ns 589266 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 511813 ns 450208 ns 1.14
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 447292 ns 494583.5 ns 0.90
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 456792 ns 456791.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 517125 ns 461833 ns 1.12
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 194619 ns 194699 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5685561 ns
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 368453.5 ns 360324 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4055479 ns 4069833 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4065479.5 ns 4063479 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4057292 ns 4038041.5 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4051125 ns 4038167 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 506270 ns 514235 ns 0.98
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 28041384.5 ns
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1368029 ns 1354948.5 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 786875042 ns 788948625 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 540385750 ns 416422208.5 ns 1.30
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 417627729 ns 415183312.5 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 1558687604 ns 1509932250 ns 1.03
batchedmm(512, Bsize=512)/forward/GPU/CUDA 22789985.5 ns 22522291.5 ns 1.01
batchedmm(512, Bsize=512)/forward/GPU/oneAPI 176484643 ns
batchedmm(512, Bsize=512)/forward/GPU/AMDGPU 14667995.5 ns 14572928 ns 1.01
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 2512454792 ns 2530024250 ns 0.99
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1772086292 ns 1506878542 ns 1.18
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 1545039084 ns 1519381125 ns 1.02
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 6322382417 ns 4752439166 ns 1.33
batchedmm(512, Bsize=512)/zygote/GPU/CUDA 118300758 ns 118941901 ns 0.99
batchedmm(512, Bsize=512)/zygote/GPU/oneAPI 918719991.5 ns
batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU 87803948.5 ns 87857404.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 76458.5 ns 77417 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 76958 ns 77625 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 78437 ns 79500 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 76937.5 ns 76875 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 191503.5 ns 194658.5 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 8039760 ns
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 106691 ns 106561 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 279042 ns 284458 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 208625 ns 286188 ns 0.73
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 282125 ns 197750 ns 1.43
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 196250 ns 192708 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 989645.5 ns 1005733 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 44408111.5 ns
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 636782 ns 630306 ns 1.01
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 199893333 ns 199829146 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 139025625 ns 104009479.5 ns 1.34
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 104051042 ns 103995667 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 388708625 ns 389216083 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/CUDA 5839621 ns 5833781 ns 1.00
batchedmm(512, Bsize=128)/forward/GPU/oneAPI 79074303 ns
batchedmm(512, Bsize=128)/forward/GPU/AMDGPU 3603877.5 ns 3615787 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 619152625 ns 620952291.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 439143666 ns 354227354.5 ns 1.24
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 353463000 ns 354977104.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 1177182375 ns 1182226250 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/CUDA 26537180.5 ns 26559529 ns 1.00
batchedmm(512, Bsize=128)/zygote/GPU/oneAPI 276530657.5 ns
batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU 22057437 ns 21846736 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7291 ns 7167 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6167 ns 5375 ns 1.15
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5375 ns 5250 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9792 ns 10292 ns 0.95
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 26296 ns 27179 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1196971 ns
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 46670 ns 48210 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 212500 ns 212666.5 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 219917 ns 222542 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 223521 ns 221917 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 208917 ns 206167 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 213879 ns 217340.5 ns 0.98
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 20926055 ns
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 531735 ns 523165 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 8104 ns 8708 ns 0.93
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 8709 ns 8958 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10791.5 ns 10667 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 9229 ns 8813 ns 1.05
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 112861.5 ns 115467 ns 0.98
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 3389305 ns
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 73211 ns 73431 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7542 ns 7584 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7542 ns 11521 ns 0.65
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10229.5 ns 8542 ns 1.20
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7834 ns 8062.5 ns 0.97
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 490362 ns 494404 ns 0.99
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 19246537 ns
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 323133 ns 316873 ns 1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 458 ns 500 ns 0.92
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 500 ns 708 ns 0.71
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 708 ns 708 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 459 ns 583 ns 0.79
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 24659 ns 25358 ns 0.97
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 1256249 ns
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 48770 ns 47920 ns 1.02
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9250 ns 9250 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 8479.5 ns 11396 ns 0.74
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 12291 ns 10875 ns 1.13
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9083 ns 9750 ns 0.93
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 245415 ns 246651 ns 0.99
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 24116959 ns
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 395734 ns 388584 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 112500.5 ns 110834 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 103271 ns 87791 ns 1.18
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 88333 ns 87792 ns 1.01
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 154625 ns 154959 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA 23200 ns 23405 ns 0.99
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/oneAPI 818562 ns
bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU 193152 ns 189432 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 578000 ns 539625 ns 1.07
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 534875 ns 562458 ns 0.95
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 548917 ns 535812.5 ns 1.02
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 535333 ns 535000 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA 215198 ns 220513 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/oneAPI 11436046 ns
bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU 610641.5 ns 604586.5 ns 1.01
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 5000 ns 5354 ns 0.93
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 5416.5 ns 7042 ns 0.77
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 7604.5 ns 8229.5 ns 0.92
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 6625 ns 6541 ns 1.01
batchedmm(16, Bsize=32)/forward/GPU/CUDA 17413 ns 17715 ns 0.98
batchedmm(16, Bsize=32)/forward/GPU/oneAPI 72455521 ns
batchedmm(16, Bsize=32)/forward/GPU/AMDGPU 80361 ns 71815.5 ns 1.12
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 11792 ns 11750 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 10791.5 ns 11459 ns 0.94
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 11208 ns 10792 ns 1.04
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 17000 ns 17125 ns 0.99
batchedmm(16, Bsize=32)/zygote/GPU/CUDA 203659.5 ns 206057.5 ns 0.99
batchedmm(16, Bsize=32)/zygote/GPU/oneAPI 98210292 ns
batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU 381654 ns 379023.5 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 39542 ns 39250 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 51459 ns 51250 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 51333 ns 50583 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 13520.5 ns 13750 ns 0.98
batchedmm(16, Bsize=128)/forward/GPU/CUDA 19998 ns 21128.5 ns 0.95
batchedmm(16, Bsize=128)/forward/GPU/oneAPI 76386107.5 ns
batchedmm(16, Bsize=128)/forward/GPU/AMDGPU 89551 ns 84216 ns 1.06
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 36229.5 ns 36208 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 31458 ns 30584 ns 1.03
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 30250 ns 29250 ns 1.03
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 57167 ns 57375 ns 1.00
batchedmm(16, Bsize=128)/zygote/GPU/CUDA 180703 ns 184668 ns 0.98
batchedmm(16, Bsize=128)/zygote/GPU/oneAPI 112491463 ns
batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU 412909.5 ns 414734 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 1791 ns 1583.5 ns 1.13
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 1875 ns 2000 ns 0.94
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 2125 ns 2187 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 1813 ns 1833.5 ns 0.99
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA 19867 ns 19835 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/oneAPI 1142759 ns
bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU 34540 ns 25650 ns 1.35
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 2042 ns 2292 ns 0.89
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 2167 ns 2459 ns 0.88
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 2500 ns 2458 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 2062.5 ns 2187.5 ns 0.94
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA 193884 ns 197459.5 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/oneAPI 9110958 ns
bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU 138796.5 ns 134722 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5791 ns 5021 ns 1.15
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4916 ns 5167 ns 0.95
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6312.5 ns 5500 ns 1.15
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4937.5 ns 5959 ns 0.83
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 140483 ns 141255 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 5688843 ns
layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 70765.5 ns 59291 ns 1.19
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8375 ns 8396 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8292 ns 9208 ns 0.90
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9917 ns 9791 ns 1.01
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8291 ns 8375 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 811929.5 ns 823637 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 40105318 ns
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 393874 ns 383144 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 55000 ns 54917 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 55833 ns 54291 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 54292 ns 54250 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 56167 ns 56541 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 36588.5 ns 37246 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1189517 ns
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 206632.5 ns 204842 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 486646 ns 477000 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 497020.5 ns 496604 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 505500 ns 494271 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 504479.5 ns 467792 ns 1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 256235 ns 259843 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27551860 ns
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 837064 ns 794468 ns 1.05
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 3311209 ns 3306791 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 2324917 ns 1761916 ns 1.32
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 1764917 ns 1756167 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 6305667 ns 6310604.5 ns 1.00
batchedmm(128, Bsize=128)/forward/GPU/CUDA 204534 ns 205873.5 ns 0.99
batchedmm(128, Bsize=128)/forward/GPU/oneAPI 77630538 ns
batchedmm(128, Bsize=128)/forward/GPU/AMDGPU 220612.5 ns 214142 ns 1.03
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 11424750.5 ns 11469395.5 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 8337875 ns 6567229 ns 1.27
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 6554562.5 ns 6474021 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 21046187.5 ns 21232020.5 ns 0.99
batchedmm(128, Bsize=128)/zygote/GPU/CUDA 736592 ns 743103.5 ns 0.99
batchedmm(128, Bsize=128)/zygote/GPU/oneAPI 121665223 ns
batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU 1067736 ns 1064100 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6375 ns 7125 ns 0.89
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5146 ns 4791 ns 1.07
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7333 ns 7042 ns 1.04
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 4917 ns 5333 ns 0.92
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 130414 ns 130642.5 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 5600903.5 ns
layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 56000 ns 55570 ns 1.01
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7500 ns 7333 ns 1.02
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7104.5 ns 8500 ns 0.84
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7833 ns 7500 ns 1.04
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6917 ns 7625 ns 0.91
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 716948.5 ns 721790 ns 0.99
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 34048818 ns
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 377284 ns 371284 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 100375 ns 124000 ns 0.81
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 98042 ns 105458 ns 0.93
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 101229 ns 100416.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 121958 ns 93688 ns 1.30
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 148678 ns 149649.5 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5976414.5 ns
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 203162 ns 203312 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2025979.5 ns 2020750 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2023750 ns 2021041 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2027979 ns 1993771 ns 1.02
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2028208 ns 2025000 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 667124 ns 676279 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 32503605.5 ns
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1113981 ns 1107011 ns 1.01
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 34896 ns 33958.5 ns 1.03
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 36541.5 ns 34334 ns 1.06
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 33000 ns 32584 ns 1.01
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 667 ns 708 ns 0.94
batchedmm(2, Bsize=4)/forward/GPU/CUDA 15608 ns 16105 ns 0.97
batchedmm(2, Bsize=4)/forward/GPU/oneAPI 72119754.5 ns
batchedmm(2, Bsize=4)/forward/GPU/AMDGPU 83761 ns 78881 ns 1.06
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 2542 ns 2479.5 ns 1.03
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 2875 ns 4000 ns 0.72
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 3042 ns 3125 ns 0.97
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 2125 ns 2292 ns 0.93
batchedmm(2, Bsize=4)/zygote/GPU/CUDA 136848 ns 139246 ns 0.98
batchedmm(2, Bsize=4)/zygote/GPU/oneAPI 92906510 ns
batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU 357139 ns 352743.5 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7250 ns 7209 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6000 ns 5417 ns 1.11
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5417 ns 5291 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9875 ns 10083 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 35691 ns 36300 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1119535 ns
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 49751 ns 49595.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 239895.5 ns 217854 ns 1.10
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 219708 ns 222916.5 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 222104 ns 220604.5 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 206166 ns 206125 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 239376 ns 241210 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27974510.5 ns
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 574776 ns 515535 ns 1.11
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3750 ns 3958 ns 0.95
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3958 ns 3917 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3958 ns 3958 ns 1
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA 22068 ns 22201 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/oneAPI 2145282 ns
dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU 42250 ns 41991 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14958 ns 14708 ns 1.02
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14541 ns 14708 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14750 ns 14750 ns 1
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14875 ns 14708 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA 298530 ns 301554 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/oneAPI 11632418 ns
dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU 196947 ns 195902 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 145083 ns 116166.5 ns 1.25
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 103646 ns 130416 ns 0.79
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 105729.5 ns 104479 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 113042 ns 105250 ns 1.07
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 132784 ns 135232 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 6087845 ns
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 204547 ns 169232 ns 1.21
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1918083 ns 1928583 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1923042 ns 1925875 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1921375 ns 1895041.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1925292 ns 1745875 ns 1.10
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 658916 ns 664669 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 30625432 ns
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1069806 ns 1220022.5 ns 0.88
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 20959 ns 18583 ns 1.13
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17979.5 ns 18792 ns 0.96
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 22125 ns 22250 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18125 ns 18250 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 104444.5 ns 107671 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3374722 ns
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 81701 ns 77341 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 229875 ns 216667 ns 1.06
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 223646 ns 216667 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 218125.5 ns 217812.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 225125 ns 227125 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 492479 ns 497386 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 19457097 ns
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 483554.5 ns 470184 ns 1.03
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 27374.5 ns 26145.5 ns 1.05
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 31063 ns 28562 ns 1.09
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 26708 ns 26792 ns 1.00
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 1458 ns 1458 ns 1
batchedmm(16, Bsize=4)/forward/GPU/CUDA 15690 ns 16337 ns 0.96
batchedmm(16, Bsize=4)/forward/GPU/oneAPI 73206765 ns
batchedmm(16, Bsize=4)/forward/GPU/AMDGPU 89171 ns 86810 ns 1.03
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 4875 ns 4875 ns 1
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 4896 ns 5104 ns 0.96
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 5250 ns 5333 ns 0.98
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 4542 ns 4833 ns 0.94
batchedmm(16, Bsize=4)/zygote/GPU/CUDA 200612 ns 203656 ns 0.99
batchedmm(16, Bsize=4)/zygote/GPU/oneAPI 94501114 ns
batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU 394774 ns 391324 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 221875 ns 222125 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 223209 ns 222583 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 225917 ns 226333 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 223750 ns 223333 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 216221 ns 222346 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7634874 ns
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 277862 ns 273793 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 535958 ns 500833 ns 1.07
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 499104 ns 504334 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 510167 ns 498167 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 508166 ns 497542 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 1024022 ns 1053089 ns 0.97
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 45569833 ns
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 864044 ns 851353.5 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 25166 ns 20667 ns 1.22
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 20166.5 ns 20313 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21750 ns 23083 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 19167 ns 20000 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 111455.5 ns 113758.5 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3479193 ns
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 78821 ns 79011 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 245354 ns 213084 ns 1.15
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 223375 ns 213541 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 225417 ns 214291 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 218541 ns 215500 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 707911 ns 724087 ns 0.98
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 25617389 ns
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 538875 ns 538870.5 ns 1.00
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 7125 ns 6666 ns 1.07
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 6250 ns 6666.5 ns 0.94
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 8666 ns 9125 ns 0.95
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 6458 ns 6584 ns 0.98
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 132297.5 ns 134050 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 5594794 ns
layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 67671 ns 67330 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10583 ns 10875 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 10250 ns 10603.5 ns 0.97
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10958 ns 10584 ns 1.04
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10875 ns 10750 ns 1.01
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 778959.5 ns 782883 ns 0.99
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 37279902 ns
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 393784 ns 386274 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5250 ns 5000 ns 1.05
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6167 ns 4625 ns 1.33
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7583 ns 6541 ns 1.16
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5208 ns 6375 ns 0.82
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 134141.5 ns 136660 ns 0.98
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 5548829 ns
layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 69361 ns 58460 ns 1.19
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7834 ns 7667 ns 1.02
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7667 ns 7916.5 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8125 ns 7750 ns 1.05
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7458 ns 7750 ns 0.96
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 742994 ns 747431 ns 0.99
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 37148580 ns
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 400934 ns 392653 ns 1.02
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 14518042 ns 14573000 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 10053875 ns 7702333.5 ns 1.31
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 7724104 ns 7661229.5 ns 1.01
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 27741083 ns 27919750 ns 0.99
batchedmm(128, Bsize=512)/forward/GPU/CUDA 554321.5 ns 552572 ns 1.00
batchedmm(128, Bsize=512)/forward/GPU/oneAPI 94275820 ns
batchedmm(128, Bsize=512)/forward/GPU/AMDGPU 399814.5 ns 402049 ns 0.99
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 46185458.5 ns 46551750 ns 0.99
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 33419604 ns 26549208 ns 1.26
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 26602708.5 ns 26263166.5 ns 1.01
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 85208959 ns 85671542 ns 0.99
batchedmm(128, Bsize=512)/zygote/GPU/CUDA 2813842 ns 3391019 ns 0.83
batchedmm(128, Bsize=512)/zygote/GPU/oneAPI 194819687 ns
batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU 3323814 ns 3300103 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 69583 ns 67042 ns 1.04
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 66979 ns 67375 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 70292 ns 70583 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 67625 ns 68291 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 102627 ns 103426.5 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3515302.5 ns
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 232062 ns 229352.5 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 520062.5 ns 468625 ns 1.11
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 473208 ns 497666.5 ns 0.95
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 482063 ns 469292 ns 1.03
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 474708 ns 468500 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 703393 ns 709808.5 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 26797269 ns
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 793873 ns 786728 ns 1.01
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 500 ns 542 ns 0.92
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 625 ns 584 ns 1.07
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 584 ns 1.07
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 541 ns 583 ns 0.93
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA 31962 ns 32664 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/oneAPI 1180122 ns
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU 47320 ns 47181 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8583 ns 8833 ns 0.97
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9583.5 ns 9750 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9541 ns 9708 ns 0.98
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9667 ns 9792 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA 278738.5 ns 281049 ns 0.99
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/oneAPI 21728099.5 ns
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 381274 ns 373464 ns 1.02
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9666 ns 9666 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 9459 ns 9708 ns 0.97
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 9667 ns 9625 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 9666 ns 9666 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA 23100 ns 23531 ns 0.98
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/oneAPI 2057483 ns
dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU 212922 ns 211602 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 50458 ns 50250 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 50875 ns 50250 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 50375 ns 50125 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 50209 ns 50167 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA 273986 ns 276186.5 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/oneAPI 11648854 ns
dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 610646 ns 603776 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 54917 ns 54916 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 55708 ns 54333 ns 1.03
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 54292 ns 54292 ns 1
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 55875 ns 56125 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 27572 ns 28315 ns 0.97
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1222185 ns
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 206592 ns 204202 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 522166 ns 515312.5 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 504250 ns 495208 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 503500 ns 494875 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 472833.5 ns 465271 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 236683 ns 238356 ns 0.99
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 32890414.5 ns
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 889849 ns 843049 ns 1.06
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 653833 ns 657146 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 639812.5 ns 678750 ns 0.94
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 654166.5 ns 625021 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 643729 ns 649917 ns 0.99
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 186765 ns 189901 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8191594 ns
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 303073 ns 230582 ns 1.31
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2228375 ns 2239292 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2240916.5 ns 2249895.5 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2265312.5 ns 2176354.5 ns 1.04
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2228084 ns 2265625 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 907493 ns 926422 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 49570533.5 ns
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1227082.5 ns 1211101.5 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 22083 ns 21083 ns 1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 21333 ns 22187.5 ns 0.96
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 21416.5 ns 23666 ns 0.90
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 20208 ns 19959 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA 108981.5 ns 112183.5 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 3615898 ns
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 81661 ns 81261 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 232104.5 ns 254333 ns 0.91
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 222250 ns 220666 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 228583 ns 220750 ns 1.04
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 259708 ns 226708 ns 1.15
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 701359 ns 705957 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27641264 ns
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 557775.5 ns 548680 ns 1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 500 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 625 ns 583 ns 1.07
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 541 ns 583 ns 0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA 22562 ns 23346 ns 0.97
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/oneAPI 1174965 ns
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU 48641 ns 47671 ns 1.02
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9896 ns 9500 ns 1.04
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 10166 ns 9917 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9979.5 ns 9959 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 10646 ns 10083 ns 1.06
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA 259541 ns 260912 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/oneAPI 25096956 ns
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 406314 ns 400874 ns 1.01
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 10000 ns 10500 ns 0.95
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 8875 ns 8895.5 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 10333 ns 11625 ns 0.89
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 9625 ns 8750 ns 1.10
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA 114946 ns 116855 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/oneAPI 3356422 ns
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU 75001 ns 67861 ns 1.11
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7312.5 ns 7687.5 ns 0.95
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7833 ns 8000 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7833 ns 7875 ns 0.99
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7645.5 ns 7812.5 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA 479855 ns 481589 ns 1.00
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/oneAPI 17554055 ns
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU 327064 ns 324483 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1375 ns 1666 ns 0.83
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1834 ns 2042 ns 0.90
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2125 ns 2104.5 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1708 ns 1459 ns 1.17
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA 19733 ns 19805 ns 1.00
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/oneAPI 1143637.5 ns
bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU 192542 ns 190981 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 3542 ns 3520.5 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 3584 ns 3792 ns 0.95
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 3875 ns 3854.5 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 3500 ns 3583 ns 0.98
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA 210034.5 ns 211153.5 ns 0.99
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10599117 ns
bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 584616 ns 578046 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 148333.5 ns 147645.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 129000 ns 106542 ns 1.21
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 107396 ns 106708.5 ns 1.01
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 233604.5 ns 225875 ns 1.03
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA 23312 ns 23334 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/oneAPI 1181923 ns
bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU 41095.5 ns 35995.5 ns 1.14
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 161208.5 ns 144708 ns 1.11
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 140708 ns 104000 ns 1.35
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 104000 ns 87625 ns 1.19
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 259375 ns 252562.5 ns 1.03
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA 208046 ns 210178 ns 0.99
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/oneAPI 11091691.5 ns
bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU 267983 ns 230212 ns 1.16
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7270.5 ns 7125 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5959 ns 5375 ns 1.11
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5333 ns 5292 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 9959 ns 10250 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 32872 ns 33945.5 ns 0.97
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 1199319 ns
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 50331 ns 49690 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 258729 ns 219375 ns 1.18
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 234500 ns 260458 ns 0.90
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 238125 ns 228500.5 ns 1.04
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 253021 ns 222499.5 ns 1.14
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 256256.5 ns 257172 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 27890996 ns
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 595296 ns 523825 ns 1.14
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 13000 ns 13625 ns 0.95
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 12396 ns 13479 ns 0.92
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 14500 ns 15125 ns 0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 12500 ns 13333 ns 0.94
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA 131871 ns 132277 ns 1.00
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/oneAPI 5626771 ns
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU 236102 ns 234872 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 23854.5 ns 24084 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24500 ns 23645.5 ns 1.04
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25187.5 ns 24708.5 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24750 ns 24459 ns 1.01
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA 821231 ns 830067.5 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/oneAPI 40073814 ns
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU 689137 ns 681347 ns 1.01
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 9167 ns 9792 ns 0.94
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 9834 ns 10063 ns 0.98
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 11417 ns 11375 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 8999.5 ns 9291.5 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA 119274.5 ns 120374.5 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/oneAPI 3523753.5 ns
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU 76811 ns 73601 ns 1.04
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14083 ns 14541 ns 0.97
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14166.5 ns 14813 ns 0.96
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15104 ns 14812.5 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14083 ns 14875 ns 0.95
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA 630553.5 ns 637361.5 ns 0.99
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/oneAPI 21897908 ns
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU 373463 ns 368293 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9021 ns 10333 ns 0.87
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 9875 ns 9687.5 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11250 ns 12041.5 ns 0.93
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9750 ns 10125.5 ns 0.96
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA 117966.5 ns 119012 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/oneAPI 3400750 ns
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU 77501 ns 73051 ns 1.06
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 12854 ns 12792 ns 1.00
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12937 ns 13395.5 ns 0.97
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 13187.5 ns 13375 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 13166 ns 13166 ns 1
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA 522874 ns 525610 ns 0.99
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/oneAPI 19612958 ns
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU 349524 ns 342408 ns 1.02
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 30958.5 ns 31416.5 ns 0.99
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 34895.5 ns 32520.5 ns 1.07
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 30208 ns 28917 ns 1.04
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 2042 ns 2167 ns 0.94
batchedmm(2, Bsize=128)/forward/GPU/CUDA 16552 ns 16642 ns 0.99
batchedmm(2, Bsize=128)/forward/GPU/oneAPI 76609794 ns
batchedmm(2, Bsize=128)/forward/GPU/AMDGPU 87451 ns 78711 ns 1.11
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 5375 ns 5583.5 ns 0.96
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 5229 ns 4958 ns 1.05
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 5395.5 ns 5250 ns 1.03
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 6417 ns 6584 ns 0.97
batchedmm(2, Bsize=128)/zygote/GPU/CUDA 135958 ns 137549 ns 0.99
batchedmm(2, Bsize=128)/zygote/GPU/oneAPI 111332262.5 ns
batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU 390584 ns 383954 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 291 ns 292 ns 1.00
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 416 ns 0.90
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 291 ns 375 ns 0.78
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 24266 ns 24843 ns 0.98
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 1220615 ns
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 49051 ns 48221 ns 1.02
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6458 ns 6375 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6792 ns 6708.5 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6875 ns 6916.5 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6375 ns 6875 ns 0.93
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 181716 ns 183051 ns 0.99
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 22738910 ns
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 394694 ns 391009 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 2000 ns 1958 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 2125 ns 2042 ns 1.04
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 2125 ns 2084 ns 1.02
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 1959 ns 2041 ns 0.96
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA 25193 ns 25908 ns 0.97
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/oneAPI 1233759.5 ns
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU 207422 ns 207502 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 16937.5 ns 17333.5 ns 0.98
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17583 ns 17333 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 17666 ns 17625 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17167 ns 18000 ns 0.95
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA 266060 ns 266084 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/oneAPI 25037224.5 ns
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU 702687 ns 691847 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 177959 ns 153459 ns 1.16
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 151000 ns 175583.5 ns 0.86
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 151250 ns 150250 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 156666 ns 150417 ns 1.04
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 185813 ns 192072 ns 0.97
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8186035 ns
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 213762 ns 176432 ns 1.21
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1294417 ns 1193541 ns 1.08
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1322667 ns 1327291.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1326979.5 ns 1298166.5 ns 1.02
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1325125 ns 1330166.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 850017 ns 864717 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 46207436 ns
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1106552 ns 1114311 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 25687.5 ns 25604.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 25000 ns 25333 ns 0.99
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 27125 ns 28625 ns 0.95
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 27375 ns 25541 ns 1.07
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA 226385 ns 232128 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/oneAPI 7541451 ns
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU 115741 ns 115071 ns 1.01
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 180771 ns 118791.5 ns 1.52
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 134583.5 ns 126708 ns 1.06
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 175167 ns 118625 ns 1.48
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 164479 ns 117979 ns 1.39
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA 971603.5 ns 994805 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/oneAPI 45326263 ns
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU 614401.5 ns 588415.5 ns 1.04
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 292 ns 250 ns 1.17
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 334 ns 1.12
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 416 ns 0.90
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 292 ns 334 ns 0.87
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA 22475 ns 23227 ns 0.97
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/oneAPI 1258351.5 ns
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU 48960 ns 46150 ns 1.06
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6458.5 ns 6417 ns 1.01
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6875 ns 6750 ns 1.02
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6875 ns 6958 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6458.5 ns 6750 ns 0.96
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA 197699 ns 199656 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/oneAPI 25220935 ns
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 395854 ns 393763.5 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5666 ns 6250 ns 0.91
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6542 ns 6500 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 6416 ns 7291.5 ns 0.88
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6167 ns 5291 ns 1.17
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 136571.5 ns 137884.5 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 5759376 ns
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 236832 ns 233922 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10167 ns 10104.5 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10250 ns 10125 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10708.5 ns 10562.5 ns 1.01
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10021 ns 10250 ns 0.98
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 843659.5 ns 853228 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 42177959 ns
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 680842 ns 672507 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 708 ns 708 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 708 ns 708 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 750 ns 750 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 667 ns 708 ns 0.94
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA 22622 ns 22896 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/oneAPI 2092408 ns
dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU 211377.5 ns 209942 ns 1.01
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4958 ns 4834 ns 1.03
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 5167 ns 5042 ns 1.02
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 5125 ns 5125 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4834 ns 4834 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA 217676 ns 220625.5 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/oneAPI 10379046 ns
dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU 586156 ns 580650 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 7646 ns 8750 ns 0.87
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 8458 ns 8708 ns 0.97
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 10000.5 ns 10395.5 ns 0.96
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 8625 ns 8167 ns 1.06
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA 117310.5 ns 118921.5 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/oneAPI 3542404 ns
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU 77011 ns 71421 ns 1.08
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8167 ns 8292 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8792 ns 8791 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 9541 ns 8958 ns 1.07
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8500 ns 8916 ns 0.95
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA 559897.5 ns 567449 ns 0.99
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/oneAPI 21100984 ns
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU 351894 ns 346934 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 129875 ns 125791.5 ns 1.03
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 131334 ns 96000 ns 1.37
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 98500 ns 96187.5 ns 1.02
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 183000 ns 181542 ns 1.01
batchedmm(128, Bsize=4)/forward/GPU/CUDA 45933 ns 46439 ns 0.99
batchedmm(128, Bsize=4)/forward/GPU/oneAPI 73470628 ns
batchedmm(128, Bsize=4)/forward/GPU/AMDGPU 104986 ns 93231 ns 1.13
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 320833 ns 302834 ns 1.06
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 340500 ns 166542 ns 2.04
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 196229 ns 166917 ns 1.18
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 614646 ns 567708 ns 1.08
batchedmm(128, Bsize=4)/zygote/GPU/CUDA 184661 ns 186141 ns 0.99
batchedmm(128, Bsize=4)/zygote/GPU/oneAPI 95503191 ns
batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU 520426 ns 466525 ns 1.12
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397833 ns 398250 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 287792 ns 215167 ns 1.34
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 215167 ns 214291 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 756459 ns 756250 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA 43884 ns 43722 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/oneAPI 1380208.5 ns
dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU 82001 ns 80301 ns 1.02
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 1449083 ns 1402813 ns 1.03
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 1131416 ns 862208 ns 1.31
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 862375 ns 854333 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 2444146 ns 2359583.5 ns 1.04
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA 248740 ns 247149 ns 1.01
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/oneAPI 11082909 ns
dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU 350333 ns 350254 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 652083 ns 657333 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 652854 ns 621958.5 ns 1.05
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 654417 ns 628854 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 661125 ns 542146 ns 1.22
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 184615 ns 185394 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8038741 ns
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 311568 ns 258293 ns 1.21
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2443958.5 ns 2469895.5 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2461416.5 ns 2491916.5 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2443812.5 ns 2389875 ns 1.02
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2444771 ns 2478250 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 932610 ns 934339.5 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 51927904 ns
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1324133 ns 1448647.5 ns 0.91
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 34083.5 ns 34271 ns 0.99
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 36437.5 ns 34250.5 ns 1.06
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 33771 ns 32312.5 ns 1.05
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 834 ns 916.5 ns 0.91
batchedmm(2, Bsize=32)/forward/GPU/CUDA 15954 ns 16189.5 ns 0.99
batchedmm(2, Bsize=32)/forward/GPU/oneAPI 74465713 ns
batchedmm(2, Bsize=32)/forward/GPU/AMDGPU 84121 ns 71551 ns 1.18
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 3042 ns 3166.5 ns 0.96
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 3208 ns 3437.5 ns 0.93
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 3416 ns 3541 ns 0.96
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 3084 ns 3125 ns 0.99
batchedmm(2, Bsize=32)/zygote/GPU/CUDA 134871 ns 134833 ns 1.00
batchedmm(2, Bsize=32)/zygote/GPU/oneAPI 101832238 ns
batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU 355194 ns 339494 ns 1.05
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 435000 ns 437000 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 441208 ns 432458 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 431291 ns 432833 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 449458 ns 449416 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA 42183 ns 42351 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 1418032 ns
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 241737 ns 238133 ns 1.02
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4139000 ns 4152625 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4281375 ns 4271667 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4272125 ns 4252417 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4043500 ns 4062020.5 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 231383.5 ns 231247 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 38875009 ns
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1238087.5 ns 1229715 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3917 ns 3875 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3750 ns 3917 ns 0.96
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3875 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3916 ns 3916 ns 1
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA 34290 ns 34451.5 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/oneAPI 1242809 ns
dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU 40730 ns 38680 ns 1.05
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15750 ns 15458 ns 1.02
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15500 ns 15708 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15708 ns 15625 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15667 ns 15459 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA 253133 ns 252640 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/oneAPI 8969271 ns
dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU 178362 ns 169682 ns 1.05
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 404000 ns 403417 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 295666 ns 221209 ns 1.34
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 221167 ns 220042 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 760500 ns 760791 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA 113399 ns 113133 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/oneAPI 1019290 ns
dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU 89320 ns 87381 ns 1.02
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1474312.5 ns 1431749.5 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1157021 ns 886583 ns 1.31
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 884958 ns 881812.5 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2465875 ns 2383750 ns 1.03
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA 244167 ns 229435.5 ns 1.06
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/oneAPI 11671477 ns
dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU 354019 ns 350874 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 500 ns 459 ns 1.09
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 666 ns 583 ns 1.14
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 584 ns 625 ns 0.93
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 500 ns 584 ns 0.86
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA 24808 ns 24713 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/oneAPI 1214092.5 ns
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU 210112 ns 207622 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7916 ns 7458.5 ns 1.06
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8167 ns 8041.5 ns 1.02
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8125 ns 8292 ns 0.98
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 7459 ns 7792 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA 203590.5 ns 202392.5 ns 1.01
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/oneAPI 24613685 ns
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU 690937 ns 689378 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 832166.5 ns 833145.5 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 619583 ns 466667 ns 1.33
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 472250 ns 467771 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 1542500 ns 1542833 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/CUDA 130624 ns 130433 ns 1.00
batchedmm(128, Bsize=32)/forward/GPU/oneAPI 75509279 ns
batchedmm(128, Bsize=32)/forward/GPU/AMDGPU 236082 ns 166542 ns 1.42
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 2694208.5 ns 2696000 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1991375 ns 1539437.5 ns 1.29
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1537625 ns 1533500 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 4930000 ns 4930000 ns 1
batchedmm(128, Bsize=32)/zygote/GPU/CUDA 233850 ns 233723 ns 1.00
batchedmm(128, Bsize=32)/zygote/GPU/oneAPI 102808354 ns
batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU 768638 ns 771469 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 292 ns 292 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 250 ns 375 ns 0.67
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA 31761 ns 31721 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/oneAPI 1224489 ns
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU 47050 ns 48111 ns 0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6417 ns 6312.5 ns 1.02
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6792 ns 6812.5 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6666 ns 6875 ns 0.97
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6375 ns 6500 ns 0.98
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA 219075.5 ns 217171.5 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/oneAPI 23474742 ns
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU 362424 ns 362335 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1776458 ns 1777250 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1755459 ns 1758812.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1754000 ns 1730917 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1755666 ns 1776250 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 183229.5 ns 184219 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 8315915 ns
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 375104 ns 354280 ns 1.06
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4353771 ns 4352917 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4398479 ns 4382542 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4376083 ns 4351834 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4351333 ns 4391416 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 833369 ns 837734 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 47106002 ns
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1251643 ns 1247440 ns 1.00
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 7083.5 ns 6771 ns 1.05
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 7104 ns 7937.5 ns 0.89
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7375 ns 7333 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6834 ns 6687.5 ns 1.02
bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA 22695 ns 22420 ns 1.01
bias_activation(512, act=relu)(512 x 128)/forward/GPU/oneAPI 1216626 ns
bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU 37200 ns 36840.5 ns 1.01
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 48479.5 ns 45312.5 ns 1.07
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 50874.5 ns 48146 ns 1.06
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 47979 ns 33917 ns 1.41
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 47208 ns 52729.5 ns 0.90
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA 207872 ns 206304 ns 1.01
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/oneAPI 10801241 ns
bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU 234813 ns 232673 ns 1.01
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 22854 ns 22146 ns 1.03
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 26375 ns 23896 ns 1.10
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 23146 ns 22417 ns 1.03
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 5333 ns 5334 ns 1.00
batchedmm(2, Bsize=512)/forward/GPU/CUDA 17805 ns 18024 ns 0.99
batchedmm(2, Bsize=512)/forward/GPU/oneAPI 89168517 ns
batchedmm(2, Bsize=512)/forward/GPU/AMDGPU 90691 ns 83860.5 ns 1.08
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 12083 ns 12000 ns 1.01
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 10208.5 ns 9437.5 ns 1.08
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 9583 ns 9583 ns 1
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 18104.5 ns 18250 ns 0.99
batchedmm(2, Bsize=512)/zygote/GPU/CUDA 217973 ns 218264 ns 1.00
batchedmm(2, Bsize=512)/zygote/GPU/oneAPI 150119195 ns
batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU 389829 ns 367444 ns 1.06
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 405958 ns 406417 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 297166.5 ns 223333 ns 1.33
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 223625 ns 222292 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 762167 ns 762750 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA 46720 ns 46291 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/oneAPI 1360027 ns
dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU 90521 ns 88691 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 1491042 ns 1428625 ns 1.04
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 1165750 ns 892375 ns 1.31
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 892791.5 ns 886833 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 2470333 ns 2386333 ns 1.04
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA 279542.5 ns 279641 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/oneAPI 11213824.5 ns
dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU 375414 ns 379995 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 436000 ns 436833 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 440750 ns 432708 ns 1.02
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 432000 ns 429500 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 449042 ns 449500 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 54332 ns 52933 ns 1.03
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 999725 ns
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 237743 ns 235598 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4137041.5 ns 4147167 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4271042 ns 4260354 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4270646 ns 4227333 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4030959 ns 4030354.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 253348 ns 252356.5 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 32411933.5 ns
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1223273 ns 1204784 ns 1.02
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 9458 ns 9583 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 8000 ns 7292 ns 1.10
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 7209 ns 7250 ns 0.99
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 13458 ns 13500 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA 24044 ns 23984 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/oneAPI 2135292 ns
dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU 214732 ns 212683 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 49833 ns 49416 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 49750 ns 49459 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 49458 ns 49167 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 49500 ns 49625 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA 335918.5 ns 333606 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/oneAPI 12693187 ns
dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU 656617 ns 652008 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 136583 ns 106875 ns 1.28
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 82145.5 ns 113729 ns 0.72
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 85583 ns 88666 ns 0.97
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 83104 ns 89666.5 ns 0.93
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA 191318.5 ns 191172 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/oneAPI 5843078 ns
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU 205972 ns 200642 ns 1.03
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2013959 ns 2027750.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2017792 ns 2023896 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2022958 ns 1986666 ns 1.02
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2019333 ns 2015667 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA 508706 ns 507573.5 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/oneAPI 28081381 ns
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU 1089431 ns 1086742.5 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.