Implement `ShuffleUnsafe` methods and optimise `Shuffle` for non-constant `indices` #99596

hamarb123 · 2024-03-12T11:29:29Z

Fixes [API Proposal]: Vector128.ShuffleUnsafe #81609
- Support optimised Shuffle with variable indices on CoreCLR (for all types)
- Support optimised cross-lane Shuffle on Vector256 (for signed/unsigned bytes and shorts)
- Optimise Vector256 shuffle with Avx2.Shuffle (for signed/unsigned bytes and shorts)

Todo tasks:

Optimise variable shuffle for short/int/long (and other types of same size)
Implement additional tests
Implement VectorXXX.ShuffleUnsafe for vectors of other element types
Utilise/support Avx10
Just call Shuffle in ShuffleUnsafe fallback
Simplify tests & remove unnecessary ones
Remove internal requirement for ShuffleUnsafe(V128) to give 0 when high bit is set
Validate that the variable shuffles are actually faster for the larger types

Codegen:

Shuffle With AVX2

/// <summary>
/// Code-generation probes for <c>Vector128.Shuffle</c> and <c>Vector256.Shuffle</c> where the
/// index vector is read from an instance field rather than written inline at the call site.
/// Every probe forwards the shuffled vector to <see cref="Consume{T}(T)"/>.
/// </summary>
public class ShuffleBenchAvx2Only
{
    // Index vectors live in mutable instance fields ("not constant" at the call site).
    // Several entries are deliberately larger than the element count of the vector they
    // index (e.g. 90, 32, short.MaxValue) or negative (-6).
    private Vector128<byte> _indicesByte128 = Vector128.Create((byte)5, 2, 6, 15, 2, 90, 14, 0, 0, 7, 8, 12, 13, 14, 15, 6);
    private Vector128<short> _indicesShort128 = Vector128.Create((short)1, 4, 6, 3, 10, 5, short.MaxValue, 0);
    private Vector128<int> _indicesInt128 = Vector128.Create(5, -6, 0, 3);
    private Vector128<long> _indicesLong128 = Vector128.Create(1, 0);
    private Vector256<byte> _indicesByte256 = Vector256.Create((byte)5, 2, 6, 15, 20, 90, 14, 0, 0, 7, 8, 12, 13, 14, 15, 6, 32, 31, 30, 29, 28, 27, 26, 25, 4, 23, 22, 21, 20, 19, 18, 17);
    private Vector256<short> _indicesShort256 = Vector256.Create((short)1, 4, 6, 3, 10, 5, short.MaxValue, 0, 1, 13, 2, 14, 9, 8, 7, 6);
    private Vector256<int> _indicesInt256 = Vector256.Create(5, -6, 0, 3, 1, 2, 6, 7);
    private Vector256<long> _indicesLong256 = Vector256.Create(1, 2, 0, 3);

    // Source vectors for the "BothIndirect" probes, where the shuffled vector is also read from a field.
    private Vector256<byte> _vectorByte256 = Vector256.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    private Vector256<short> _vectorShort256 = Vector256.Create((short)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

    /// <summary>
    /// Empty, non-inlined sink that receives each shuffle result.
    /// </summary>
    // NOTE(review): presumably NoInlining is what keeps the shuffle from being optimised away - confirm.
    [MethodImpl(MethodImplOptions.NoInlining)]
    private static void Consume<T>(T value)
    {
    }

    /// <summary>Shuffles an inline 16 x byte vector by the field-held byte indices.</summary>
    public void ShuffleByte128Indirect()
    {
        Vector128<byte> source = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        Consume(Vector128.Shuffle(source, _indicesByte128));
    }

    /// <summary>Shuffles an inline 8 x short vector by the field-held short indices.</summary>
    public void ShuffleShort128Indirect()
    {
        Vector128<short> source = Vector128.Create((short)0, 1, 2, 3, 4, 5, 6, 7);
        Consume(Vector128.Shuffle(source, _indicesShort128));
    }

    /// <summary>Shuffles an inline 4 x int vector by the field-held int indices.</summary>
    public void ShuffleInt128Indirect()
    {
        Vector128<int> source = Vector128.Create(0, 1, 2, 3);
        Consume(Vector128.Shuffle(source, _indicesInt128));
    }

    /// <summary>Shuffles an inline 2 x long vector by the field-held long indices.</summary>
    public void ShuffleLong128Indirect()
    {
        Vector128<long> source = Vector128.Create(0, 1);
        Consume(Vector128.Shuffle(source, _indicesLong128));
    }

    /// <summary>Shuffles an inline 32 x byte vector by the field-held byte indices.</summary>
    public void ShuffleByte256Indirect()
    {
        Vector256<byte> source = Vector256.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
        Consume(Vector256.Shuffle(source, _indicesByte256));
    }

    /// <summary>Shuffles the field-held 32 x byte vector by the field-held byte indices.</summary>
    public void ShuffleByte256BothIndirect()
    {
        Consume(Vector256.Shuffle(_vectorByte256, _indicesByte256));
    }

    /// <summary>Shuffles an inline 16 x short vector by the field-held short indices.</summary>
    public void ShuffleShort256Indirect()
    {
        Vector256<short> source = Vector256.Create((short)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        Consume(Vector256.Shuffle(source, _indicesShort256));
    }

    /// <summary>Shuffles the field-held 16 x short vector by the field-held short indices.</summary>
    public void ShuffleShort256BothIndirect()
    {
        Consume(Vector256.Shuffle(_vectorShort256, _indicesShort256));
    }

    /// <summary>Shuffles an inline 8 x int vector by the field-held int indices.</summary>
    public void ShuffleInt256Indirect()
    {
        Vector256<int> source = Vector256.Create(0, 1, 2, 3, 4, 5, 6, 7);
        Consume(Vector256.Shuffle(source, _indicesInt256));
    }

    /// <summary>Shuffles an inline 4 x long vector by the field-held long indices.</summary>
    public void ShuffleLong256Indirect()
    {
        Vector256<long> source = Vector256.Create(0, 1, 2, 3);
        Consume(Vector256.Shuffle(source, _indicesLong256));
    }
}

; Assembly listing for method ShuffleBenchAvx2Only:ShuffleByte128Indirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 24
 
G_M000_IG02:                ;; offset=0x0004
       vmovups  xmm1, xmmword ptr [reloc @RWD00]
       vpcmpgtb xmm1, xmm1, xmm0
       vmovups  xmm0, xmmword ptr [rdi+0x08]
       vmovups  xmm2, xmmword ptr [reloc @RWD16]
       vpshufb  xmm0, xmm2, xmm0
       vpand    xmm0, xmm0, xmm1
       vmovups  xmmword ptr [rsp], xmm0
       call     [ShuffleBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector128`1[ubyte]](System.Runtime.Intrinsics.Vector128`1[ubyte])]
       nop      
 
G_M000_IG03:                ;; offset=0x0032
       add      rsp, 24
       ret      
 
RWD00  	dq	1010101010101010h, 1010101010101010h
RWD16  	dq	0706050403020100h, 0F0E0D0C0B0A0908h

; Total bytes of code 55

; Assembly listing for method ShuffleBenchAvx2Only:ShuffleShort128Indirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 40
 
G_M000_IG02:                ;; offset=0x0004
       vmovaps  xmm0, xmmword ptr [rsp+0x10]
       vpsubw   xmm0, xmm0, xmmword ptr [reloc @RWD00]
       vmovups  xmm1, xmmword ptr [reloc @RWD16]
       vpcmpgtw xmm0, xmm1, xmm0
       vmovups  xmm1, xmmword ptr [rdi+0x18]
       vpsllw   xmm1, xmm1, 1
       vpshufb  xmm1, xmm1, xmmword ptr [reloc @RWD32]
       vpor     xmm1, xmm1, xmmword ptr [reloc @RWD48]
       vmovups  xmm2, xmmword ptr [reloc @RWD64]
       vpshufb  xmm1, xmm2, xmm1
       vpand    xmm0, xmm1, xmm0
       vmovups  xmmword ptr [rsp], xmm0
       call     [ShuffleBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector128`1[short]](System.Runtime.Intrinsics.Vector128`1[short])]
       nop      
 
G_M000_IG03:                ;; offset=0x0056
       add      rsp, 40
       ret      
 
RWD00  	dq	8000800080008000h, 8000800080008000h
RWD16  	dq	8008800880088008h, 8008800880088008h
RWD32  	dq	0606040402020000h, 0E0E0C0C0A0A0808h
RWD48  	dq	0100010001000100h, 0100010001000100h
RWD64  	dq	0003000200010000h, 0007000600050004h

; Total bytes of code 91

; Assembly listing for method ShuffleBenchAvx2Only:ShuffleInt128Indirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 40
 
G_M000_IG02:                ;; offset=0x0004
       vmovaps  xmm0, xmmword ptr [rsp+0x10]
       vpsubd   xmm0, xmm0, xmmword ptr [reloc @RWD00]
       vmovups  xmm1, xmmword ptr [reloc @RWD16]
       vpcmpgtd xmm0, xmm1, xmm0
       vmovups  xmm1, xmmword ptr [rdi+0x28]
       vmovups  xmm2, xmmword ptr [reloc @RWD32]
       vpermilps xmm1, xmm2, xmm1
       vpand    xmm0, xmm1, xmm0
       vmovups  xmmword ptr [rsp], xmm0
       call     [ShuffleBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector128`1[int]](System.Runtime.Intrinsics.Vector128`1[int])]
       nop      
 
G_M000_IG03:                ;; offset=0x0040
       add      rsp, 40
       ret      
 
RWD00  	dq	8000000080000000h, 8000000080000000h
RWD16  	dq	8000000480000004h, 8000000480000004h
RWD32  	dq	0000000100000000h, 0000000300000002h

; Total bytes of code 69

; Assembly listing for method ShuffleBenchAvx2Only:ShuffleLong128Indirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 40
 
G_M000_IG02:                ;; offset=0x0004
       vmovaps  xmm0, xmmword ptr [rsp+0x10]
       vpsubq   xmm0, xmm0, xmmword ptr [reloc @RWD00]
       vmovups  xmm1, xmmword ptr [reloc @RWD16]
       vpcmpgtq xmm0, xmm1, xmm0
       vmovups  xmm1, xmmword ptr [rdi+0x38]
       vpsllq   xmm1, xmm1, 1
       vpermilps xmm1, xmm1, xmmword ptr [reloc @RWD32]
       vpor     xmm1, xmm1, xmmword ptr [reloc @RWD48]
       vmovups  xmm2, xmmword ptr [reloc @RWD64]
       vpermilps xmm1, xmm2, xmm1
       vpand    xmm0, xmm1, xmm0
       vmovups  xmmword ptr [rsp], xmm0
       call     [ShuffleBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector128`1[long]](System.Runtime.Intrinsics.Vector128`1[long])]
       nop      
 
G_M000_IG03:                ;; offset=0x0057
       add      rsp, 40
       ret      
 
RWD00  	dq	8000000000000000h, 8000000000000000h
RWD16  	dq	8000000000000002h, 8000000000000002h
RWD32  	dq	0000000000000000h, 0000000200000002h
RWD48  	dq	0000000100000000h, 0000000100000000h
RWD64  	dq	0000000000000000h, 0000000000000001h

; Total bytes of code 92

; Assembly listing for method ShuffleBenchAvx2Only:ShuffleByte256Indirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rbp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       push     rbp
       sub      rsp, 32
       lea      rbp, [rsp+0x20]
 
G_M000_IG02:                ;; offset=0x000A
       vmovups  ymm0, ymmword ptr [rdi+0x48]
       vmovaps  ymm1, ymm0
       vmovups  ymm2, ymmword ptr [reloc @RWD00]
       vpshufb  ymm2, ymm2, ymm1
       vmovups  ymm3, ymmword ptr [reloc @RWD32]
       vpshufb  ymm3, ymm3, ymm1
       vpxor    ymm1, ymm1, ymmword ptr [reloc @RWD64]
       vpcmpgtb ymm1, ymm1, ymmword ptr [reloc @RWD96]
       vpblendvb ymm1, ymm2, ymm3, ymm1
       vmovups  ymm2, ymmword ptr [reloc @RWD128]
       vpcmpgtb ymm0, ymm2, ymm0
       vpand    ymm0, ymm1, ymm0
       vmovups  ymmword ptr [rsp], ymm0
       call     [ShuffleBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector256`1[ubyte]](System.Runtime.Intrinsics.Vector256`1[ubyte])]
       nop      
 
G_M000_IG03:                ;; offset=0x005F
       vzeroupper 
       add      rsp, 32
       pop      rbp
       ret      
 
RWD00  	dq	0706050403020100h, 0F0E0D0C0B0A0908h, 1716151413121110h, 1F1E1D1C1B1A1918h
RWD32  	dq	1716151413121110h, 1F1E1D1C1B1A1918h, 0706050403020100h, 0F0E0D0C0B0A0908h
RWD64  	dq	0000000000000000h, 0000000000000000h, 1010101010101010h, 1010101010101010h
RWD96  	dq	0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh
RWD128 	dq	2020202020202020h, 2020202020202020h, 2020202020202020h, 2020202020202020h

; Total bytes of code 104

; Assembly listing for method ShuffleBenchAvx2Only:ShuffleByte256BothIndirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rbp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       push     rbp
       sub      rsp, 32
       lea      rbp, [rsp+0x20]
 
G_M000_IG02:                ;; offset=0x000A
       vmovups  ymm0, ymmword ptr [rdi+0xC8]
       vmovups  ymm1, ymmword ptr [rdi+0x48]
       vmovaps  ymm2, ymm1
       vpshufb  ymm3, ymm0, ymm2
       vperm2i128 ymm0, ymm0, ymm0, 1
       vpshufb  ymm0, ymm0, ymm2
       vpxor    ymm2, ymm2, ymmword ptr [reloc @RWD00]
       vpcmpgtb ymm2, ymm2, ymmword ptr [reloc @RWD32]
       vpblendvb ymm0, ymm3, ymm0, ymm2
       vmovups  ymm2, ymmword ptr [reloc @RWD64]
       vpcmpgtb ymm1, ymm2, ymm1
       vpand    ymm0, ymm0, ymm1
       vmovups  ymmword ptr [rsp], ymm0
       call     [ShuffleBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector256`1[ubyte]](System.Runtime.Intrinsics.Vector256`1[ubyte])]
       nop      
 
G_M000_IG03:                ;; offset=0x005D
       vzeroupper 
       add      rsp, 32
       pop      rbp
       ret      
 
RWD00  	dq	0000000000000000h, 0000000000000000h, 1010101010101010h, 1010101010101010h
RWD32  	dq	0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh
RWD64  	dq	2020202020202020h, 2020202020202020h, 2020202020202020h, 2020202020202020h

; Total bytes of code 102

; Assembly listing for method ShuffleBenchAvx2Only:ShuffleShort256Indirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rbp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       push     rbp
       sub      rsp, 32
       lea      rbp, [rsp+0x20]
 
G_M000_IG02:                ;; offset=0x000A
       vmovups  ymm0, ymmword ptr [rdi+0x68]
       vpsllw   ymm1, ymm0, 1
       vpshufb  ymm1, ymm1, ymmword ptr [reloc @RWD00]
       vpor     ymm1, ymm1, ymmword ptr [reloc @RWD32]
       vmovups  ymm2, ymmword ptr [reloc @RWD64]
       vpshufb  ymm2, ymm2, ymm1
       vmovups  ymm3, ymmword ptr [reloc @RWD96]
       vpshufb  ymm3, ymm3, ymm1
       vpxor    ymm1, ymm1, ymmword ptr [reloc @RWD128]
       vpcmpgtb ymm1, ymm1, ymmword ptr [reloc @RWD160]
       vpblendvb ymm1, ymm2, ymm3, ymm1
       vpsubw   ymm0, ymm0, ymmword ptr [reloc @RWD192]
       vmovups  ymm2, ymmword ptr [reloc @RWD224]
       vpcmpgtw ymm0, ymm2, ymm0
       vpand    ymm0, ymm1, ymm0
       vmovups  ymmword ptr [rsp], ymm0
       call     [ShuffleBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector256`1[short]](System.Runtime.Intrinsics.Vector256`1[short])]
       nop      
 
G_M000_IG03:                ;; offset=0x0079
       vzeroupper 
       add      rsp, 32
       pop      rbp
       ret      
 
RWD00  	dq	0606040402020000h, 0E0E0C0C0A0A0808h, 0606040402020000h, 0E0E0C0C0A0A0808h
RWD32  	dq	0100010001000100h, 0100010001000100h, 0100010001000100h, 0100010001000100h
RWD64  	dq	0003000200010000h, 0007000600050004h, 000B000A00090008h, 000F000E000D000Ch
RWD96  	dq	000B000A00090008h, 000F000E000D000Ch, 0003000200010000h, 0007000600050004h
RWD128 	dq	0000000000000000h, 0000000000000000h, 1010101010101010h, 1010101010101010h
RWD160 	dq	0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh
RWD192 	dq	8000800080008000h, 8000800080008000h, 8000800080008000h, 8000800080008000h
RWD224 	dq	8010801080108010h, 8010801080108010h, 8010801080108010h, 8010801080108010h

; Total bytes of code 130

; Assembly listing for method ShuffleBenchAvx2Only:ShuffleShort256BothIndirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rbp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       push     rbp
       sub      rsp, 32
       lea      rbp, [rsp+0x20]
 
G_M000_IG02:                ;; offset=0x000A
       vmovups  ymm0, ymmword ptr [rdi+0xE8]
       vmovups  ymm1, ymmword ptr [rdi+0x68]
       vpsllw   ymm2, ymm1, 1
       vpshufb  ymm2, ymm2, ymmword ptr [reloc @RWD00]
       vpor     ymm2, ymm2, ymmword ptr [reloc @RWD32]
       vpshufb  ymm3, ymm0, ymm2
       vperm2i128 ymm0, ymm0, ymm0, 1
       vpshufb  ymm0, ymm0, ymm2
       vpxor    ymm2, ymm2, ymmword ptr [reloc @RWD64]
       vpcmpgtb ymm2, ymm2, ymmword ptr [reloc @RWD96]
       vpblendvb ymm0, ymm3, ymm0, ymm2
       vpsubw   ymm1, ymm1, ymmword ptr [reloc @RWD128]
       vmovups  ymm2, ymmword ptr [reloc @RWD160]
       vpcmpgtw ymm1, ymm2, ymm1
       vpand    ymm0, ymm0, ymm1
       vmovups  ymmword ptr [rsp], ymm0
       call     [ShuffleBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector256`1[short]](System.Runtime.Intrinsics.Vector256`1[short])]
       nop      
 
G_M000_IG03:                ;; offset=0x0077
       vzeroupper 
       add      rsp, 32
       pop      rbp
       ret      
 
RWD00  	dq	0606040402020000h, 0E0E0C0C0A0A0808h, 0606040402020000h, 0E0E0C0C0A0A0808h
RWD32  	dq	0100010001000100h, 0100010001000100h, 0100010001000100h, 0100010001000100h
RWD64  	dq	0000000000000000h, 0000000000000000h, 1010101010101010h, 1010101010101010h
RWD96  	dq	0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh
RWD128 	dq	8000800080008000h, 8000800080008000h, 8000800080008000h, 8000800080008000h
RWD160 	dq	8010801080108010h, 8010801080108010h, 8010801080108010h, 8010801080108010h

; Total bytes of code 128

; Assembly listing for method ShuffleBenchAvx2Only:ShuffleInt256Indirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rbp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       push     rbp
       sub      rsp, 80
       lea      rbp, [rsp+0x50]
 
G_M000_IG02:                ;; offset=0x000A
       vmovups  ymm0, ymmword ptr [rbp-0x30]
       vpsubd   ymm0, ymm0, ymmword ptr [reloc @RWD00]
       vmovups  ymm1, ymmword ptr [reloc @RWD32]
       vpcmpgtd ymm0, ymm1, ymm0
       vmovups  ymm1, ymmword ptr [rdi+0x88]
       vpermd   ymm1, ymm1, ymmword ptr [reloc @RWD64]
       vpand    ymm0, ymm1, ymm0
       vmovups  ymmword ptr [rsp], ymm0
       call     [ShuffleBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector256`1[int]](System.Runtime.Intrinsics.Vector256`1[int])]
       nop      
 
G_M000_IG03:                ;; offset=0x0044
       vzeroupper 
       add      rsp, 80
       pop      rbp
       ret      
 
RWD00  	dq	8000000080000000h, 8000000080000000h, 8000000080000000h, 8000000080000000h
RWD32  	dq	8000000880000008h, 8000000880000008h, 8000000880000008h, 8000000880000008h
RWD64  	dq	0000000100000000h, 0000000300000002h, 0000000500000004h, 0000000700000006h

; Total bytes of code 77

; Assembly listing for method ShuffleBenchAvx2Only:ShuffleLong256Indirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rbp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       push     rbp
       sub      rsp, 80
       lea      rbp, [rsp+0x50]
 
G_M000_IG02:                ;; offset=0x000A
       vmovups  ymm0, ymmword ptr [rbp-0x30]
       vpsubq   ymm0, ymm0, ymmword ptr [reloc @RWD00]
       vmovups  ymm1, ymmword ptr [reloc @RWD32]
       vpcmpgtq ymm0, ymm1, ymm0
       vmovups  ymm1, ymmword ptr [rdi+0xA8]
       vpsllq   ymm1, ymm1, 1
       vmovups  ymm2, ymmword ptr [reloc @RWD64]
       vpermd   ymm1, ymm2, ymm1
       vpor     ymm1, ymm1, ymmword ptr [reloc @RWD96]
       vpermd   ymm1, ymm1, ymmword ptr [reloc @RWD128]
       vpand    ymm0, ymm1, ymm0
       vmovups  ymmword ptr [rsp], ymm0
       call     [ShuffleBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector256`1[long]](System.Runtime.Intrinsics.Vector256`1[long])]
       nop      
 
G_M000_IG03:                ;; offset=0x005F
       vzeroupper 
       add      rsp, 80
       pop      rbp
       ret      
 
RWD00  	dq	8000000000000000h, 8000000000000000h, 8000000000000000h, 8000000000000000h
RWD32  	dq	8000000000000004h, 8000000000000004h, 8000000000000004h, 8000000000000004h
RWD64  	dq	0000000000000000h, 0000000200000002h, 0000000400000004h, 0000000600000006h
RWD96  	dq	0000000100000000h, 0000000100000000h, 0000000100000000h, 0000000100000000h
RWD128 	dq	0000000000000000h, 0000000000000001h, 0000000000000002h, 0000000000000003h

; Total bytes of code 104

ShuffleUnsafe With AVX2

/// <summary>
/// Code-generation probes for <c>Vector128.ShuffleUnsafe</c> and <c>Vector256.ShuffleUnsafe</c>
/// where the index vector is read from an instance field rather than written inline at the
/// call site. Every probe forwards the shuffled vector to <see cref="Consume{T}(T)"/>.
/// </summary>
public class ShuffleUnsafeBenchAvx2Only
{
    // Index vectors live in mutable instance fields ("not constant" at the call site).
    // Several entries are deliberately larger than the element count of the vector they
    // index (e.g. 90, 32, short.MaxValue) or negative (-6).
    private Vector128<byte> _indicesByte128 = Vector128.Create((byte)5, 2, 6, 15, 2, 90, 14, 0, 0, 7, 8, 12, 13, 14, 15, 6);
    private Vector128<short> _indicesShort128 = Vector128.Create((short)1, 4, 6, 3, 10, 5, short.MaxValue, 0);
    private Vector128<int> _indicesInt128 = Vector128.Create(5, -6, 0, 3);
    private Vector128<long> _indicesLong128 = Vector128.Create(1, 0);
    private Vector256<byte> _indicesByte256 = Vector256.Create((byte)5, 2, 6, 15, 20, 90, 14, 0, 0, 7, 8, 12, 13, 14, 15, 6, 32, 31, 30, 29, 28, 27, 26, 25, 4, 23, 22, 21, 20, 19, 18, 17);
    private Vector256<short> _indicesShort256 = Vector256.Create((short)1, 4, 6, 3, 10, 5, short.MaxValue, 0, 1, 13, 2, 14, 9, 8, 7, 6);
    private Vector256<int> _indicesInt256 = Vector256.Create(5, -6, 0, 3, 1, 2, 6, 7);
    private Vector256<long> _indicesLong256 = Vector256.Create(1, 2, 0, 3);

    // Source vectors for the "BothIndirect" probes, where the shuffled vector is also read from a field.
    private Vector256<byte> _vectorByte256 = Vector256.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
    private Vector256<short> _vectorShort256 = Vector256.Create((short)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

    /// <summary>
    /// Empty, non-inlined sink that receives each shuffle result.
    /// </summary>
    // NOTE(review): presumably NoInlining is what keeps the shuffle from being optimised away - confirm.
    [MethodImpl(MethodImplOptions.NoInlining)]
    private static void Consume<T>(T value)
    {
    }

    /// <summary>Applies ShuffleUnsafe to an inline 16 x byte vector using the field-held byte indices.</summary>
    public void ShuffleUnsafeByte128Indirect()
    {
        Vector128<byte> source = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        Consume(Vector128.ShuffleUnsafe(source, _indicesByte128));
    }

    /// <summary>Applies ShuffleUnsafe to an inline 8 x short vector using the field-held short indices.</summary>
    public void ShuffleUnsafeShort128Indirect()
    {
        Vector128<short> source = Vector128.Create((short)0, 1, 2, 3, 4, 5, 6, 7);
        Consume(Vector128.ShuffleUnsafe(source, _indicesShort128));
    }

    /// <summary>Applies ShuffleUnsafe to an inline 4 x int vector using the field-held int indices.</summary>
    public void ShuffleUnsafeInt128Indirect()
    {
        Vector128<int> source = Vector128.Create(0, 1, 2, 3);
        Consume(Vector128.ShuffleUnsafe(source, _indicesInt128));
    }

    /// <summary>Applies ShuffleUnsafe to an inline 2 x long vector using the field-held long indices.</summary>
    public void ShuffleUnsafeLong128Indirect()
    {
        Vector128<long> source = Vector128.Create(0, 1);
        Consume(Vector128.ShuffleUnsafe(source, _indicesLong128));
    }

    /// <summary>Applies ShuffleUnsafe to an inline 32 x byte vector using the field-held byte indices.</summary>
    public void ShuffleUnsafeByte256Indirect()
    {
        Vector256<byte> source = Vector256.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
        Consume(Vector256.ShuffleUnsafe(source, _indicesByte256));
    }

    /// <summary>Applies ShuffleUnsafe to the field-held 32 x byte vector using the field-held byte indices.</summary>
    public void ShuffleUnsafeByte256BothIndirect()
    {
        Consume(Vector256.ShuffleUnsafe(_vectorByte256, _indicesByte256));
    }

    /// <summary>Applies ShuffleUnsafe to an inline 16 x short vector using the field-held short indices.</summary>
    public void ShuffleUnsafeShort256Indirect()
    {
        Vector256<short> source = Vector256.Create((short)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        Consume(Vector256.ShuffleUnsafe(source, _indicesShort256));
    }

    /// <summary>Applies ShuffleUnsafe to the field-held 16 x short vector using the field-held short indices.</summary>
    public void ShuffleUnsafeShort256BothIndirect()
    {
        Consume(Vector256.ShuffleUnsafe(_vectorShort256, _indicesShort256));
    }

    /// <summary>Applies ShuffleUnsafe to an inline 8 x int vector using the field-held int indices.</summary>
    public void ShuffleUnsafeInt256Indirect()
    {
        Vector256<int> source = Vector256.Create(0, 1, 2, 3, 4, 5, 6, 7);
        Consume(Vector256.ShuffleUnsafe(source, _indicesInt256));
    }

    /// <summary>Applies ShuffleUnsafe to an inline 4 x long vector using the field-held long indices.</summary>
    public void ShuffleUnsafeLong256Indirect()
    {
        Vector256<long> source = Vector256.Create(0, 1, 2, 3);
        Consume(Vector256.ShuffleUnsafe(source, _indicesLong256));
    }
}

; Assembly listing for method ShuffleUnsafeBenchAvx2Only:ShuffleUnsafeByte128Indirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 24
 
G_M000_IG02:                ;; offset=0x0004
       vmovups  xmm0, xmmword ptr [reloc @RWD00]
       vpshufb  xmm0, xmm0, xmmword ptr [rdi+0x08]
       vmovups  xmmword ptr [rsp], xmm0
       call     [ShuffleUnsafeBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector128`1[ubyte]](System.Runtime.Intrinsics.Vector128`1[ubyte])]
       nop      
 
G_M000_IG03:                ;; offset=0x001E
       add      rsp, 24
       ret      
 
RWD00  	dq	0706050403020100h, 0F0E0D0C0B0A0908h

; Total bytes of code 35

; Assembly listing for method ShuffleUnsafeBenchAvx2Only:ShuffleUnsafeShort128Indirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 24
 
G_M000_IG02:                ;; offset=0x0004
       vmovups  xmm0, xmmword ptr [rdi+0x18]
       vpsllw   xmm0, xmm0, 1
       vpshufb  xmm0, xmm0, xmmword ptr [reloc @RWD00]
       vpor     xmm0, xmm0, xmmword ptr [reloc @RWD16]
       vmovups  xmm1, xmmword ptr [reloc @RWD32]
       vpshufb  xmm0, xmm1, xmm0
       vmovups  xmmword ptr [rsp], xmm0
       call     [ShuffleUnsafeBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector128`1[short]](System.Runtime.Intrinsics.Vector128`1[short])]
       nop      
 
G_M000_IG03:                ;; offset=0x0038
       add      rsp, 24
       ret      
 
RWD00  	dq	0606040402020000h, 0E0E0C0C0A0A0808h
RWD16  	dq	0100010001000100h, 0100010001000100h
RWD32  	dq	0003000200010000h, 0007000600050004h

; Total bytes of code 61

; Assembly listing for method ShuffleUnsafeBenchAvx2Only:ShuffleUnsafeInt128Indirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 24
 
G_M000_IG02:                ;; offset=0x0004
       vmovups  xmm0, xmmword ptr [reloc @RWD00]
       vpermilps xmm0, xmm0, xmmword ptr [rdi+0x28]
       vmovups  xmmword ptr [rsp], xmm0
       call     [ShuffleUnsafeBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector128`1[int]](System.Runtime.Intrinsics.Vector128`1[int])]
       nop      
 
G_M000_IG03:                ;; offset=0x001E
       add      rsp, 24
       ret      
 
RWD00  	dq	0000000100000000h, 0000000300000002h

; Total bytes of code 35

; Assembly listing for method ShuffleUnsafeBenchAvx2Only:ShuffleUnsafeLong128Indirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 24
 
G_M000_IG02:                ;; offset=0x0004
       vmovups  xmm0, xmmword ptr [rdi+0x38]
       vpsllq   xmm0, xmm0, 1
       vpermilps xmm0, xmm0, xmmword ptr [reloc @RWD00]
       vpor     xmm0, xmm0, xmmword ptr [reloc @RWD16]
       vmovups  xmm1, xmmword ptr [reloc @RWD32]
       vpermilps xmm0, xmm1, xmm0
       vmovups  xmmword ptr [rsp], xmm0
       call     [ShuffleUnsafeBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector128`1[long]](System.Runtime.Intrinsics.Vector128`1[long])]
       nop      
 
G_M000_IG03:                ;; offset=0x0038
       add      rsp, 24
       ret      
 
RWD00  	dq	0000000000000000h, 0000000200000002h
RWD16  	dq	0000000100000000h, 0000000100000000h
RWD32  	dq	0000000000000000h, 0000000000000001h

; Total bytes of code 61

; Assembly listing for method ShuffleUnsafeBenchAvx2Only:ShuffleUnsafeByte256Indirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rbp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       push     rbp
       sub      rsp, 32
       lea      rbp, [rsp+0x20]
 
G_M000_IG02:                ;; offset=0x000A
       vmovups  ymm0, ymmword ptr [rdi+0x48]
       vmovups  ymm1, ymmword ptr [reloc @RWD00]
       vpshufb  ymm1, ymm1, ymm0
       vmovups  ymm2, ymmword ptr [reloc @RWD32]
       vpshufb  ymm2, ymm2, ymm0
       vpxor    ymm0, ymm0, ymmword ptr [reloc @RWD64]
       vpcmpgtb ymm0, ymm0, ymmword ptr [reloc @RWD96]
       vpblendvb ymm0, ymm1, ymm2, ymm0
       vmovups  ymmword ptr [rsp], ymm0
       call     [ShuffleUnsafeBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector256`1[ubyte]](System.Runtime.Intrinsics.Vector256`1[ubyte])]
       nop      
 
G_M000_IG03:                ;; offset=0x004B
       vzeroupper 
       add      rsp, 32
       pop      rbp
       ret      
 
RWD00  	dq	0706050403020100h, 0F0E0D0C0B0A0908h, 1716151413121110h, 1F1E1D1C1B1A1918h
RWD32  	dq	1716151413121110h, 1F1E1D1C1B1A1918h, 0706050403020100h, 0F0E0D0C0B0A0908h
RWD64  	dq	0000000000000000h, 0000000000000000h, 1010101010101010h, 1010101010101010h
RWD96  	dq	0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh

; Total bytes of code 84

; Assembly listing for method ShuffleUnsafeBenchAvx2Only:ShuffleUnsafeByte256BothIndirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rbp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       push     rbp
       sub      rsp, 32
       lea      rbp, [rsp+0x20]
 
G_M000_IG02:                ;; offset=0x000A
       vmovups  ymm0, ymmword ptr [rdi+0xC8]
       vmovups  ymm1, ymmword ptr [rdi+0x48]
       vpshufb  ymm2, ymm0, ymm1
       vperm2i128 ymm0, ymm0, ymm0, 1
       vpshufb  ymm0, ymm0, ymm1
       vpxor    ymm1, ymm1, ymmword ptr [reloc @RWD00]
       vpcmpgtb ymm1, ymm1, ymmword ptr [reloc @RWD32]
       vpblendvb ymm0, ymm2, ymm0, ymm1
       vmovups  ymmword ptr [rsp], ymm0
       call     [ShuffleUnsafeBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector256`1[ubyte]](System.Runtime.Intrinsics.Vector256`1[ubyte])]
       nop      
 
G_M000_IG03:                ;; offset=0x0049
       vzeroupper 
       add      rsp, 32
       pop      rbp
       ret      
 
RWD00  	dq	0000000000000000h, 0000000000000000h, 1010101010101010h, 1010101010101010h
RWD32  	dq	0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh

; Total bytes of code 82

; Assembly listing for method ShuffleUnsafeBenchAvx2Only:ShuffleUnsafeShort256Indirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rbp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       push     rbp
       sub      rsp, 32
       lea      rbp, [rsp+0x20]
 
G_M000_IG02:                ;; offset=0x000A
       vmovups  ymm0, ymmword ptr [rdi+0x68]
       vpsllw   ymm0, ymm0, 1
       vpshufb  ymm0, ymm0, ymmword ptr [reloc @RWD00]
       vpor     ymm0, ymm0, ymmword ptr [reloc @RWD32]
       vmovups  ymm1, ymmword ptr [reloc @RWD64]
       vpshufb  ymm1, ymm1, ymm0
       vmovups  ymm2, ymmword ptr [reloc @RWD96]
       vpshufb  ymm2, ymm2, ymm0
       vpxor    ymm0, ymm0, ymmword ptr [reloc @RWD128]
       vpcmpgtb ymm0, ymm0, ymmword ptr [reloc @RWD160]
       vpblendvb ymm0, ymm1, ymm2, ymm0
       vmovups  ymmword ptr [rsp], ymm0
       call     [ShuffleUnsafeBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector256`1[short]](System.Runtime.Intrinsics.Vector256`1[short])]
       nop      
 
G_M000_IG03:                ;; offset=0x0061
       vzeroupper 
       add      rsp, 32
       pop      rbp
       ret      
 
RWD00  	dq	0606040402020000h, 0E0E0C0C0A0A0808h, 0606040402020000h, 0E0E0C0C0A0A0808h
RWD32  	dq	0100010001000100h, 0100010001000100h, 0100010001000100h, 0100010001000100h
RWD64  	dq	0003000200010000h, 0007000600050004h, 000B000A00090008h, 000F000E000D000Ch
RWD96  	dq	000B000A00090008h, 000F000E000D000Ch, 0003000200010000h, 0007000600050004h
RWD128 	dq	0000000000000000h, 0000000000000000h, 1010101010101010h, 1010101010101010h
RWD160 	dq	0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh

; Total bytes of code 106

; Assembly listing for method ShuffleUnsafeBenchAvx2Only:ShuffleUnsafeShort256BothIndirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rbp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       push     rbp
       sub      rsp, 32
       lea      rbp, [rsp+0x20]
 
G_M000_IG02:                ;; offset=0x000A
       vmovups  ymm0, ymmword ptr [rdi+0xE8]
       vmovups  ymm1, ymmword ptr [rdi+0x68]
       vpsllw   ymm1, ymm1, 1
       vpshufb  ymm1, ymm1, ymmword ptr [reloc @RWD00]
       vpor     ymm1, ymm1, ymmword ptr [reloc @RWD32]
       vpshufb  ymm2, ymm0, ymm1
       vperm2i128 ymm0, ymm0, ymm0, 1
       vpshufb  ymm0, ymm0, ymm1
       vpxor    ymm1, ymm1, ymmword ptr [reloc @RWD64]
       vpcmpgtb ymm1, ymm1, ymmword ptr [reloc @RWD96]
       vpblendvb ymm0, ymm2, ymm0, ymm1
       vmovups  ymmword ptr [rsp], ymm0
       call     [ShuffleUnsafeBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector256`1[short]](System.Runtime.Intrinsics.Vector256`1[short])]
       nop      
 
G_M000_IG03:                ;; offset=0x005F
       vzeroupper 
       add      rsp, 32
       pop      rbp
       ret      
 
RWD00  	dq	0606040402020000h, 0E0E0C0C0A0A0808h, 0606040402020000h, 0E0E0C0C0A0A0808h
RWD32  	dq	0100010001000100h, 0100010001000100h, 0100010001000100h, 0100010001000100h
RWD64  	dq	0000000000000000h, 0000000000000000h, 1010101010101010h, 1010101010101010h
RWD96  	dq	0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh

; Total bytes of code 104

; Assembly listing for method ShuffleUnsafeBenchAvx2Only:ShuffleUnsafeInt256Indirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rbp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       push     rbp
       sub      rsp, 32
       lea      rbp, [rsp+0x20]
 
G_M000_IG02:                ;; offset=0x000A
       vmovups  ymm0, ymmword ptr [rdi+0x88]
       vpermd   ymm0, ymm0, ymmword ptr [reloc @RWD00]
       vmovups  ymmword ptr [rsp], ymm0
       call     [ShuffleUnsafeBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector256`1[int]](System.Runtime.Intrinsics.Vector256`1[int])]
       nop      
 
G_M000_IG03:                ;; offset=0x0027
       vzeroupper 
       add      rsp, 32
       pop      rbp
       ret      
 
RWD00  	dq	0000000100000000h, 0000000300000002h, 0000000500000004h, 0000000700000006h

; Total bytes of code 48

; Assembly listing for method ShuffleUnsafeBenchAvx2Only:ShuffleUnsafeLong256Indirect():this (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rbp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       push     rbp
       sub      rsp, 32
       lea      rbp, [rsp+0x20]
 
G_M000_IG02:                ;; offset=0x000A
       vmovups  ymm0, ymmword ptr [rdi+0xA8]
       vpsllq   ymm0, ymm0, 1
       vmovups  ymm1, ymmword ptr [reloc @RWD00]
       vpermd   ymm0, ymm1, ymm0
       vpor     ymm0, ymm0, ymmword ptr [reloc @RWD32]
       vpermd   ymm0, ymm0, ymmword ptr [reloc @RWD64]
       vmovups  ymmword ptr [rsp], ymm0
       call     [ShuffleUnsafeBenchAvx2Only:Consume[System.Runtime.Intrinsics.Vector256`1[long]](System.Runtime.Intrinsics.Vector256`1[long])]
       nop      
 
G_M000_IG03:                ;; offset=0x0041
       vzeroupper 
       add      rsp, 32
       pop      rbp
       ret      
 
RWD00  	dq	0000000000000000h, 0000000200000002h, 0000000400000004h, 0000000600000006h
RWD32  	dq	0000000100000000h, 0000000100000000h, 0000000100000000h, 0000000100000000h
RWD64  	dq	0000000000000000h, 0000000000000001h, 0000000000000002h, 0000000000000003h

; Total bytes of code 74

Shuffle With Sse4.2

// Benchmark cases that call Vector128.Shuffle with index vectors read from instance fields,
// one case per element type (byte, short, int, long).
// NOTE(review): the class name suggests these are meant to run with only SSE4.x enabled;
// the hardware/JIT configuration is not part of this snippet - confirm against the run setup.
public class ShuffleBenchSse4Only
{
    //not constant
    // The index vectors are stored in fields instead of being written inline at the call sites.
    // Several entries lie outside the valid element range of their vector:
    // 90 (16 byte lanes), 10 and short.MaxValue (8 short lanes), 5 and -6 (4 int lanes).
    private Vector128<byte> indicesByte128 = Vector128.Create((byte)5, 2, 6, 15, 2, 90, 14, 0, 0, 7, 8, 12, 13, 14, 15, 6);
    private Vector128<short> indicesShort128 = Vector128.Create((short)1, 4, 6, 3, 10, 5, short.MaxValue, 0);
    private Vector128<int> indicesInt128 = Vector128.Create(5, -6, 0, 3);
    private Vector128<long> indicesLong128 = Vector128.Create(1, 0);

    // Empty generic sink that is never inlined.
    // Presumably exists so each shuffle result has a use the JIT cannot drop - TODO confirm.
    [MethodImpl(MethodImplOptions.NoInlining)]
    private static void Consume<T>(T value) { }

    // Each case shuffles a vector whose element i holds the value i (0, 1, 2, ...) by the
    // matching index field and hands the result to Consume.

    // byte lanes (16 elements)
    public void ShuffleByte128Indirect() => Consume(Vector128.Shuffle(Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), indicesByte128));

    // short lanes (8 elements)
    public void ShuffleShort128Indirect() => Consume(Vector128.Shuffle(Vector128.Create((short)0, 1, 2, 3, 4, 5, 6, 7), indicesShort128));

    // int lanes (4 elements)
    public void ShuffleInt128Indirect() => Consume(Vector128.Shuffle(Vector128.Create(0, 1, 2, 3), indicesInt128));

    // long lanes (2 elements)
    public void ShuffleLong128Indirect() => Consume(Vector128.Shuffle(Vector128.Create(0, 1), indicesLong128));
}

; Assembly listing for method ShuffleBenchSse4Only:ShuffleByte128Indirect():this (Tier1)
; Emitting BLENDED_CODE for generic X64 - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 24
 
G_M000_IG02:                ;; offset=0x0004
       movups   xmm1, xmmword ptr [reloc @RWD00]
       pcmpgtb  xmm1, xmm0
       movups   xmm0, xmmword ptr [rdi+0x08]
       movups   xmm2, xmmword ptr [reloc @RWD16]
       pshufb   xmm2, xmm0
       pand     xmm1, xmm2
       movups   xmmword ptr [rsp], xmm1
       call     [ShuffleBenchSse4Only:Consume[System.Runtime.Intrinsics.Vector128`1[ubyte]](System.Runtime.Intrinsics.Vector128`1[ubyte])]
       nop      
 
G_M000_IG03:                ;; offset=0x002E
       add      rsp, 24
       ret      
 
RWD00  	dq	1010101010101010h, 1010101010101010h
RWD16  	dq	0706050403020100h, 0F0E0D0C0B0A0908h

; Total bytes of code 51

; Assembly listing for method ShuffleBenchSse4Only:ShuffleShort128Indirect():this (Tier1)
; Emitting BLENDED_CODE for generic X64 - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 40
 
G_M000_IG02:                ;; offset=0x0004
       movups   xmm0, xmmword ptr [reloc @RWD00]
       movaps   xmm1, xmmword ptr [rsp+0x10]
       psubw    xmm1, xmm0
       movups   xmm0, xmmword ptr [reloc @RWD16]
       pcmpgtw  xmm0, xmm1
       movups   xmm1, xmmword ptr [rdi+0x18]
       psllw    xmm1, 1
       movups   xmm2, xmmword ptr [reloc @RWD32]
       pshufb   xmm1, xmm2
       movups   xmm2, xmmword ptr [reloc @RWD48]
       por      xmm1, xmm2
       movups   xmm2, xmmword ptr [reloc @RWD64]
       pshufb   xmm2, xmm1
       pand     xmm0, xmm2
       movups   xmmword ptr [rsp], xmm0
       call     [ShuffleBenchSse4Only:Consume[System.Runtime.Intrinsics.Vector128`1[short]](System.Runtime.Intrinsics.Vector128`1[short])]
       nop      
 
G_M000_IG03:                ;; offset=0x005A
       add      rsp, 40
       ret      
 
RWD00  	dq	8000800080008000h, 8000800080008000h
RWD16  	dq	8008800880088008h, 8008800880088008h
RWD32  	dq	0606040402020000h, 0E0E0C0C0A0A0808h
RWD48  	dq	0100010001000100h, 0100010001000100h
RWD64  	dq	0003000200010000h, 0007000600050004h

; Total bytes of code 95

; Assembly listing for method ShuffleBenchSse4Only:ShuffleInt128Indirect():this (Tier1)
; Emitting BLENDED_CODE for generic X64 - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 40
 
G_M000_IG02:                ;; offset=0x0004
       movups   xmm0, xmmword ptr [reloc @RWD00]
       movaps   xmm1, xmmword ptr [rsp+0x10]
       psubd    xmm1, xmm0
       movups   xmm0, xmmword ptr [reloc @RWD16]
       pcmpgtd  xmm0, xmm1
       movups   xmm1, xmmword ptr [rdi+0x28]
       pslld    xmm1, 2
       movups   xmm2, xmmword ptr [reloc @RWD32]
       pshufb   xmm1, xmm2
       movups   xmm2, xmmword ptr [reloc @RWD48]
       por      xmm1, xmm2
       movups   xmm2, xmmword ptr [reloc @RWD64]
       pshufb   xmm2, xmm1
       pand     xmm0, xmm2
       movups   xmmword ptr [rsp], xmm0
       call     [ShuffleBenchSse4Only:Consume[System.Runtime.Intrinsics.Vector128`1[int]](System.Runtime.Intrinsics.Vector128`1[int])]
       nop      
 
G_M000_IG03:                ;; offset=0x005A
       add      rsp, 40
       ret      
 
RWD00  	dq	8000000080000000h, 8000000080000000h
RWD16  	dq	8000000480000004h, 8000000480000004h
RWD32  	dq	0404040400000000h, 0C0C0C0C08080808h
RWD48  	dq	0302010003020100h, 0302010003020100h
RWD64  	dq	0000000100000000h, 0000000300000002h

; Total bytes of code 95

; Assembly listing for method ShuffleBenchSse4Only:ShuffleLong128Indirect():this (Tier1)
; Emitting BLENDED_CODE for generic X64 - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 40
 
G_M000_IG02:                ;; offset=0x0004
       movups   xmm0, xmmword ptr [reloc @RWD00]
       movaps   xmm1, xmmword ptr [rsp+0x10]
       psubq    xmm1, xmm0
       movups   xmm0, xmmword ptr [reloc @RWD16]
       pcmpgtq  xmm0, xmm1
       movups   xmm1, xmmword ptr [rdi+0x38]
       psllq    xmm1, 3
       movups   xmm2, xmmword ptr [reloc @RWD32]
       pshufb   xmm1, xmm2
       movups   xmm2, xmmword ptr [reloc @RWD48]
       por      xmm1, xmm2
       movups   xmm2, xmmword ptr [reloc @RWD64]
       pshufb   xmm2, xmm1
       pand     xmm0, xmm2
       movups   xmmword ptr [rsp], xmm0
       call     [ShuffleBenchSse4Only:Consume[System.Runtime.Intrinsics.Vector128`1[long]](System.Runtime.Intrinsics.Vector128`1[long])]
       nop      
 
G_M000_IG03:                ;; offset=0x005B
       add      rsp, 40
       ret      
 
RWD00  	dq	8000000000000000h, 8000000000000000h
RWD16  	dq	8000000000000002h, 8000000000000002h
RWD32  	dq	0000000000000000h, 0808080808080808h
RWD48  	dq	0706050403020100h, 0706050403020100h
RWD64  	dq	0000000000000000h, 0000000000000001h

; Total bytes of code 96

ShuffleUnsafe With Sse4.2

// Benchmark cases that call Vector128.ShuffleUnsafe with index vectors read from instance fields,
// one case per element type (byte, short, int, long).
// NOTE(review): the class name suggests these are meant to run with only SSE4.x enabled;
// the hardware/JIT configuration is not part of this snippet - confirm against the run setup.
public class ShuffleUnsafeBenchSse4Only
{
    //not constant
    // The index vectors are stored in fields instead of being written inline at the call sites.
    // Several entries lie outside the valid element range of their vector:
    // 90 (16 byte lanes), 10 and short.MaxValue (8 short lanes), 5 and -6 (4 int lanes).
    private Vector128<byte> indicesByte128 = Vector128.Create((byte)5, 2, 6, 15, 2, 90, 14, 0, 0, 7, 8, 12, 13, 14, 15, 6);
    private Vector128<short> indicesShort128 = Vector128.Create((short)1, 4, 6, 3, 10, 5, short.MaxValue, 0);
    private Vector128<int> indicesInt128 = Vector128.Create(5, -6, 0, 3);
    private Vector128<long> indicesLong128 = Vector128.Create(1, 0);

    // Empty generic sink that is never inlined.
    // Presumably exists so each shuffle result has a use the JIT cannot drop - TODO confirm.
    [MethodImpl(MethodImplOptions.NoInlining)]
    private static void Consume<T>(T value) { }

    // Each case shuffles a vector whose element i holds the value i (0, 1, 2, ...) by the
    // matching index field via ShuffleUnsafe and hands the result to Consume.

    // byte lanes (16 elements)
    public void ShuffleUnsafeByte128Indirect() => Consume(Vector128.ShuffleUnsafe(Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), indicesByte128));

    // short lanes (8 elements)
    public void ShuffleUnsafeShort128Indirect() => Consume(Vector128.ShuffleUnsafe(Vector128.Create((short)0, 1, 2, 3, 4, 5, 6, 7), indicesShort128));

    // int lanes (4 elements)
    public void ShuffleUnsafeInt128Indirect() => Consume(Vector128.ShuffleUnsafe(Vector128.Create(0, 1, 2, 3), indicesInt128));

    // long lanes (2 elements)
    public void ShuffleUnsafeLong128Indirect() => Consume(Vector128.ShuffleUnsafe(Vector128.Create(0, 1), indicesLong128));
}

; Assembly listing for method ShuffleUnsafeBenchSse4Only:ShuffleUnsafeByte128Indirect():this (Tier1)
; Emitting BLENDED_CODE for generic X64 - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 24
 
G_M000_IG02:                ;; offset=0x0004
       movups   xmm0, xmmword ptr [rdi+0x08]
       movups   xmm1, xmmword ptr [reloc @RWD00]
       pshufb   xmm1, xmm0
       movups   xmmword ptr [rsp], xmm1
       call     [ShuffleUnsafeBenchSse4Only:Consume[System.Runtime.Intrinsics.Vector128`1[ubyte]](System.Runtime.Intrinsics.Vector128`1[ubyte])]
       nop      
 
G_M000_IG03:                ;; offset=0x001F
       add      rsp, 24
       ret      
 
RWD00  	dq	0706050403020100h, 0F0E0D0C0B0A0908h

; Total bytes of code 36

; Assembly listing for method ShuffleUnsafeBenchSse4Only:ShuffleUnsafeShort128Indirect():this (Tier1)
; Emitting BLENDED_CODE for generic X64 - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 24
 
G_M000_IG02:                ;; offset=0x0004
       movups   xmm0, xmmword ptr [rdi+0x18]
       psllw    xmm0, 1
       movups   xmm1, xmmword ptr [reloc @RWD00]
       pshufb   xmm0, xmm1
       movups   xmm1, xmmword ptr [reloc @RWD16]
       por      xmm0, xmm1
       movups   xmm1, xmmword ptr [reloc @RWD32]
       pshufb   xmm1, xmm0
       movups   xmmword ptr [rsp], xmm1
       call     [ShuffleUnsafeBenchSse4Only:Consume[System.Runtime.Intrinsics.Vector128`1[short]](System.Runtime.Intrinsics.Vector128`1[short])]
       nop      
 
G_M000_IG03:                ;; offset=0x003B
       add      rsp, 24
       ret      
 
RWD00  	dq	0606040402020000h, 0E0E0C0C0A0A0808h
RWD16  	dq	0100010001000100h, 0100010001000100h
RWD32  	dq	0003000200010000h, 0007000600050004h

; Total bytes of code 64

; Assembly listing for method ShuffleUnsafeBenchSse4Only:ShuffleUnsafeInt128Indirect():this (Tier1)
; Emitting BLENDED_CODE for generic X64 - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 24
 
G_M000_IG02:                ;; offset=0x0004
       movups   xmm0, xmmword ptr [rdi+0x28]
       pslld    xmm0, 2
       movups   xmm1, xmmword ptr [reloc @RWD00]
       pshufb   xmm0, xmm1
       movups   xmm1, xmmword ptr [reloc @RWD16]
       por      xmm0, xmm1
       movups   xmm1, xmmword ptr [reloc @RWD32]
       pshufb   xmm1, xmm0
       movups   xmmword ptr [rsp], xmm1
       call     [ShuffleUnsafeBenchSse4Only:Consume[System.Runtime.Intrinsics.Vector128`1[int]](System.Runtime.Intrinsics.Vector128`1[int])]
       nop      
 
G_M000_IG03:                ;; offset=0x003B
       add      rsp, 24
       ret      
 
RWD00  	dq	0404040400000000h, 0C0C0C0C08080808h
RWD16  	dq	0302010003020100h, 0302010003020100h
RWD32  	dq	0000000100000000h, 0000000300000002h

; Total bytes of code 64

; Assembly listing for method ShuffleUnsafeBenchSse4Only:ShuffleUnsafeLong128Indirect():this (Tier1)
; Emitting BLENDED_CODE for generic X64 - Apple
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data

G_M000_IG01:                ;; offset=0x0000
       sub      rsp, 24
 
G_M000_IG02:                ;; offset=0x0004
       movups   xmm0, xmmword ptr [rdi+0x38]
       psllq    xmm0, 3
       movups   xmm1, xmmword ptr [reloc @RWD00]
       pshufb   xmm0, xmm1
       movups   xmm1, xmmword ptr [reloc @RWD16]
       por      xmm0, xmm1
       movups   xmm1, xmmword ptr [reloc @RWD32]
       pshufb   xmm1, xmm0
       movups   xmmword ptr [rsp], xmm1
       call     [ShuffleUnsafeBenchSse4Only:Consume[System.Runtime.Intrinsics.Vector128`1[long]](System.Runtime.Intrinsics.Vector128`1[long])]
       nop      
 
G_M000_IG03:                ;; offset=0x003B
       add      rsp, 24
       ret      
 
RWD00  	dq	0000000000000000h, 0808080808080808h
RWD16  	dq	0706050403020100h, 0706050403020100h
RWD32  	dq	0000000000000000h, 0000000000000001h

; Total bytes of code 64

dotnet-issue-labeler · 2024-03-12T11:29:36Z

Note regarding the new-api-needs-documentation label:

This serves as a reminder for when your PR is modifying a ref *.cs file and adding/modifying public APIs, please make sure the API implementation in the src *.cs file is documented with triple slash comments, so the PR reviewers can sign off that change.

hamarb123 · 2024-03-12T11:35:00Z

Benchmark results of my AVX2 code (ShuffleUnsafe256):

https://gist.github.com/hamarb123/c4e994a896653a46c2788df4cd6bfc74

Yes, this is a very micro benchmark, but the results are pretty reproducible on my machine (usually within ~10%), and are probably pretty close to reality since it should be pretty quick (but obviously this doesn't measure the overhead with surrounding code due to more pipeline usage, etc.).

(edit: there was likely an issue with this benchmark turning into a no-op)

src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs

MihaZupan

Thank you for looking into this again.

We're already using the so-far-internal Vector128.ShuffleUnsafe in a bunch of places. Should we be using Vector256.ShuffleUnsafe somewhere?

src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs

src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs

src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs

hamarb123 · 2024-03-12T18:55:40Z

We're already using the so-far-internal Vector128.ShuffleUnsafe in a bunch of places. Should we be using Vector256.ShuffleUnsafe somewhere?

It seems to me that all the current uses of Vector128.ShuffleUnsafe have Avx2.Shuffle on their V256 branch, which indicates they do not need the additional guarantees that Vector256.ShuffleUnsafe provides (around between-lane shuffling working), so I don't think there is currently a spot in dotnet/runtime where we should be using it, unless I've missed one.

src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs

hamarb123 · 2024-03-25T08:55:40Z

Can someone please check I won't accidentally regress mono :)
I've attempted to implement a very basic ShuffleUnsafe for mono, but it might need to be more advanced.

fanyang-mono · 2024-03-25T14:34:39Z

Mono changes look good to me. Thanks for your contribution.

src/coreclr/jit/gentree.cpp

MihaZupan · 2024-03-26T11:16:18Z

Re: 9868e73
SearchValues rely on Vector128.ShuffleUnsafe using exactly Ssse3.Shuffle semantics whenever Ssse3.IsSupported (hence the comment). If we're not making that guarantee anymore, we'll need to stop using this helper there.

hamarb123 · 2024-03-26T11:25:31Z

Re: 9868e73 SearchValues rely on Vector128.ShuffleUnsafe using exactly Ssse3.Shuffle semantics whenever Ssse3.IsSupported (hence the comment). If we're not making that guarantee anymore, we'll need to stop using this helper there.

I don't think there's any issue with the runtime relying on specific behaviour.

For external libraries, I think one of the following approaches makes sense:

We document the current behaviour and say that it's only guaranteed to be exactly this in this .NET version; future ones might add additional optimisations
Or we just don't document how they work and which specific instructions they use

I think the approach needs to be consistent for all of them, so I removed the Vector128.ShuffleUnsafe comment. I can include it within the method implementation if you'd like, as a reference for the runtime developers.

Another option, which I briefly mentioned in a comment somewhere, is to expose a variant like VectorXXX.ShuffleUnsafeHighZero - which would mean that if the high bit is set (of whatever byte we think is most convenient for the larger types, probably the first one in memory I would think / or of the value, would have to think more about what would be the best solution), then you're guaranteed to get 0. This would only require some of the implementations to have special handling to ensure this, while the rest could remain the same, still giving greater performance than VectorXXX.Shuffle in the general case, but also allowing external consumers of the API to guarantee they can get 0 when they need it.

MihaZupan · 2024-03-26T11:32:59Z

I'm fine with only documenting "anything above 15 is UB".
Just wanted to note that this is behavior we are currently relying on internally, and we'll want to either keep the behavior or account for it with internal callers.

hamarb123 · 2024-03-26T11:34:05Z

I'm fine with only documenting "anything above 15 is UB". Just wanted to note that this is behavior we are currently relying on internally, and we'll want to either keep the behavior or account for it with internal callers.

Yes, I've been careful to not use the AVX-512 one for this method for this reason. I will add a comment at some point to explain this in the method (assuming I don't forget).

EgorBot · 2024-08-07T02:09:14Z

Benchmark results on Intel

BenchmarkDotNet v0.14.0, Ubuntu 22.04.4 LTS (Jammy Jellyfish)
Intel Xeon Platinum 8370C CPU 2.80GHz, 1 CPU, 8 logical and 4 physical cores
Job=Job-VQIPRA  EnvironmentVariables=DOTNET_EnableAVX512F=0,DOTNET_EnableAVX2=0,DOTNET_EnableAVX=0  Toolchain=Main
RatioSD=?

Method	Mean	Error	Ratio
ShuffleByte128Indirect	NA	NA	?
Benchmarks with issues:
ShuffleBench.ShuffleByte128Indirect: Job-VQIPRA(EnvironmentVariables=DOTNET_EnableAVX512F=0,DOTNET_EnableAVX2=0,DOTNET_EnableAVX=0, Toolchain=Main)

BDN_Artifacts.zip

- Optimise comparison in `gtNewSimdShuffleNodeVariable` for xarch - Optimise for constant vector in Vector256.Shuffle{Unsafe} when have AVX2 only

src/coreclr/jit/gentree.cpp

dotnet-issue-labeler bot added area-System.Runtime.Intrinsics new-api-needs-documentation labels Mar 12, 2024

dotnet-policy-service bot added the community-contribution Indicates that the PR has been added by a community member label Mar 12, 2024

colejohnson66 reviewed Mar 12, 2024

View reviewed changes

src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs Outdated Show resolved Hide resolved

MihaZupan reviewed Mar 12, 2024

View reviewed changes

tannergooding reviewed Mar 18, 2024

View reviewed changes

src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs Outdated Show resolved Hide resolved

hamarb123 requested a review from fanyang-mono as a code owner March 25, 2024 08:53

filipnavara added the tenet-performance Performance related issue label Mar 25, 2024

This was referenced Mar 25, 2024

[browser][MT] TIMED_OUT but 0 was expected #99888

Closed

WASM test failing with OOM #100111

Closed

tannergooding reviewed Mar 25, 2024

View reviewed changes

src/coreclr/jit/gentree.cpp Outdated Show resolved Hide resolved

This comment was marked as outdated.

Sign in to view

hamarb123 added 2 commits August 7, 2024 19:09

Optimise some codegen

b9be44e

- Optimise comparison in `gtNewSimdShuffleNodeVariable` for xarch - Optimise for constant vector in Vector256.Shuffle{Unsafe} when have AVX2 only

jit format

1423e85

hamarb123 marked this pull request as ready for review August 7, 2024 09:51

hamarb123 requested a review from tannergooding August 7, 2024 09:51

hamarb123 commented Aug 7, 2024

View reviewed changes

src/coreclr/jit/gentree.cpp Outdated Show resolved Hide resolved

hamarb123 requested a review from MihaZupan August 7, 2024 10:01

hamarb123 added 2 commits August 7, 2024 20:02

jit format

ff76287

Simplify logic for using Shuffle for ShuffleUnsafe

ca1a5fa

Merge branch 'main' into main12

d64cad2

This was referenced Aug 20, 2024

slow macOS - "##[error]The job running on agent Azure Pipelines 9 ran longer than the maximum time of 60 minutes." dotnet/dnceng#1883

Open

'chrome-GetPropertiesTests' timing out #106625

Closed

Merge branch 'main' into main12

bb974ca

build-analysis bot mentioned this pull request Sep 11, 2024

[browser][MT] Could not load file or assembly System.Runtime.InteropServices #106067

Open

Merge branch 'main' into main12

c1ff983

hamarb123 requested a review from vitek-karas as a code owner November 13, 2024 05:29

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Implement `ShuffleUnsafe` methods and optimise `Shuffle` for non-constant `indices` #99596

Implement `ShuffleUnsafe` methods and optimise `Shuffle` for non-constant `indices` #99596

hamarb123 commented Mar 12, 2024 •

edited

Loading

dotnet-issue-labeler bot commented Mar 12, 2024

hamarb123 commented Mar 12, 2024 •

edited

Loading

MihaZupan left a comment

hamarb123 commented Mar 12, 2024

hamarb123 commented Mar 25, 2024

fanyang-mono commented Mar 25, 2024

MihaZupan commented Mar 26, 2024

hamarb123 commented Mar 26, 2024 •

edited

Loading

MihaZupan commented Mar 26, 2024

hamarb123 commented Mar 26, 2024 •

edited

Loading

This comment was marked as outdated.

EgorBot commented Aug 7, 2024

Implement ShuffleUnsafe methods and optimise Shuffle for non-constant indices #99596

Are you sure you want to change the base?

Implement ShuffleUnsafe methods and optimise Shuffle for non-constant indices #99596

Conversation

hamarb123 commented Mar 12, 2024 • edited Loading

dotnet-issue-labeler bot commented Mar 12, 2024

hamarb123 commented Mar 12, 2024 • edited Loading

MihaZupan left a comment

Choose a reason for hiding this comment

hamarb123 commented Mar 12, 2024

hamarb123 commented Mar 25, 2024

fanyang-mono commented Mar 25, 2024

MihaZupan commented Mar 26, 2024

hamarb123 commented Mar 26, 2024 • edited Loading

MihaZupan commented Mar 26, 2024

hamarb123 commented Mar 26, 2024 • edited Loading

This comment was marked as outdated.

EgorBot commented Aug 7, 2024

Implement `ShuffleUnsafe` methods and optimise `Shuffle` for non-constant `indices` #99596

Implement `ShuffleUnsafe` methods and optimise `Shuffle` for non-constant `indices` #99596

hamarb123 commented Mar 12, 2024 •

edited

Loading

hamarb123 commented Mar 12, 2024 •

edited

Loading

hamarb123 commented Mar 26, 2024 •

edited

Loading

hamarb123 commented Mar 26, 2024 •

edited

Loading