From 52151fc4c6642a5557f264e82470db39e53f342a Mon Sep 17 00:00:00 2001
From: Anders Cedronius
Date: Fri, 30 Dec 2022 13:55:40 +0100
Subject: [PATCH] update asm using pointer hand over

---
 FastQueueASM.h       | 16 +++++++++-------
 README.md            | 18 ++++++++++++++++++
 fastqueue_arm64.asm  | 24 +++++++++++++-----------
 fastqueue_x86_64.asm | 24 +++++++++++++-----------
 4 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/FastQueueASM.h b/FastQueueASM.h
index b6328bd..edeba7d 100644
--- a/FastQueueASM.h
+++ b/FastQueueASM.h
@@ -36,11 +36,13 @@ namespace FastQueueASM {
         };
 
         alignas(L1_CACHE) volatile uint8_t mBorderUpp[L1_CACHE];
-        alignas(L1_CACHE) volatile uint64_t mWritePosition;             //L1CACHE * 1
-        alignas(L1_CACHE) volatile uint64_t mReadPosition;              //L1CACHE * 2
-        alignas(L1_CACHE) volatile uint64_t mExitThread;                //L1CACHE * 3
-        alignas(L1_CACHE) volatile uint64_t mExitThreadSemaphore;       //L1CACHE * 4
-        alignas(L1_CACHE) volatile mAlign mRingBuffer[BUFFER_MASK + 1]; //L1CACHE * 5
+        alignas(L1_CACHE) volatile uint64_t mWritePositionPush;         //L1CACHE * 1
+        alignas(L1_CACHE) volatile uint64_t mReadPositionPush;          //L1CACHE * 2
+        alignas(L1_CACHE) volatile uint64_t mWritePositionPop;          //L1CACHE * 3
+        alignas(L1_CACHE) volatile uint64_t mReadPositionPop;           //L1CACHE * 4
+        alignas(L1_CACHE) volatile uint64_t mExitThread;                //L1CACHE * 5
+        alignas(L1_CACHE) volatile uint64_t mExitThreadSemaphore;       //L1CACHE * 6
+        alignas(L1_CACHE) volatile mAlign mRingBuffer[BUFFER_MASK + 1]; //L1CACHE * 7
         alignas(L1_CACHE) volatile uint8_t mBorderDown[L1_CACHE];
     };
 
@@ -48,7 +50,7 @@ namespace FastQueueASM {
 
     DataBlock *newQueue() {
         //Verify the compiler generated data block
-        static_assert(sizeof(DataBlock) == ((4 * L1_CACHE) + ((BUFFER_MASK + 1) * L1_CACHE) + (L1_CACHE * 2)),
+        static_assert(sizeof(DataBlock) == ((6 * L1_CACHE) + ((BUFFER_MASK + 1) * L1_CACHE) + (L1_CACHE * 2)),
                       "FastQueueASM::DataBlock is not matching expected size");
 #ifdef _MSC_VER
         auto pData = (DataBlock *)_aligned_malloc(sizeof(DataBlock), L1_CACHE);
@@ -87,7 +89,7 @@ namespace FastQueueASM {
 
     //Stop queue (Maybe called from any thread)
     void stopQueue(DataBlock *pData) {
-        pData->mExitThread = pData->mWritePosition;
+        pData->mExitThread = pData->mWritePositionPush;
         pData->mExitThreadSemaphore = true;
     }
 
diff --git a/README.md b/README.md
index 2399494..ce6d88f 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,24 @@ FastQueueASM Transactions -> 9471164/s
 However, on X64 platforms I don't see the same gain in my benchmarks. With that said Rigtorps queue is really the one to beat ;-) .
+
+**AMD EPYC 7763 64-Core Processor**
+
+```
+BoostLockFree pointer test started.
+BoostLockFree pointer test ended.
+BoostLockFree Transactions -> 6851164/s
+FastQueue pointer test started.
+FastQueue pointer test ended.
+FastQueue Transactions -> 8516819/s
+Rigtorp pointer test started.
+Rigtorp pointer test ended.
+Rigtorp Transactions -> 8332916/s
+FastQueueASM pointer test started.
+FastQueueASM pointer test ended.
+FastQueueASM Transactions -> 8856282/s
+```
+
 The queue is a header only template class and is implemented in a few lines of C++.
 The code compiles on arm64 or x86_64 CPU's running Windows, MacOS or Linux OS.
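The two assembly files below address the new DataBlock layout through hard-coded multiples of L1_CACHE, while the patch only keeps the sizeof static_assert as a guard. A hedged sketch of per-field offset checks one could place next to it (hypothetical additions, not part of this commit; they assume the member names and the L1_CACHE constant from FastQueueASM.h):

```cpp
#include <cstddef>
#include "FastQueueASM.h"

// Hypothetical compile-time guards: the hand-written assembly expects each
// counter at exactly base + L1_CACHE * n, matching the comments in the header.
static_assert(offsetof(FastQueueASM::DataBlock, mWritePositionPush)   == L1_CACHE * 1, "asm offset mismatch");
static_assert(offsetof(FastQueueASM::DataBlock, mReadPositionPush)    == L1_CACHE * 2, "asm offset mismatch");
static_assert(offsetof(FastQueueASM::DataBlock, mWritePositionPop)    == L1_CACHE * 3, "asm offset mismatch");
static_assert(offsetof(FastQueueASM::DataBlock, mReadPositionPop)     == L1_CACHE * 4, "asm offset mismatch");
static_assert(offsetof(FastQueueASM::DataBlock, mExitThread)          == L1_CACHE * 5, "asm offset mismatch");
static_assert(offsetof(FastQueueASM::DataBlock, mExitThreadSemaphore) == L1_CACHE * 6, "asm offset mismatch");
static_assert(offsetof(FastQueueASM::DataBlock, mRingBuffer)          == L1_CACHE * 7, "asm offset mismatch");
```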
diff --git a/fastqueue_arm64.asm b/fastqueue_arm64.asm
index 85c5730..249bf6a 100644
--- a/fastqueue_arm64.asm
+++ b/fastqueue_arm64.asm
@@ -12,15 +12,15 @@
 _pop_item:
     mov x3, x0
-    ldr x1, [x0, #L1_CACHE * 2]        ;mReadPosition
+    ldr x1, [x0, #L1_CACHE * 4]        ;mReadPositionPop
 pop_loop:
-    ldr x2, [x3, #L1_CACHE * 1]        ;mWritePosition
+    ldr x2, [x3, #L1_CACHE * 3]        ;mWritePositionPop
     cmp x1,x2
     bne entry_found
-    ldr x4, [x3, #L1_CACHE * 3]        ;mExitThread
+    ldr x4, [x3, #L1_CACHE * 5]        ;mExitThread
     cmp x4, x1
     bne pop_loop
-    ldr x5, [x3, #L1_CACHE * 4]        ;mExitThreadSemaphore (1 = true)
+    ldr x5, [x3, #L1_CACHE * 6]        ;mExitThreadSemaphore (1 = true)
     cmp x5, #0
     beq pop_loop
     eor x0, x0, x0
     ret
@@ -29,19 +29,20 @@ entry_found:
     add x2, x1, #1
     and x1, x1, BUFFER_MASK
     lsl x1, x1, SHIFT_NO
-    add x1, x1, #L1_CACHE * 5          ;mRingBuffer
+    add x1, x1, #L1_CACHE * 7          ;mRingBuffer
     ldr x0, [x3, x1]
     dmb ishld
-    str x2, [x3, #L1_CACHE * 2]        ;mReadPosition
+    str x2, [x3, #L1_CACHE * 4]        ;mReadPositionPop
+    str x2, [x3, #L1_CACHE * 2]        ;mReadPositionPush
     ret
 
 _push_item:
-    ldr x2, [x0, #L1_CACHE * 1]        ;mWritePosition
+    ldr x2, [x0, #L1_CACHE * 1]        ;mWritePositionPush
 push_loop:
-    ldr x3, [x0, #L1_CACHE * 4]        ;mExitThreadSemaphore (1 = true)
+    ldr x3, [x0, #L1_CACHE * 6]        ;mExitThreadSemaphore (1 = true)
     cmp x3, #0
     bne exit_loop
-    ldr x4, [x0, #L1_CACHE * 2]        ;mReadPosition
+    ldr x4, [x0, #L1_CACHE * 2]        ;mReadPositionPush
     sub x3, x2, x4
     cmp x3, BUFFER_MASK
     bge push_loop
@@ -49,10 +50,11 @@ push_loop:
     mov x3, x2
     add x2, x2, #1
     and x3, x3, BUFFER_MASK
     lsl x3, x3, SHIFT_NO
-    add x3, x3, #L1_CACHE * 5          ;mRingBuffer
+    add x3, x3, #L1_CACHE * 7          ;mRingBuffer
     str x1,[x0, x3]
     dmb ishst
-    str x2,[x0, #L1_CACHE * 1]         ;mWritePosition
+    str x2,[x0, #L1_CACHE * 1]         ;mWritePositionPush
+    str x2,[x0, #L1_CACHE * 3]         ;mWritePositionPop
 exit_loop:
     ret
diff --git a/fastqueue_x86_64.asm b/fastqueue_x86_64.asm
index 22c30c0..a79c51c 100644
--- a/fastqueue_x86_64.asm
+++ b/fastqueue_x86_64.asm
@@ -21,32 +21,33 @@ verify_cache_size:
     ret
 
 push_item:
-    mov r11, [rdi + (L1_CACHE * 1)]    ;mWritePosition
+    mov r11, [rdi + (L1_CACHE * 1)]    ;mWritePositionPush
 push_loop:
-    cmp [rdi + (L1_CACHE * 4)], byte 0 ;mExitThreadSemaphore
+    cmp [rdi + (L1_CACHE * 6)], byte 0 ;mExitThreadSemaphore
     jnz exit_loop
     mov rcx, r11
-    sub rcx, [rdi + (L1_CACHE * 2)]    ;mReadPosition
+    sub rcx, [rdi + (L1_CACHE * 2)]    ;mReadPositionPush
     cmp rcx, BUFFER_MASK
     jge push_loop
     mov rax, r11
     inc r11
     and rax, BUFFER_MASK
     shl rax, SHIFT_NO
-    add rax, (L1_CACHE * 5)            ;mRingBuffer
+    add rax, (L1_CACHE * 7)            ;mRingBuffer
     mov [rdi + rax], rsi
     sfence
-    mov [rdi + (L1_CACHE * 1)], r11    ;mWritePosition
+    mov [rdi + (L1_CACHE * 1)], r11    ;mWritePositionPush
+    mov [rdi + (L1_CACHE * 3)], r11    ;mWritePositionPop
 exit_loop:
     ret
 
 pop_item:
-    mov rcx, [rdi + (L1_CACHE * 2)]    ;mReadPosition
-    cmp rcx, [rdi + (L1_CACHE * 1)]    ;mWritePosition
+    mov rcx, [rdi + (L1_CACHE * 4)]    ;mReadPositionPop
+    cmp rcx, [rdi + (L1_CACHE * 3)]    ;mWritePositionPop
     jne entry_found
-    sub rcx, [rdi + (L1_CACHE * 3)]    ;mExitThread (0 = true)
+    sub rcx, [rdi + (L1_CACHE * 5)]    ;mExitThread (0 = true)
     jnz pop_item
-    cmp [rdi + (L1_CACHE * 4)], byte 0 ;mExitThreadSemaphore (1 = true)
+    cmp [rdi + (L1_CACHE * 6)], byte 0 ;mExitThreadSemaphore (1 = true)
     jz pop_item
     xor rax, rax
     ret
@@ -55,9 +56,10 @@ entry_found:
     mov r11, rcx
     inc r11
     and rcx, BUFFER_MASK
     shl rcx, SHIFT_NO
-    add rcx, (L1_CACHE * 5)            ;mRingBuffer
+    add rcx, (L1_CACHE * 7)            ;mRingBuffer
     mov rax, [rdi + rcx]
     lfence
-    mov [rdi + (L1_CACHE * 2)], r11    ;mReadPosition
+    mov [rdi + (L1_CACHE * 4)], r11    ;mReadPositionPop
+    mov [rdi + (L1_CACHE * 2)], r11    ;mReadPositionPush
     ret
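The "pointer hand over" both assembly files implement works like this: each side spins only on counters that live in cache lines it owns, and after a completed operation it stores the updated position into the other side's mirror copy as well. A rough single-producer/single-consumer C++ rendering of the same scheme (a sketch only: kCacheLine and kBufferMask are assumed values, std::atomic release/acquire ordering stands in for the sfence/lfence and dmb barriers, and the mExitThread shutdown path is omitted):

```cpp
#include <atomic>
#include <cstdint>

constexpr std::size_t kCacheLine = 64;   // assumed; the real code uses L1_CACHE
constexpr uint64_t kBufferMask = 1023;   // assumed ring size minus one (power of two)

struct HandOverQueue {
    // Producer-owned lines (L1_CACHE * 1 and * 2 in the patch).
    alignas(kCacheLine) std::atomic<uint64_t> mWritePositionPush{0};
    alignas(kCacheLine) std::atomic<uint64_t> mReadPositionPush{0};
    // Consumer-owned lines (L1_CACHE * 3 and * 4).
    alignas(kCacheLine) std::atomic<uint64_t> mWritePositionPop{0};
    alignas(kCacheLine) std::atomic<uint64_t> mReadPositionPop{0};
    // Simplified: the real code pads each slot to a full cache line (mAlign).
    alignas(kCacheLine) void* mRingBuffer[kBufferMask + 1]{};
};

// Producer: polls only its own cache lines, then hands the new write
// position over to the consumer's mirror (mWritePositionPop).
void push_item(HandOverQueue& q, void* pItem) {
    uint64_t lWrite = q.mWritePositionPush.load(std::memory_order_relaxed);
    while (lWrite - q.mReadPositionPush.load(std::memory_order_acquire) >= kBufferMask) {
        // Full: spin until the consumer's hand-over lands in mReadPositionPush.
    }
    q.mRingBuffer[lWrite & kBufferMask] = pItem;
    ++lWrite;
    // Publish the slot before the counters (the sfence / dmb ishst in the asm),
    // then mirror the new position to both sides.
    q.mWritePositionPush.store(lWrite, std::memory_order_release);
    q.mWritePositionPop.store(lWrite, std::memory_order_release);
}

// Consumer: the mirror image, handing the read position back to the producer.
void* pop_item(HandOverQueue& q) {
    uint64_t lRead = q.mReadPositionPop.load(std::memory_order_relaxed);
    while (lRead == q.mWritePositionPop.load(std::memory_order_acquire)) {
        // Empty: spin until the producer's hand-over lands in mWritePositionPop.
    }
    void* pItem = q.mRingBuffer[lRead & kBufferMask];
    ++lRead;
    q.mReadPositionPop.store(lRead, std::memory_order_release);
    q.mReadPositionPush.store(lRead, std::memory_order_release);
    return pItem;
}
```

Duplicating the counters costs two extra stores per operation, but in exchange neither side's polling loop ever reads a cache line that the other side updates on every iteration; the lines only change hands once per completed push or pop.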