From 52151fc4c6642a5557f264e82470db39e53f342a Mon Sep 17 00:00:00 2001
From: Anders Cedronius
Date: Fri, 30 Dec 2022 13:55:40 +0100
Subject: [PATCH] update asm using pointer hand over

---
 FastQueueASM.h       | 16 +++++++++-------
 README.md            | 18 ++++++++++++++++++
 fastqueue_arm64.asm  | 24 +++++++++++++-----------
 fastqueue_x86_64.asm | 24 +++++++++++++-----------
 4 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/FastQueueASM.h b/FastQueueASM.h
index b6328bd..edeba7d 100644
--- a/FastQueueASM.h
+++ b/FastQueueASM.h
@@ -36,11 +36,13 @@ namespace FastQueueASM {
         };
 
         alignas(L1_CACHE) volatile uint8_t mBorderUpp[L1_CACHE];
-        alignas(L1_CACHE) volatile uint64_t mWritePosition;             //L1CACHE * 1
-        alignas(L1_CACHE) volatile uint64_t mReadPosition;              //L1CACHE * 2
-        alignas(L1_CACHE) volatile uint64_t mExitThread;                //L1CACHE * 3
-        alignas(L1_CACHE) volatile uint64_t mExitThreadSemaphore;       //L1CACHE * 4
-        alignas(L1_CACHE) volatile mAlign mRingBuffer[BUFFER_MASK + 1]; //L1CACHE * 5
+        alignas(L1_CACHE) volatile uint64_t mWritePositionPush;         //L1CACHE * 1
+        alignas(L1_CACHE) volatile uint64_t mReadPositionPush;          //L1CACHE * 2
+        alignas(L1_CACHE) volatile uint64_t mWritePositionPop;          //L1CACHE * 3
+        alignas(L1_CACHE) volatile uint64_t mReadPositionPop;           //L1CACHE * 4
+        alignas(L1_CACHE) volatile uint64_t mExitThread;                //L1CACHE * 5
+        alignas(L1_CACHE) volatile uint64_t mExitThreadSemaphore;       //L1CACHE * 6
+        alignas(L1_CACHE) volatile mAlign mRingBuffer[BUFFER_MASK + 1]; //L1CACHE * 7
         alignas(L1_CACHE) volatile uint8_t mBorderDown[L1_CACHE];
     };
 
@@ -48,7 +50,7 @@ namespace FastQueueASM {
 
     DataBlock *newQueue() {
         //Verify the compiler generated data block
-        static_assert(sizeof(DataBlock) == ((4 * L1_CACHE) + ((BUFFER_MASK + 1) * L1_CACHE) + (L1_CACHE * 2)),
+        static_assert(sizeof(DataBlock) == ((6 * L1_CACHE) + ((BUFFER_MASK + 1) * L1_CACHE) + (L1_CACHE * 2)),
                       "FastQueueASM::DataBlock is not matching expected size");
 #ifdef _MSC_VER
         auto pData = (DataBlock *)_aligned_malloc(sizeof(DataBlock), L1_CACHE);
@@ -87,7 +89,7 @@ namespace FastQueueASM {
 
     //Stop queue (Maybe called from any thread)
     void stopQueue(DataBlock *pData) {
-        pData->mExitThread = pData->mWritePosition;
+        pData->mExitThread = pData->mWritePositionPush;
         pData->mExitThreadSemaphore = true;
     }
 
diff --git a/README.md b/README.md
index 2399494..ce6d88f 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,24 @@ FastQueueASM Transactions -> 9471164/s
 However, on X64 platforms I don't see the same gain in my benchmarks. With that said Rigtorps queue is really the one to beat ;-) .
+
+**AMD EPYC 7763 64-Core Processor**
+
+```
+BoostLockFree pointer test started.
+BoostLockFree pointer test ended.
+BoostLockFree Transactions -> 6851164/s
+FastQueue pointer test started.
+FastQueue pointer test ended.
+FastQueue Transactions -> 8516819/s
+Rigtorp pointer test started.
+Rigtorp pointer test ended.
+Rigtorp Transactions -> 8332916/s
+FastQueueASM pointer test started.
+FastQueueASM pointer test ended.
+FastQueueASM Transactions -> 8856282/s
+```
+
 The queue is a header only template class and is implemented in a few lines of C++.
 The code compiles on arm64 or x86_64 CPU's running Windows, MacOS or Linux OS.
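The two assembly files below address the new DataBlock layout through hard-coded multiples of L1_CACHE, while the patch only keeps the sizeof static_assert as a guard. A hedged sketch of per-field offset checks one could place next to it (hypothetical additions, not part of this commit; they assume the member names and the L1_CACHE constant from FastQueueASM.h):

```cpp
#include <cstddef>
#include "FastQueueASM.h"

// Hypothetical compile-time guards: the hand-written assembly expects each
// counter at exactly base + L1_CACHE * n, matching the comments in the header.
static_assert(offsetof(FastQueueASM::DataBlock, mWritePositionPush)   == L1_CACHE * 1, "asm offset mismatch");
static_assert(offsetof(FastQueueASM::DataBlock, mReadPositionPush)    == L1_CACHE * 2, "asm offset mismatch");
static_assert(offsetof(FastQueueASM::DataBlock, mWritePositionPop)    == L1_CACHE * 3, "asm offset mismatch");
static_assert(offsetof(FastQueueASM::DataBlock, mReadPositionPop)     == L1_CACHE * 4, "asm offset mismatch");
static_assert(offsetof(FastQueueASM::DataBlock, mExitThread)          == L1_CACHE * 5, "asm offset mismatch");
static_assert(offsetof(FastQueueASM::DataBlock, mExitThreadSemaphore) == L1_CACHE * 6, "asm offset mismatch");
static_assert(offsetof(FastQueueASM::DataBlock, mRingBuffer)          == L1_CACHE * 7, "asm offset mismatch");
```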
diff --git a/fastqueue_arm64.asm b/fastqueue_arm64.asm
index 85c5730..249bf6a 100644
--- a/fastqueue_arm64.asm
+++ b/fastqueue_arm64.asm
@@ -12,15 +12,15 @@
 _pop_item:
     mov x3, x0
-    ldr x1, [x0, #L1_CACHE * 2]        ;mReadPosition
+    ldr x1, [x0, #L1_CACHE * 4]        ;mReadPositionPop
 pop_loop:
-    ldr x2, [x3, #L1_CACHE * 1]        ;mWritePosition
+    ldr x2, [x3, #L1_CACHE * 3]        ;mWritePositionPop
     cmp x1,x2
     bne entry_found
-    ldr x4, [x3, #L1_CACHE * 3]        ;mExitThread
+    ldr x4, [x3, #L1_CACHE * 5]        ;mExitThread
     cmp x4, x1
     bne pop_loop
-    ldr x5, [x3, #L1_CACHE * 4]        ;mExitThreadSemaphore (1 = true)
+    ldr x5, [x3, #L1_CACHE * 6]        ;mExitThreadSemaphore (1 = true)
     cmp x5, #0
     beq pop_loop
     eor x0, x0, x0
     ret
@@ -29,19 +29,20 @@ entry_found:
     add x2, x1, #1
     and x1, x1, BUFFER_MASK
     lsl x1, x1, SHIFT_NO
-    add x1, x1, #L1_CACHE * 5          ;mRingBuffer
+    add x1, x1, #L1_CACHE * 7          ;mRingBuffer
     ldr x0, [x3, x1]
     dmb ishld
-    str x2, [x3, #L1_CACHE * 2]        ;mReadPosition
+    str x2, [x3, #L1_CACHE * 4]        ;mReadPositionPop
+    str x2, [x3, #L1_CACHE * 2]        ;mReadPositionPush
     ret
 
 _push_item:
-    ldr x2, [x0, #L1_CACHE * 1]        ;mWritePosition
+    ldr x2, [x0, #L1_CACHE * 1]        ;mWritePositionPush
 push_loop:
-    ldr x3, [x0, #L1_CACHE * 4]        ;mExitThreadSemaphore (1 = true)
+    ldr x3, [x0, #L1_CACHE * 6]        ;mExitThreadSemaphore (1 = true)
     cmp x3, #0
     bne exit_loop
-    ldr x4, [x0, #L1_CACHE * 2]        ;mReadPosition
+    ldr x4, [x0, #L1_CACHE * 2]        ;mReadPositionPush
     sub x3, x2, x4
     cmp x3, BUFFER_MASK
     bge push_loop
@@ -49,10 +50,11 @@ push_loop:
     mov x3, x2
     add x2, x2, #1
     and x3, x3, BUFFER_MASK
     lsl x3, x3, SHIFT_NO
-    add x3, x3, #L1_CACHE * 5          ;mRingBuffer
+    add x3, x3, #L1_CACHE * 7          ;mRingBuffer
     str x1,[x0, x3]
     dmb ishst
-    str x2,[x0, #L1_CACHE * 1]         ;mWritePosition
+    str x2,[x0, #L1_CACHE * 1]         ;mWritePositionPush
+    str x2,[x0, #L1_CACHE * 3]         ;mWritePositionPop
 exit_loop:
     ret
diff --git a/fastqueue_x86_64.asm b/fastqueue_x86_64.asm
index 22c30c0..a79c51c 100644
--- a/fastqueue_x86_64.asm
+++ b/fastqueue_x86_64.asm
@@ -21,32 +21,33 @@ verify_cache_size:
     ret
 
 push_item:
-    mov r11, [rdi + (L1_CACHE * 1)]    ;mWritePosition
+    mov r11, [rdi + (L1_CACHE * 1)]    ;mWritePositionPush
 push_loop:
-    cmp [rdi + (L1_CACHE * 4)], byte 0 ;mExitThreadSemaphore
+    cmp [rdi + (L1_CACHE * 6)], byte 0 ;mExitThreadSemaphore
     jnz exit_loop
     mov rcx, r11
-    sub rcx, [rdi + (L1_CACHE * 2)]    ;mReadPosition
+    sub rcx, [rdi + (L1_CACHE * 2)]    ;mReadPositionPush
     cmp rcx, BUFFER_MASK
     jge push_loop
     mov rax, r11
     inc r11
     and rax, BUFFER_MASK
     shl rax, SHIFT_NO
-    add rax, (L1_CACHE * 5)            ;mRingBuffer
+    add rax, (L1_CACHE * 7)            ;mRingBuffer
     mov [rdi + rax], rsi
     sfence
-    mov [rdi + (L1_CACHE * 1)], r11    ;mWritePosition
+    mov [rdi + (L1_CACHE * 1)], r11    ;mWritePositionPush
+    mov [rdi + (L1_CACHE * 3)], r11    ;mWritePositionPop
 exit_loop:
     ret
 
 pop_item:
-    mov rcx, [rdi + (L1_CACHE * 2)]    ;mReadPosition
-    cmp rcx, [rdi + (L1_CACHE * 1)]    ;mWritePosition
+    mov rcx, [rdi + (L1_CACHE * 4)]    ;mReadPositionPop
+    cmp rcx, [rdi + (L1_CACHE * 3)]    ;mWritePositionPop
     jne entry_found
-    sub rcx, [rdi + (L1_CACHE * 3)]    ;mExitThread (0 = true)
+    sub rcx, [rdi + (L1_CACHE * 5)]    ;mExitThread (0 = true)
     jnz pop_item
-    cmp [rdi + (L1_CACHE * 4)], byte 0 ;mExitThreadSemaphore (1 = true)
+    cmp [rdi + (L1_CACHE * 6)], byte 0 ;mExitThreadSemaphore (1 = true)
     jz pop_item
     xor rax, rax
     ret
@@ -55,9 +56,10 @@ entry_found:
     mov r11, rcx
     inc r11
     and rcx, BUFFER_MASK
     shl rcx, SHIFT_NO
-    add rcx, (L1_CACHE * 5)            ;mRingBuffer
+    add rcx, (L1_CACHE * 7)            ;mRingBuffer
     mov rax, [rdi + rcx]
     lfence
-    mov [rdi + (L1_CACHE * 2)], r11    ;mReadPosition
+    mov [rdi + (L1_CACHE * 4)], r11    ;mReadPositionPop
+    mov [rdi + (L1_CACHE * 2)], r11    ;mReadPositionPush
     ret
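The "pointer hand over" both assembly files implement works like this: each side spins only on counters that live in cache lines it owns, and after a completed operation it stores the updated position into the other side's mirror copy as well. A rough single-producer/single-consumer C++ rendering of the same scheme (a sketch only: kCacheLine and kBufferMask are assumed values, std::atomic release/acquire ordering stands in for the sfence/lfence and dmb barriers, and the mExitThread shutdown path is omitted):

```cpp
#include <atomic>
#include <cstdint>

constexpr std::size_t kCacheLine = 64;   // assumed; the real code uses L1_CACHE
constexpr uint64_t kBufferMask = 1023;   // assumed ring size minus one (power of two)

struct HandOverQueue {
    // Producer-owned lines (L1_CACHE * 1 and * 2 in the patch).
    alignas(kCacheLine) std::atomic<uint64_t> mWritePositionPush{0};
    alignas(kCacheLine) std::atomic<uint64_t> mReadPositionPush{0};
    // Consumer-owned lines (L1_CACHE * 3 and * 4).
    alignas(kCacheLine) std::atomic<uint64_t> mWritePositionPop{0};
    alignas(kCacheLine) std::atomic<uint64_t> mReadPositionPop{0};
    // Simplified: the real code pads each slot to a full cache line (mAlign).
    alignas(kCacheLine) void* mRingBuffer[kBufferMask + 1]{};
};

// Producer: polls only its own cache lines, then hands the new write
// position over to the consumer's mirror (mWritePositionPop).
void push_item(HandOverQueue& q, void* pItem) {
    uint64_t lWrite = q.mWritePositionPush.load(std::memory_order_relaxed);
    while (lWrite - q.mReadPositionPush.load(std::memory_order_acquire) >= kBufferMask) {
        // Full: spin until the consumer's hand-over lands in mReadPositionPush.
    }
    q.mRingBuffer[lWrite & kBufferMask] = pItem;
    ++lWrite;
    // Publish the slot before the counters (the sfence / dmb ishst in the asm),
    // then mirror the new position to both sides.
    q.mWritePositionPush.store(lWrite, std::memory_order_release);
    q.mWritePositionPop.store(lWrite, std::memory_order_release);
}

// Consumer: the mirror image, handing the read position back to the producer.
void* pop_item(HandOverQueue& q) {
    uint64_t lRead = q.mReadPositionPop.load(std::memory_order_relaxed);
    while (lRead == q.mWritePositionPop.load(std::memory_order_acquire)) {
        // Empty: spin until the producer's hand-over lands in mWritePositionPop.
    }
    void* pItem = q.mRingBuffer[lRead & kBufferMask];
    ++lRead;
    q.mReadPositionPop.store(lRead, std::memory_order_release);
    q.mReadPositionPush.store(lRead, std::memory_order_release);
    return pItem;
}
```

Duplicating the counters costs two extra stores per operation, but in exchange neither side's polling loop ever reads a cache line that the other side updates on every iteration; the lines only change hands once per completed push or pop.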