Skip to content

Commit

Permalink
update asm using pointer hand over
Browse files Browse the repository at this point in the history
  • Loading branch information
andersc committed Dec 30, 2022
1 parent 9b00feb commit 52151fc
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 29 deletions.
16 changes: 9 additions & 7 deletions FastQueueASM.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,21 @@ namespace FastQueueASM {
};

alignas(L1_CACHE) volatile uint8_t mBorderUpp[L1_CACHE];
alignas(L1_CACHE) volatile uint64_t mWritePosition; //L1CACHE * 1
alignas(L1_CACHE) volatile uint64_t mReadPosition; //L1CACHE * 2
alignas(L1_CACHE) volatile uint64_t mExitThread; //L1CACHE * 3
alignas(L1_CACHE) volatile uint64_t mExitThreadSemaphore; //L1CACHE * 4
alignas(L1_CACHE) volatile mAlign mRingBuffer[BUFFER_MASK + 1]; //L1CACHE * 5
alignas(L1_CACHE) volatile uint64_t mWritePositionPush; //L1CACHE * 1
alignas(L1_CACHE) volatile uint64_t mReadPositionPush; //L1CACHE * 2
alignas(L1_CACHE) volatile uint64_t mWritePositionPop; //L1CACHE * 3
alignas(L1_CACHE) volatile uint64_t mReadPositionPop; //L1CACHE * 4
alignas(L1_CACHE) volatile uint64_t mExitThread; //L1CACHE * 5
alignas(L1_CACHE) volatile uint64_t mExitThreadSemaphore; //L1CACHE * 6
alignas(L1_CACHE) volatile mAlign mRingBuffer[BUFFER_MASK + 1]; //L1CACHE * 7
alignas(L1_CACHE) volatile uint8_t mBorderDown[L1_CACHE];
};

//Allocate a new queue
DataBlock *newQueue() {

//Verify the compiler generated data block
static_assert(sizeof(DataBlock) == ((4 * L1_CACHE) + ((BUFFER_MASK + 1) * L1_CACHE) + (L1_CACHE * 2)),
static_assert(sizeof(DataBlock) == ((6 * L1_CACHE) + ((BUFFER_MASK + 1) * L1_CACHE) + (L1_CACHE * 2)),
"FastQueueASM::DataBlock is not matching expected size");
#ifdef _MSC_VER
auto pData = (DataBlock *)_aligned_malloc(sizeof(DataBlock), L1_CACHE);
Expand Down Expand Up @@ -87,7 +89,7 @@ namespace FastQueueASM {

//Stop queue (May be called from any thread)
//Signal the consumer to exit: record the producer's current write position so
//the consumer can drain any items still pending, then raise the exit semaphore.
//Note: the diff render had left both the old (mWritePosition) and new
//(mWritePositionPush) assignments in place; only the post-change line belongs here.
void stopQueue(DataBlock *pData) {
    pData->mExitThread = pData->mWritePositionPush;
    pData->mExitThreadSemaphore = true;
}

Expand Down
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,24 @@ FastQueueASM Transactions -> 9471164/s

However, on X64 platforms I don't see the same gain in my benchmarks. With that said, Rigtorp's queue is really the one to beat ;-).


**AMD EPYC 7763 64-Core Processor**

```
BoostLockFree pointer test started.
BoostLockFree pointer test ended.
BoostLockFree Transactions -> 6851164/s
FastQueue pointer test started.
FastQueue pointer test ended.
FastQueue Transactions -> 8516819/s
Rigtorp pointer test started.
Rigtorp pointer test ended.
Rigtorp Transactions -> 8332916/s
FastQueueASM pointer test started.
FastQueueASM pointer test ended.
FastQueueASM Transactions -> 8856282/s
```

The queue is a header only template class and is implemented in a few lines of C++.

The code compiles on arm64 or x86_64 CPU's running Windows, MacOS or Linux OS.
Expand Down
24 changes: 13 additions & 11 deletions fastqueue_arm64.asm
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@

_pop_item:
mov x3, x0
ldr x1, [x0, #L1_CACHE * 2] ;mReadPosition
ldr x1, [x0, #L1_CACHE * 4] ;mReadPositionPop
pop_loop:
ldr x2, [x3, #L1_CACHE * 1] ;mWritePosition
ldr x2, [x3, #L1_CACHE * 3] ;mWritePositionPop
cmp x1,x2
bne entry_found
ldr x4, [x3, #L1_CACHE * 3] ;mExitThread
ldr x4, [x3, #L1_CACHE * 5] ;mExitThread
cmp x4, x1
bne pop_loop
ldr x5, [x3, #L1_CACHE * 4] ;mExitThreadSemaphore (1 = true)
ldr x5, [x3, #L1_CACHE * 6] ;mExitThreadSemaphore (1 = true)
cmp x5, #0
beq pop_loop
eor x0, x0, x0
Expand All @@ -29,30 +29,32 @@ entry_found:
add x2, x1, #1
and x1, x1, BUFFER_MASK
lsl x1, x1, SHIFT_NO
add x1, x1, #L1_CACHE * 5 ;mRingBuffer
add x1, x1, #L1_CACHE * 7 ;mRingBuffer
ldr x0, [x3, x1]
dmb ishld
str x2, [x3, #L1_CACHE * 2] ;mReadPosition
str x2, [x3, #L1_CACHE * 4] ;mReadPositionPop
str x2, [x3, #L1_CACHE * 2] ;mReadPositionPush
ret

;-----------------------------------------------------------------------
; _push_item — producer side of the SPSC ring buffer (AArch64)
; In:  x0 = DataBlock*, x1 = item (pointer payload)
; The diff render duplicated pre/post-change lines; this is the resolved
; post-change body (split push/pop cursor copies at L1_CACHE*1..4).
;-----------------------------------------------------------------------
_push_item:
ldr x2, [x0, #L1_CACHE * 1] ;mWritePositionPush (producer's private cursor)
push_loop:
ldr x3, [x0, #L1_CACHE * 6] ;mExitThreadSemaphore (1 = true)
cmp x3, #0
bne exit_loop ;queue stopped — drop the item and return
ldr x4, [x0, #L1_CACHE * 2] ;mReadPositionPush (consumer progress, as seen by producer)
sub x3, x2, x4 ;items currently in flight
cmp x3, BUFFER_MASK
bge push_loop ;ring full — spin until the consumer catches up
mov x3, x2
add x2, x2, #1
and x3, x3, BUFFER_MASK ;slot index = write pos modulo ring size
lsl x3, x3, SHIFT_NO ;scale to byte offset (slot stride = 1 << SHIFT_NO)
add x3, x3, #L1_CACHE * 7 ;mRingBuffer base offset within DataBlock
str x1,[x0, x3] ;store the payload into its slot
dmb ishst ;store barrier: payload visible before position update
str x2,[x0, #L1_CACHE * 1] ;mWritePositionPush
str x2,[x0, #L1_CACHE * 3] ;mWritePositionPop (hand the new position to the consumer)
exit_loop:
ret

Expand Down
24 changes: 13 additions & 11 deletions fastqueue_x86_64.asm
Original file line number Diff line number Diff line change
Expand Up @@ -21,32 +21,33 @@ verify_cache_size:
ret

;-----------------------------------------------------------------------
; push_item — producer side of the SPSC ring buffer (x86-64, SysV)
; In:  rdi = DataBlock*, rsi = item (pointer payload)
; The diff render duplicated pre/post-change lines; this is the resolved
; post-change body (split push/pop cursor copies at L1_CACHE*1..4).
;-----------------------------------------------------------------------
push_item:
mov r11, [rdi + (L1_CACHE * 1)] ;mWritePositionPush (producer's private cursor)
push_loop:
cmp [rdi + (L1_CACHE * 6)], byte 0 ;mExitThreadSemaphore (1 = true)
jnz exit_loop ;queue stopped — drop the item and return
mov rcx, r11
sub rcx, [rdi + (L1_CACHE * 2)] ;mReadPositionPush -> items currently in flight
cmp rcx, BUFFER_MASK
jge push_loop ;ring full — spin until the consumer catches up
mov rax, r11
inc r11
and rax, BUFFER_MASK ;slot index = write pos modulo ring size
shl rax, SHIFT_NO ;scale to byte offset (slot stride = 1 << SHIFT_NO)
add rax, (L1_CACHE * 7) ;mRingBuffer base offset within DataBlock
mov [rdi + rax], rsi ;store the payload into its slot
sfence ;store barrier: payload visible before position update
mov [rdi + (L1_CACHE * 1)], r11 ;mWritePositionPush
mov [rdi + (L1_CACHE * 3)], r11 ;mWritePositionPop (hand the new position to the consumer)
exit_loop:
ret

pop_item:
mov rcx, [rdi + (L1_CACHE * 2)] ;mReadPosition
cmp rcx, [rdi + (L1_CACHE * 1)] ;mWritePosition
mov rcx, [rdi + (L1_CACHE * 4)] ;mReadPositionPop
cmp rcx, [rdi + (L1_CACHE * 3)] ;mWritePositionPop
jne entry_found
sub rcx, [rdi + (L1_CACHE * 3)] ;mExitThread (0 = true)
sub rcx, [rdi + (L1_CACHE * 5)] ;mExitThread (0 = true)
jnz pop_item
cmp [rdi + (L1_CACHE * 4)], byte 0 ;mExitThreadSemaphore (1 = true)
cmp [rdi + (L1_CACHE * 6)], byte 0 ;mExitThreadSemaphore (1 = true)
jz pop_item
xor rax, rax
ret
Expand All @@ -55,9 +56,10 @@ entry_found:
inc r11
and rcx, BUFFER_MASK
shl rcx, SHIFT_NO
add rcx, (L1_CACHE * 5) ;mRingBuffer
add rcx, (L1_CACHE * 7) ;mRingBuffer
mov rax, [rdi + rcx]
lfence
mov [rdi + (L1_CACHE * 2)], r11 ;mReadPosition
mov [rdi + (L1_CACHE * 4)], r11 ;mReadPositionPop
mov [rdi + (L1_CACHE * 2)], r11 ;mReadPositionPush
ret

0 comments on commit 52151fc

Please sign in to comment.