Arm64 assembly #513

Merged (12 commits) on Jan 8, 2025
9 changes: 0 additions & 9 deletions benchmarks-threadpool/wtime.h
@@ -28,15 +28,6 @@ static inline double Wtime_usec(void)
return tv.tv_sec * 1e6 + tv.tv_usec;
}

// Read time stamp counter on x86
static inline unsigned long long readtsc(void)
{
unsigned int lo, hi;
// RDTSC copies contents of 64-bit TSC into EDX:EAX
asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
return (unsigned long long)hi << 32 | lo;
}

#define WTIME_unique_var_name_paste(id, n) id ## n
#define WTIME_unique_var_name(id, n) WTIME_unique_var_name_paste(id, n)
#define WTIME_unique_var(id) WTIME_unique_var_name(id, __LINE__)
16 changes: 8 additions & 8 deletions constantine.nimble
@@ -240,10 +240,10 @@ proc genDynamicLib(outdir, nimcache: string) =
compile "constantine.dll"

elif defined(macosx) or defined(macos):
compile "libconstantine.dylib.arm", "--cpu:arm64 -l:'-target arm64-apple-macos11' -t:'-target arm64-apple-macos11'"
compile "libconstantine.dylib.x64", "--cpu:amd64 -l:'-target x86_64-apple-macos10.12' -t:'-target x86_64-apple-macos10.12'"
exec &"lipo {outdir}/libconstantine.dylib.arm " &
&" {outdir}/libconstantine.dylib.x64 " &
compile "libconstantine.arm64.dylib", "--cpu:arm64 -l:'-target arm64-apple-macos11' -t:'-target arm64-apple-macos11'"
compile "libconstantine.x86_64.dylib", "--cpu:amd64 -l:'-target x86_64-apple-macos10.12' -t:'-target x86_64-apple-macos10.12'"
exec &"lipo {outdir}/libconstantine.arm64.dylib " &
&" {outdir}/libconstantine.x86_64.dylib " &
&" -output {outdir}/libconstantine.dylib -create"

else:
@@ -272,10 +272,10 @@ proc genStaticLib(outdir, nimcache: string, extFlags = "") =
compile "constantine.lib"

elif defined(macosx) or defined(macos):
compile "libconstantine.a.arm", "--cpu:arm64 -l:'-target arm64-apple-macos11' -t:'-target arm64-apple-macos11'"
compile "libconstantine.a.x64", "--cpu:amd64 -l:'-target x86_64-apple-macos10.12' -t:'-target x86_64-apple-macos10.12'"
exec &"lipo {outdir}/libconstantine.a.arm " &
&" {outdir}/libconstantine.a.x64 " &
compile "libconstantine.arm64.a", "--cpu:arm64 -l:'-target arm64-apple-macos11' -t:'-target arm64-apple-macos11'"
compile "libconstantine.x86_64.a", "--cpu:amd64 -l:'-target x86_64-apple-macos10.12' -t:'-target x86_64-apple-macos10.12'"
exec &"lipo {outdir}/libconstantine.arm64.a " &
&" {outdir}/libconstantine.x86_64.a " &
&" -output {outdir}/libconstantine.a -create"

else:
2 changes: 1 addition & 1 deletion constantine/hashes/sha256/sha256_x86_shaext.nim
@@ -7,7 +7,7 @@
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
constantine/platforms/x86/simd_x86,
constantine/platforms/isa_x86/simd_x86,
constantine/platforms/primitives,
./sha256_generic

2 changes: 1 addition & 1 deletion constantine/hashes/sha256/sha256_x86_ssse3.nim
@@ -7,7 +7,7 @@
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
constantine/platforms/x86/simd_x86,
constantine/platforms/isa_x86/simd_x86,
constantine/platforms/primitives,
./sha256_generic

212 changes: 212 additions & 0 deletions constantine/math/arithmetic/assembly/limbs_asm_bigint_arm64.nim
@@ -0,0 +1,212 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
# Standard library
std/macros,
# Internal
constantine/platforms/abstractions

# ############################################################
#
# Assembly implementation of bigints
#
# ############################################################

static: doAssert UseASM_ARM64

# Copy
# ------------------------------------------------------------

macro ccopy_gen[N: static int](a_PIR: var Limbs[N], b_PIR: Limbs[N], ctl: SecretBool): untyped =
## Generate an optimized conditional copy kernel
result = newStmtList()

var ctx = init(Assembler_arm64, BaseType)

let
# MemOffsettable is the better constraint but
# with ARM64 we cannot generate array offsets from it due to inline ASM auto-bracketings
a = asmArray(a_PIR, N, PointerInReg, asmInput, memIndirect = memReadWrite)
b = asmArray(b_PIR, N, PointerInReg, asmInput, memIndirect = memRead)

control = asmValue(ctl, Reg, asmInput)

u0Sym = ident"u0"
u1Sym = ident"u1"
v0Sym = ident"v0"
v1Sym = ident"v1"

var # Swappable registers to break dependency chains
u0 = asmValue(u0Sym, Reg, asmOutputEarlyClobber)
u1 = asmValue(u1Sym, Reg, asmOutputEarlyClobber)
v0 = asmValue(v0Sym, Reg, asmOutputEarlyClobber)
v1 = asmValue(v1Sym, Reg, asmOutputEarlyClobber)

# Prologue
result.add quote do:
var `u0sym`{.noinit.}, `u1sym`{.noinit.}: BaseType
var `v0sym`{.noinit.}, `v1sym`{.noinit.}: BaseType

# Algorithm
if N >= 2:
ctx.ldp u0, u1, a[0]
ctx.ldp v0, v1, b[0]
else:
ctx.ldr u0, a[0]
ctx.ldr v0, b[0]

# Algorithm
ctx.cmp control, xzr # Check vs 0
for i in 0 ..< N:
ctx.csel u0, u0, v0, eq # Don't modify if eq 0
ctx.str u0, a[i]

# Next iteration
if i != N-1:
swap(u0, u1)
swap(v0, v1)
if i+2 < N:
ctx.ldr u1, a[i+2]
ctx.ldr v1, b[i+2]
Review comment (Owner Author): For now this uses fancy prefetching, but it is unclear whether that is beneficial on Apple Silicon (which can fetch/decode up to 8 instructions per cycle) or on the Raspberry Pi 5.
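
Aside (illustrative, not part of the diff): the swap/reload pattern above, repeated in the addition and subtraction kernels below, is two-deep software pipelining: u1/v1 always hold the limb pair that the next iteration will consume. A plain-Nim sketch of the same schedule, with a hypothetical use callback standing in for the csel/str (or adds/str, sbcs/str) step:

proc pipelinedWalk(a, b: openArray[uint64],
                   use: proc (x, y: uint64)) =
  ## Walks two limb arrays with a two-deep software pipeline:
  ## u1/v1 are loaded one iteration ahead of the pair being consumed.
  doAssert a.len >= 1 and b.len == a.len
  let n = a.len
  var u0 = a[0]
  var v0 = b[0]
  var u1, v1: uint64
  if n >= 2:
    u1 = a[1]              # prefetch the next limb pair
    v1 = b[1]
  for i in 0 ..< n:
    use(u0, v0)            # the csel/str (or adds/str, sbcs/str) step
    if i != n-1:
      swap(u0, u1)         # the prefetched pair becomes the current one
      swap(v0, v1)
      if i+2 < n:
        u1 = a[i+2]        # keep loading two limbs ahead
        v1 = b[i+2]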


# Codegen
result.add ctx.generate()

func ccopy_asm*(a: var Limbs, b: Limbs, ctl: SecretBool) =
## Constant-time conditional copy
## If ctl is true: b is copied into a
## if ctl is false: b is not copied and a is untouched
## Time and memory accesses are the same whether a copy occurs or not
ccopy_gen(a, b, ctl)
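
Aside (illustrative, not part of the diff): the kernel above computes the classic masked select. A minimal portable sketch of the same semantics in plain Nim; this is not Constantine's actual generic fallback and is not audited for constant-time code generation:

proc ccopyRef(a: var openArray[uint64], b: openArray[uint64], ctl: bool) =
  ## If ctl is true, copy b into a; otherwise leave a untouched,
  ## without a data-dependent branch.
  doAssert a.len == b.len
  let mask = 0'u64 - uint64(ord(ctl))   # all-ones if ctl, all-zeros otherwise
  for i in 0 ..< a.len:
    a[i] = a[i] xor (mask and (a[i] xor b[i]))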

# Addition
# ------------------------------------------------------------

macro add_gen[N: static int](carry: var Carry, r_PIR: var Limbs[N], a_PIR, b_PIR: Limbs[N]): untyped =
## Generate an optimized out-of-place addition kernel
result = newStmtList()

var ctx = init(Assembler_arm64, BaseType)
let
# MemOffsettable is the better constraint but
# with ARM64 we cannot generate array offsets from it due to inline ASM auto-bracketings
r = asmArray(r_PIR, N, PointerInReg, asmInput, memIndirect = memWrite)
a = asmArray(a_PIR, N, PointerInReg, asmInput, memIndirect = memRead)
b = asmArray(b_PIR, N, PointerInReg, asmInput, memIndirect = memRead)

u0Sym = ident"u0"
u1Sym = ident"u1"
v0Sym = ident"v0"
v1Sym = ident"v1"

var # Swappable registers to break dependency chains
u0 = asmValue(u0Sym, Reg, asmOutputEarlyClobber)
u1 = asmValue(u1Sym, Reg, asmOutputEarlyClobber)
v0 = asmValue(v0Sym, Reg, asmOutputEarlyClobber)
v1 = asmValue(v1Sym, Reg, asmOutputEarlyClobber)

# Prologue
result.add quote do:
var `u0sym`{.noinit.}, `u1sym`{.noinit.}: BaseType
var `v0sym`{.noinit.}, `v1sym`{.noinit.}: BaseType

# Algorithm
if N >= 2:
ctx.ldp u0, u1, a[0]
ctx.ldp v0, v1, b[0]
else:
ctx.ldr u0, a[0]
ctx.ldr v0, b[0]

for i in 0 ..< N:
if i == 0:
ctx.adds u0, u0, v0
else:
ctx.adcs u0, u0, v0
ctx.str u0, r[i]

# Next iteration
if i != N-1:
swap(u0, u1)
swap(v0, v1)
if i+2 < N:
ctx.ldr u1, a[i+2]
ctx.ldr v1, b[i+2]
Review comment (Owner Author): For now this uses fancy prefetching, but it is unclear whether that is beneficial on Apple Silicon (which can fetch/decode up to 8 instructions per cycle) or on the Raspberry Pi 5.


ctx.setOutputToFlag(carry, CarryFlag)

# Codegen
result.add ctx.generate()

func add_asm*(r: var Limbs, a, b: Limbs): Carry =
## Constant-time addition
add_gen(result, r, a, b)
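
Aside (illustrative, not part of the diff): functionally, add_asm is a full-width limb-by-limb addition with carry propagation. A reference version in plain Nim with explicit carry tracking, not claimed constant-time:

proc addRef(r: var openArray[uint64], a, b: openArray[uint64]): uint64 =
  ## r <- a + b limb by limb; returns the final carry (0 or 1).
  doAssert r.len == a.len and r.len == b.len
  var carry = 0'u64
  for i in 0 ..< r.len:
    let s = a[i] + b[i] + carry                      # wraps modulo 2^64
    # carry out iff the exact sum did not fit in 64 bits
    carry = uint64(ord(s < a[i] or (s == a[i] and carry == 1'u64)))
    r[i] = s
  result = carry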

# Subtraction
# ------------------------------------------------------------

macro sub_gen[N: static int](borrow: var Borrow, r_PIR: var Limbs[N], a_PIR, b_PIR: Limbs[N]): untyped =
## Generate an optimized out-of-place subtraction kernel
result = newStmtList()

var ctx = init(Assembler_arm64, BaseType)
let
# MemOffsettable is the better constraint but
# with ARM64 we cannot generate array offsets from it due to inline ASM auto-bracketings
r = asmArray(r_PIR, N, PointerInReg, asmInput, memIndirect = memWrite)
a = asmArray(a_PIR, N, PointerInReg, asmInput, memIndirect = memRead)
b = asmArray(b_PIR, N, PointerInReg, asmInput, memIndirect = memRead)

u0Sym = ident"u0"
u1Sym = ident"u1"
v0Sym = ident"v0"
v1Sym = ident"v1"

var # Swappable registers to break dependency chains
u0 = asmValue(u0Sym, Reg, asmOutputEarlyClobber)
u1 = asmValue(u1Sym, Reg, asmOutputEarlyClobber)
v0 = asmValue(v0Sym, Reg, asmOutputEarlyClobber)
v1 = asmValue(v1Sym, Reg, asmOutputEarlyClobber)

# Prologue
result.add quote do:
var `u0sym`{.noinit.}, `u1sym`{.noinit.}: BaseType
var `v0sym`{.noinit.}, `v1sym`{.noinit.}: BaseType

# Algorithm
if N >= 2:
ctx.ldp u0, u1, a[0]
ctx.ldp v0, v1, b[0]
else:
ctx.ldr u0, a[0]
ctx.ldr v0, b[0]

for i in 0 ..< N:
if i == 0:
ctx.subs u0, u0, v0
else:
ctx.sbcs u0, u0, v0
ctx.str u0, r[i]

# Next iteration
if i != N-1:
swap(u0, u1)
swap(v0, v1)
if i+2 < N:
ctx.ldr u1, a[i+2]
ctx.ldr v1, b[i+2]
Review comment (Owner Author): For now this uses fancy prefetching, but it is unclear whether that is beneficial on Apple Silicon (which can fetch/decode up to 8 instructions per cycle) or on the Raspberry Pi 5.


ctx.setOutputToFlag(borrow, BorrowFlag)

# Codegen
result.add ctx.generate()

func sub_asm*(r: var Limbs, a, b: Limbs): Borrow =
## Constant-time subtraction
sub_gen(result, r, a, b)
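
Aside (illustrative, not part of the diff): the matching reference for sub_asm, with explicit borrow propagation, again not claimed constant-time:

proc subRef(r: var openArray[uint64], a, b: openArray[uint64]): uint64 =
  ## r <- a - b limb by limb; returns the final borrow (0 or 1).
  doAssert r.len == a.len and r.len == b.len
  var borrow = 0'u64
  for i in 0 ..< r.len:
    let d = a[i] - b[i] - borrow                     # wraps modulo 2^64
    # borrow out iff b[i] plus the incoming borrow exceeded a[i]
    borrow = uint64(ord(a[i] < b[i] or (a[i] == b[i] and borrow == 1'u64)))
    r[i] = d
  result = borrow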
@@ -22,6 +22,7 @@ static: doAssert UseASM_X86_32

# Copy
# ------------------------------------------------------------

macro ccopy_gen[N: static int](a_PIR: var Limbs[N], b_MEM: Limbs[N], ctl: SecretBool): untyped =
## Generate an optimized conditional copy kernel
result = newStmtList()
@@ -108,11 +109,11 @@ func add_asm*(r: var Limbs, a, b: Limbs): Carry =
## Constant-time addition
add_gen(result, r, a, b)

# Substraction
# Subtraction
# ------------------------------------------------------------

macro sub_gen[N: static int](borrow: var Borrow, r_PIR: var Limbs[N], a_MEM, b_MEM: Limbs[N]): untyped =
## Generate an optimized out-of-place substraction kernel
## Generate an optimized out-of-place subtraction kernel

result = newStmtList()

@@ -150,5 +151,5 @@ macro sub_gen[N: static int](borrow: var Borrow, r_PIR: var Limbs[N], a_MEM, b_M
result.add ctx.generate()

func sub_asm*(r: var Limbs, a, b: Limbs): Borrow =
## Constant-time substraction
## Constant-time subtraction
sub_gen(result, r, a, b)