Arm64 assembly #513

Merged (12 commits) on Jan 8, 2025
9 changes: 0 additions & 9 deletions benchmarks-threadpool/wtime.h
@@ -28,15 +28,6 @@ static inline double Wtime_usec(void)
return tv.tv_sec * 1e6 + tv.tv_usec;
}

// Read time stamp counter on x86
static inline unsigned long long readtsc(void)
{
unsigned int lo, hi;
// RDTSC copies contents of 64-bit TSC into EDX:EAX
asm volatile ("rdtsc" : "=a" (lo), "=d" (hi));
return (unsigned long long)hi << 32 | lo;
}

#define WTIME_unique_var_name_paste(id, n) id ## n
#define WTIME_unique_var_name(id, n) WTIME_unique_var_name_paste(id, n)
#define WTIME_unique_var(id) WTIME_unique_var_name(id, __LINE__)
16 changes: 8 additions & 8 deletions constantine.nimble
@@ -240,10 +240,10 @@ proc genDynamicLib(outdir, nimcache: string) =
compile "constantine.dll"

elif defined(macosx) or defined(macos):
compile "libconstantine.dylib.arm", "--cpu:arm64 -l:'-target arm64-apple-macos11' -t:'-target arm64-apple-macos11'"
compile "libconstantine.dylib.x64", "--cpu:amd64 -l:'-target x86_64-apple-macos10.12' -t:'-target x86_64-apple-macos10.12'"
exec &"lipo {outdir}/libconstantine.dylib.arm " &
&" {outdir}/libconstantine.dylib.x64 " &
compile "libconstantine.arm64.dylib", "--cpu:arm64 -l:'-target arm64-apple-macos11' -t:'-target arm64-apple-macos11'"
compile "libconstantine.x86_64.dylib", "--cpu:amd64 -l:'-target x86_64-apple-macos10.12' -t:'-target x86_64-apple-macos10.12'"
exec &"lipo {outdir}/libconstantine.arm64.dylib " &
&" {outdir}/libconstantine.x86_64.dylib " &
&" -output {outdir}/libconstantine.dylib -create"

else:
@@ -272,10 +272,10 @@ proc genStaticLib(outdir, nimcache: string, extFlags = "") =
compile "constantine.lib"

elif defined(macosx) or defined(macos):
compile "libconstantine.a.arm", "--cpu:arm64 -l:'-target arm64-apple-macos11' -t:'-target arm64-apple-macos11'"
compile "libconstantine.a.x64", "--cpu:amd64 -l:'-target x86_64-apple-macos10.12' -t:'-target x86_64-apple-macos10.12'"
exec &"lipo {outdir}/libconstantine.a.arm " &
&" {outdir}/libconstantine.a.x64 " &
compile "libconstantine.arm64.a", "--cpu:arm64 -l:'-target arm64-apple-macos11' -t:'-target arm64-apple-macos11'"
compile "libconstantine.x86_64.a", "--cpu:amd64 -l:'-target x86_64-apple-macos10.12' -t:'-target x86_64-apple-macos10.12'"
exec &"lipo {outdir}/libconstantine.arm64.a " &
&" {outdir}/libconstantine.x86_64.a " &
&" -output {outdir}/libconstantine.a -create"

else:
2 changes: 1 addition & 1 deletion constantine/hashes/sha256/sha256_x86_shaext.nim
@@ -7,7 +7,7 @@
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
constantine/platforms/x86/simd_x86,
constantine/platforms/isa_x86/simd_x86,
constantine/platforms/primitives,
./sha256_generic

2 changes: 1 addition & 1 deletion constantine/hashes/sha256/sha256_x86_ssse3.nim
@@ -7,7 +7,7 @@
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
constantine/platforms/x86/simd_x86,
constantine/platforms/isa_x86/simd_x86,
constantine/platforms/primitives,
./sha256_generic

212 changes: 212 additions & 0 deletions constantine/math/arithmetic/assembly/limbs_asm_bigint_arm64.nim
@@ -0,0 +1,212 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
# Standard library
std/macros,
# Internal
constantine/platforms/abstractions

# ############################################################
#
# Assembly implementation of bigints
#
# ############################################################

static: doAssert UseASM_ARM64

# Copy
# ------------------------------------------------------------

macro ccopy_gen[N: static int](a_PIR: var Limbs[N], b_PIR: Limbs[N], ctl: SecretBool): untyped =
## Generate an optimized conditional copy kernel
result = newStmtList()

var ctx = init(Assembler_arm64, BaseType)

let
# MemOffsettable is the better constraint but
# with ARM64 we cannot generate array offsets from it due to inline ASM auto-bracketings
a = asmArray(a_PIR, N, PointerInReg, asmInput, memIndirect = memReadWrite)
b = asmArray(b_PIR, N, PointerInReg, asmInput, memIndirect = memRead)

control = asmValue(ctl, Reg, asmInput)

u0Sym = ident"u0"
u1Sym = ident"u1"
v0Sym = ident"v0"
v1Sym = ident"v1"

var # Swappable registers to break dependency chains
u0 = asmValue(u0Sym, Reg, asmOutputEarlyClobber)
u1 = asmValue(u1Sym, Reg, asmOutputEarlyClobber)
v0 = asmValue(v0Sym, Reg, asmOutputEarlyClobber)
v1 = asmValue(v1Sym, Reg, asmOutputEarlyClobber)

# Prologue
result.add quote do:
var `u0sym`{.noinit.}, `u1sym`{.noinit.}: BaseType
var `v0sym`{.noinit.}, `v1sym`{.noinit.}: BaseType

# Algorithm
if N >= 2:
ctx.ldp u0, u1, a[0]
ctx.ldp v0, v1, b[0]
else:
ctx.ldr u0, a[0]
ctx.ldr v0, b[0]

# Algorithm
ctx.cmp control, xzr # Check vs 0
for i in 0 ..< N:
ctx.csel u0, u0, v0, eq # Don't modify if eq 0
ctx.str u0, a[i]

# Next iteration
if i != N-1:
swap(u0, u1)
swap(v0, v1)
if i+2 < N:
ctx.ldr u1, a[i+2]
ctx.ldr v1, b[i+2]
Review comment (Owner Author): For now this uses fancy prefetching, but it is unclear whether that is beneficial on Apple Silicon (which can fetch/decode up to 8 instructions per cycle) or on the Raspberry Pi 5.
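
Aside (illustrative, not part of the diff): the swap/reload pattern above, repeated in the addition and subtraction kernels below, is two-deep software pipelining: u1/v1 always hold the limb pair that the next iteration will consume. A plain-Nim sketch of the same schedule, with a hypothetical use callback standing in for the csel/str (or adds/str, sbcs/str) step:

proc pipelinedWalk(a, b: openArray[uint64],
                   use: proc (x, y: uint64)) =
  ## Walks two limb arrays with a two-deep software pipeline:
  ## u1/v1 are loaded one iteration ahead of the pair being consumed.
  doAssert a.len >= 1 and b.len == a.len
  let n = a.len
  var u0 = a[0]
  var v0 = b[0]
  var u1, v1: uint64
  if n >= 2:
    u1 = a[1]              # prefetch the next limb pair
    v1 = b[1]
  for i in 0 ..< n:
    use(u0, v0)            # the csel/str (or adds/str, sbcs/str) step
    if i != n-1:
      swap(u0, u1)         # the prefetched pair becomes the current one
      swap(v0, v1)
      if i+2 < n:
        u1 = a[i+2]        # keep loading two limbs ahead
        v1 = b[i+2]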


# Codegen
result.add ctx.generate()

func ccopy_asm*(a: var Limbs, b: Limbs, ctl: SecretBool) =
## Constant-time conditional copy
## If ctl is true: b is copied into a
## if ctl is false: b is not copied and a is untouched
## Time and memory accesses are the same whether a copy occurs or not
ccopy_gen(a, b, ctl)
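
Aside (illustrative, not part of the diff): the kernel above computes the classic masked select. A minimal portable sketch of the same semantics in plain Nim; this is not Constantine's actual generic fallback and is not audited for constant-time code generation:

proc ccopyRef(a: var openArray[uint64], b: openArray[uint64], ctl: bool) =
  ## If ctl is true, copy b into a; otherwise leave a untouched,
  ## without a data-dependent branch.
  doAssert a.len == b.len
  let mask = 0'u64 - uint64(ord(ctl))   # all-ones if ctl, all-zeros otherwise
  for i in 0 ..< a.len:
    a[i] = a[i] xor (mask and (a[i] xor b[i]))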

# Addition
# ------------------------------------------------------------

macro add_gen[N: static int](carry: var Carry, r_PIR: var Limbs[N], a_PIR, b_PIR: Limbs[N]): untyped =
## Generate an optimized out-of-place addition kernel
result = newStmtList()

var ctx = init(Assembler_arm64, BaseType)
let
# MemOffsettable is the better constraint but
# with ARM64 we cannot generate array offsets from it due to inline ASM auto-bracketings
r = asmArray(r_PIR, N, PointerInReg, asmInput, memIndirect = memWrite)
a = asmArray(a_PIR, N, PointerInReg, asmInput, memIndirect = memRead)
b = asmArray(b_PIR, N, PointerInReg, asmInput, memIndirect = memRead)

u0Sym = ident"u0"
u1Sym = ident"u1"
v0Sym = ident"v0"
v1Sym = ident"v1"

var # Swappable registers to break dependency chains
u0 = asmValue(u0Sym, Reg, asmOutputEarlyClobber)
u1 = asmValue(u1Sym, Reg, asmOutputEarlyClobber)
v0 = asmValue(v0Sym, Reg, asmOutputEarlyClobber)
v1 = asmValue(v1Sym, Reg, asmOutputEarlyClobber)

# Prologue
result.add quote do:
var `u0sym`{.noinit.}, `u1sym`{.noinit.}: BaseType
var `v0sym`{.noinit.}, `v1sym`{.noinit.}: BaseType

# Algorithm
if N >= 2:
ctx.ldp u0, u1, a[0]
ctx.ldp v0, v1, b[0]
else:
ctx.ldr u0, a[0]
ctx.ldr v0, b[0]

for i in 0 ..< N:
if i == 0:
ctx.adds u0, u0, v0
else:
ctx.adcs u0, u0, v0
ctx.str u0, r[i]

# Next iteration
if i != N-1:
swap(u0, u1)
swap(v0, v1)
if i+2 < N:
ctx.ldr u1, a[i+2]
ctx.ldr v1, b[i+2]
Review comment (Owner Author): For now this uses fancy prefetching, but it is unclear whether that is beneficial on Apple Silicon (which can fetch/decode up to 8 instructions per cycle) or on the Raspberry Pi 5.


ctx.setOutputToFlag(carry, CarryFlag)

# Codegen
result.add ctx.generate()

func add_asm*(r: var Limbs, a, b: Limbs): Carry =
## Constant-time addition
add_gen(result, r, a, b)
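
Aside (illustrative, not part of the diff): functionally, add_asm is a full-width limb-by-limb addition with carry propagation. A reference version in plain Nim with explicit carry tracking, not claimed constant-time:

proc addRef(r: var openArray[uint64], a, b: openArray[uint64]): uint64 =
  ## r <- a + b limb by limb; returns the final carry (0 or 1).
  doAssert r.len == a.len and r.len == b.len
  var carry = 0'u64
  for i in 0 ..< r.len:
    let s = a[i] + b[i] + carry                      # wraps modulo 2^64
    # carry out iff the exact sum did not fit in 64 bits
    carry = uint64(ord(s < a[i] or (s == a[i] and carry == 1'u64)))
    r[i] = s
  result = carry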

# Subtraction
# ------------------------------------------------------------

macro sub_gen[N: static int](borrow: var Borrow, r_PIR: var Limbs[N], a_PIR, b_PIR: Limbs[N]): untyped =
## Generate an optimized out-of-place subtraction kernel
result = newStmtList()

var ctx = init(Assembler_arm64, BaseType)
let
# MemOffsettable is the better constraint but
# with ARM64 we cannot generate array offsets from it due to inline ASM auto-bracketings
r = asmArray(r_PIR, N, PointerInReg, asmInput, memIndirect = memWrite)
a = asmArray(a_PIR, N, PointerInReg, asmInput, memIndirect = memRead)
b = asmArray(b_PIR, N, PointerInReg, asmInput, memIndirect = memRead)

u0Sym = ident"u0"
u1Sym = ident"u1"
v0Sym = ident"v0"
v1Sym = ident"v1"

var # Swappable registers to break dependency chains
u0 = asmValue(u0Sym, Reg, asmOutputEarlyClobber)
u1 = asmValue(u1Sym, Reg, asmOutputEarlyClobber)
v0 = asmValue(v0Sym, Reg, asmOutputEarlyClobber)
v1 = asmValue(v1Sym, Reg, asmOutputEarlyClobber)

# Prologue
result.add quote do:
var `u0sym`{.noinit.}, `u1sym`{.noinit.}: BaseType
var `v0sym`{.noinit.}, `v1sym`{.noinit.}: BaseType

# Algorithm
if N >= 2:
ctx.ldp u0, u1, a[0]
ctx.ldp v0, v1, b[0]
else:
ctx.ldr u0, a[0]
ctx.ldr v0, b[0]

for i in 0 ..< N:
if i == 0:
ctx.subs u0, u0, v0
else:
ctx.sbcs u0, u0, v0
ctx.str u0, r[i]

# Next iteration
if i != N-1:
swap(u0, u1)
swap(v0, v1)
if i+2 < N:
ctx.ldr u1, a[i+2]
ctx.ldr v1, b[i+2]
Review comment (Owner Author): For now this uses fancy prefetching, but it is unclear whether that is beneficial on Apple Silicon (which can fetch/decode up to 8 instructions per cycle) or on the Raspberry Pi 5.


ctx.setOutputToFlag(borrow, BorrowFlag)

# Codegen
result.add ctx.generate()

func sub_asm*(r: var Limbs, a, b: Limbs): Borrow =
## Constant-time subtraction
sub_gen(result, r, a, b)
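
Aside (illustrative, not part of the diff): the matching reference for sub_asm, with explicit borrow propagation, again not claimed constant-time:

proc subRef(r: var openArray[uint64], a, b: openArray[uint64]): uint64 =
  ## r <- a - b limb by limb; returns the final borrow (0 or 1).
  doAssert r.len == a.len and r.len == b.len
  var borrow = 0'u64
  for i in 0 ..< r.len:
    let d = a[i] - b[i] - borrow                     # wraps modulo 2^64
    # borrow out iff b[i] plus the incoming borrow exceeded a[i]
    borrow = uint64(ord(a[i] < b[i] or (a[i] == b[i] and borrow == 1'u64)))
    r[i] = d
  result = borrow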
@@ -22,6 +22,7 @@ static: doAssert UseASM_X86_32

# Copy
# ------------------------------------------------------------

macro ccopy_gen[N: static int](a_PIR: var Limbs[N], b_MEM: Limbs[N], ctl: SecretBool): untyped =
## Generate an optimized conditional copy kernel
result = newStmtList()
@@ -108,11 +109,11 @@ func add_asm*(r: var Limbs, a, b: Limbs): Carry =
## Constant-time addition
add_gen(result, r, a, b)

# Substraction
# Subtraction
# ------------------------------------------------------------

macro sub_gen[N: static int](borrow: var Borrow, r_PIR: var Limbs[N], a_MEM, b_MEM: Limbs[N]): untyped =
## Generate an optimized out-of-place substraction kernel
## Generate an optimized out-of-place subtraction kernel

result = newStmtList()

@@ -150,5 +151,5 @@ macro sub_gen[N: static int](borrow: var Borrow, r_PIR: var Limbs[N], a_MEM, b_M
result.add ctx.generate()

func sub_asm*(r: var Limbs, a, b: Limbs): Borrow =
## Constant-time substraction
## Constant-time subtraction
sub_gen(result, r, a, b)