feat: precomp sqrt optimization

mratsim · Feb 2, 2024 · 2bfd4e6 · 2bfd4e6
1 parent 58d8d2c
commit 2bfd4e6
Show file tree

Hide file tree

Showing 5 changed files with 1,694 additions and 2 deletions.
diff --git a/benchmarks/bench_verkle_primitives.nim b/benchmarks/bench_verkle_primitives.nim
@@ -0,0 +1,144 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+# ############################################################
+#
+#             Summary of the performance of a curve
+#
+# ############################################################
+
+import
+  # Internals
+  ../constantine/platforms/abstractions,
+  ../constantine/math/config/curves,
+  ../constantine/math/[arithmetic, extension_fields],
+  ../constantine/math/elliptic/[
+    ec_shortweierstrass_affine,
+    ec_shortweierstrass_projective,
+    ec_shortweierstrass_jacobian,
+    ec_scalar_mul, ec_endomorphism_accel,
+    ec_twistededwards_affine,
+    ec_twistededwards_projective],
+  ../constantine/math/constants/zoo_generators,
+  ../constantine/math/pairings/[
+    cyclotomic_subgroups,
+    pairings_bls12,
+    pairings_bn
+  ],
+  ../constantine/math/io/io_fields,
+  ../constantine/math/constants/zoo_pairings,
+  ../constantine/hashes,
+  ../constantine/hash_to_curve/hash_to_curve,
+  ../constantine/serialization/[
+    codecs_status_codes, 
+    codecs_banderwagon
+  ],
+  # Helpers
+  ../helpers/prng_unsafe,
+  ./bench_blueprint
+
+export
+  ec_shortweierstrass_projective,
+  ec_shortweierstrass_jacobian
+
+export abstractions # generic sandwich on SecretBool and SecretBool in Jacobian sum
+export zoo_pairings # generic sandwich https://github.com/nim-lang/Nim/issues/11225
+export notes
+
+type
+  Prj* = ECP_TwEdwards_Prj[Fp[Banderwagon]]
+  Aff* = ECP_TwEdwards_Prj[Fp[Banderwagon]]
+
+const Iters = 10000
+
+proc separator*() = separator(152)
+
+proc report(op, domain: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
+  let ns = inNanoseconds((stop-start) div iters)
+  let throughput = 1e9 / float64(ns)
+  when SupportsGetTicks:
+    echo &"{op:<35} {domain:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
+  else:
+    echo &"{op:<35} {domain:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
+
+macro fixEllipticDisplay(T: typedesc): untyped =
+  # At compile-time, enums are integers and their display is buggy
+  # we get the Curve ID instead of the curve name.
+  let instantiated = T.getTypeInst()
+  var name = $instantiated[1][0] # EllipticEquationFormCoordinates
+  let fieldName = $instantiated[1][1][0]
+  let curveName = $Curve(instantiated[1][1][1].intVal)
+  name.add "[" & fieldName & "[" & curveName & "]]"
+  result = newLit name
+
+macro fixFieldDisplay(T: typedesc): untyped =
+  # At compile-time, enums are integers and their display is buggy
+  # we get the Curve ID instead of the curve name.
+  let instantiated = T.getTypeInst()
+  var name = $instantiated[1][0] # Fp
+  name.add "[" & $Curve(instantiated[1][1].intVal) & "]"
+  result = newLit name
+
+func fixDisplay(T: typedesc): string =
+  when T is (ECP_ShortW_Prj or ECP_ShortW_Jac or ECP_ShortW_Aff):
+    fixEllipticDisplay(T)
+  elif T is (Fp or Fp2 or Fp4 or Fp6 or Fp12):
+    fixFieldDisplay(T)
+  else:
+    $T
+
+func fixDisplay(T: Curve): string =
+  $T
+
+template bench(op: string, T: typed, iters: int, body: untyped): untyped =
+  measure(iters, startTime, stopTime, startClk, stopClk, body)
+  report(op, fixDisplay(T), startTime, stopTime, startClk, stopClk, iters)
+
+# proc equalityBench*(T: typedesc, iters: int) =
+#   when T is Aff:
+#     let P = Banderwagon.getGenerator()
+#     let Q = Banderwagon.getGenerator()
+#   else:
+#     var P, Q: Prj
+#     P.fromAffine(Banderwagon.getGenerator())
+#     Q.fromAffine(Banderwagon.getGenerator())
+#   bench("Banderwagon Equality ", T, iters):
+#     assert (P == Q).bool()
+
+
+
+proc serializaBench*(T: typedesc, iters: int) =
+  var bytes: array[32, byte]
+  var P: Prj
+  P.fromAffine(Banderwagon.getGenerator())
+  for i in 0 ..< 9:
+    P.double()
+  bench("Banderwagon Serialization", T, iters):
+    discard bytes.serialize(P)
+
+proc deserializeBench*(T: typedesc, iters: int) =
+  var bytes: array[32, byte]
+  var P: Prj
+  P.fromAffine(Banderwagon.getGenerator())
+  for i in 0 ..< 6:
+    P.double()
+  discard bytes.serialize(P)
+  bench("Banderwagon Deserialization", T, iters):
+    discard P.deserialize(bytes)
+
+
+proc main() =
+  # separator()
+  # equalityBench(Aff, Iters)
+  # equalityBench(Prj, Iters)
+  separator()
+  serializaBench(Prj, Iters)
+  deserializeBench(Prj, Iters)
+
+main()
+notes()
diff --git a/constantine.nimble b/constantine.nimble
@@ -613,7 +613,8 @@ const benchDesc = [
   "bench_ethereum_eip4844_kzg",
   "bench_evm_modexp_dos",
   "bench_gmp_modexp",
-  "bench_gmp_modmul"
+  "bench_gmp_modmul",
+  "bench_verkle_primitives"
 ]
 
 # For temporary (hopefully) investigation that can only be reproduced in CI
@@ -1036,3 +1037,6 @@ task bench_ethereum_bls_signatures, "Run Ethereum BLS signatures benchmarks - CC
 # ------------------------------------------
 task bench_ethereum_eip4844_kzg, "Run Ethereum EIP4844 KZG Polynomial commitment - CC compiler":
   runBench("bench_ethereum_eip4844_kzg")
+
+task bench_verkle, "Run benchmarks for Banderwagon":
+  runBench("bench_verkle_primitives")
diff --git a/constantine/math/arithmetic/finite_fields_precomp_square_root.nim b/constantine/math/arithmetic/finite_fields_precomp_square_root.nim
@@ -0,0 +1,198 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  std/tables,
+  ../../platforms/abstractions,
+  ../constants/zoo_square_roots,
+  ./bigints, ./finite_fields
+
+# ############################################################
+#
+#            Only For Bandersnatch/Banderwagon for now
+#
+# ############################################################
+
+# sqrtAlg_NegDlogInSmallDyadicSubgroup takes a (not necessarily primitive) root of unity x of order 2^sqrtParam_BlockSize.
+# x has the form sqrtPrecomp_ReconstructionDyadicRoot^a and returns its negative dlog -a.
+#
+# The returned value is only meaningful modulo 1<<sqrtParam_BlockSize and is fully reduced, i.e. in [0, 1<<sqrtParam_BlockSize )
+#
+# NOTE: If x is not a root of unity as asserted, the behaviour is undefined.
+func sqrtAlg_NegDlogInSmallDyadicSubgroup*(x: Fp): int =
+  let key = cast[int](x.mres.limbs[0] and SecretWord 0xFFFF)
+  if key in Fp.C.tonelliShanks(sqrtPrecomp_dlogLUT):
+    return Fp.C.tonelliShanks(sqrtPrecomp_dlogLUT)[key]
+  return 0
+
+# sqrtAlg_GetPrecomputedRootOfUnity sets target to g^(multiplier << (order * sqrtParam_BlockSize)), where g is the fixed primitive 2^32th root of unity.
+#
+# We assume that order 0 <= order*sqrtParam_BlockSize <= 32 and that multiplier is in [0, 1 <<sqrtParam_BlockSize)
+func sqrtAlg_GetPrecomputedRootOfUnity*(target: var Fp, multiplier: int, order: uint) =
+  target = Fp.C.tonelliShanks(sqrtPrecomp_PrecomputedBlocks)[order][multiplier]
+
+
+func sqrtAlg_ComputerRelevantPowers*(z: var Fp, squareRootCandidate: var Fp, rootOfUnity: var Fp) =
+  ## sliding window-type algorithm with window-size 5
+  ## Note that we precompute and use z^255 multiple times (even though it's not size 5)
+  ## and some windows actually overlap
+  var z2, z3, z7, z6, z9, z11, z13, z19, z21, z25, z27, z29, z31, z255 {.noInit.} : Fp
+  var acc: Fp
+  z2.square(z)
+  z3.prod(z2, z)
+  z6.prod(z3, z3)
+  z7.prod(z6, z)
+  z9.prod(z7, z2)
+  z11.prod(z9, z2)
+  z13.prod(z11, z2)
+  z19.prod(z13, z6)
+  z21.prod(z19, z2)
+  z25.prod(z19, z6)
+  z27.prod(z25, z2)
+  z29.prod(z27, z2)
+  z31.prod(z29, z2)
+  acc.prod(z27, z29)
+  acc.prod(acc, acc)
+  acc.prod(acc, acc)
+  z255.prod(acc, z31)
+  acc.prod(acc, acc)
+  acc.prod(acc, acc)
+  acc.prod(acc, z31)
+  square_repeated(acc, 6)
+  acc.prod(acc, z27)
+  square_repeated(acc, 6)
+  acc.prod(acc, z19)
+  square_repeated(acc, 5)
+  acc.prod(acc, z21)
+  square_repeated(acc, 7)
+  acc.prod(acc, z25)
+  square_repeated(acc, 6)
+  acc.prod(acc, z19)
+  square_repeated(acc, 5)
+  acc.prod(acc, z7)
+  square_repeated(acc, 5)
+  acc.prod(acc, z11)
+  square_repeated(acc, 5)
+  acc.prod(acc, z29)
+  square_repeated(acc, 5)
+  acc.prod(acc, z9)
+  square_repeated(acc, 7)
+  acc.prod(acc, z3)
+  square_repeated(acc, 7)
+  acc.prod(acc, z25)
+  square_repeated(acc, 5)
+  acc.prod(acc, z25)
+  square_repeated(acc, 5)
+  acc.prod(acc, z27)
+  square_repeated(acc, 8)
+  acc.prod(acc, z)
+  square_repeated(acc, 8)
+  acc.prod(acc, z)
+  square_repeated(acc, 6)
+  acc.prod(acc, z13)
+  square_repeated(acc, 7)
+  acc.prod(acc, z7)
+  square_repeated(acc, 3)
+  acc.prod(acc, z3)
+  square_repeated(acc, 13)
+  acc.prod(acc, z21)
+  square_repeated(acc, 5)
+  acc.prod(acc, z9)
+  square_repeated(acc, 5)
+  acc.prod(acc, z27)
+  square_repeated(acc, 5)
+  acc.prod(acc, z27)
+  square_repeated(acc, 5)
+  acc.prod(acc, z9)
+  square_repeated(acc, 10)
+  acc.prod(acc, z)
+  square_repeated(acc, 7)
+  acc.prod(acc, z255)
+  square_repeated(acc, 8)
+  acc.prod(acc, z255)
+  square_repeated(acc, 6)
+  acc.prod(acc, z11)
+  square_repeated(acc, 9)
+  acc.prod(acc, z255)
+  square_repeated(acc, 2)
+  acc.prod(acc, z)
+  square_repeated(acc, 7)
+  acc.prod(acc, z255)
+  square_repeated(acc, 8)
+  acc.prod(acc, z255)
+  square_repeated(acc, 8)
+  acc.prod(acc, z255)
+  square_repeated(acc, 8)
+  acc.prod(acc, z255)
+  # acc is now z^((BaseFieldMultiplicativeOddOrder - 1)/2)
+  rootOfUnity.square(acc)
+  rootOfUnity.prod(rootOfUnity ,z)
+  squareRootCandidate.prod(acc, z)
+
+
+func invSqrtEqDyadic*(z: var Fp): bool =
+  ## The algorithm works by essentially computing the dlog of z and then halving it.
+  ## negExponent is intended to hold the negative of the dlog of z.
+  ## We determine this 32-bit value (usually) _sqrtBlockSize many bits at a time, starting with the least-significant bits.
+  ##
+  ## If _sqrtBlockSize does not divide 32, the *first* iteration will determine fewer bits.
+
+  var negExponent: int
+  var temp, temp2: Fp
+
+  # set powers[i] to z^(1<< (i*blocksize))
+  var powers: array[4, Fp]
+  powers[0] = z
+  for i in 1..<Fp.C.tonelliShanks(sqrtParam_Blocks):
+    powers[i] = powers[i - 1]
+    for j in 0..<Fp.C.tonelliShanks(sqrtParam_BlockSize):
+      powers[i].square(powers[i])
+
+  ## looking at the dlogs, powers[i] is essentially the wanted exponent, left-shifted by i*_sqrtBlockSize and taken mod 1<<32
+  ## dlogHighDyadicRootNeg essentially (up to sign) reads off the _sqrtBlockSize many most significant bits. (returned as low-order bits)
+  ## 
+  ## first iteration may be slightly special if BlockSize does not divide 32
+  negExponent = sqrtAlg_NegDlogInSmallDyadicSubgroup(powers[Fp.C.tonelliShanks(sqrtParam_Blocks) - 1])
+  negExponent = negExponent shr Fp.C.tonelliShanks(sqrtParam_FirstBlockUnusedBits)
+
+  # if the exponent we just got is odd, there is no square root, no point in determining the other bits
+  if (negExponent and 1) == 1:
+    return false
+
+  for i in 1..<Fp.C.tonelliShanks(sqrtParam_Blocks):
+    temp2 = powers[Fp.C.tonelliShanks(sqrtParam_Blocks) - 1 - i]
+    for j in 0..<i:
+      sqrtAlg_GetPrecomputedRootOfUnity(temp, int( (negExponent shr (j*Fp.C.tonelliShanks(sqrtParam_BlockSize))) and Fp.C.tonelliShanks(sqrtParam_BitMask) ), uint(j + Fp.C.tonelliShanks(sqrtParam_Blocks) - 1 - i))
+      temp2.prod(temp2, temp)
+
+    var newBits = sqrtAlg_NegDlogInSmallDyadicSubgroup(temp2)
+    negExponent = negExponent or (newBits shl ((i*Fp.C.tonelliShanks(sqrtParam_BlockSize)) - Fp.C.tonelliShanks(sqrtParam_FirstBlockUnusedBits)))
+
+  negExponent = negExponent shr 1
+  z.setOne()
+
+  for i in 0..<Fp.C.tonelliShanks(sqrtParam_Blocks):
+    sqrtAlg_GetPrecomputedRootOfUnity(temp, int((negExponent shr (i*Fp.C.tonelliShanks(sqrtParam_BlockSize))) and Fp.C.tonelliShanks(sqrtParam_BitMask)), uint(i))
+    z.prod(z, temp)
+
+  return true
+
+func sqrtPrecomp*(dst: var Fp, x: Fp): SecretBool {.inline.} =
+  dst.setZero()
+  if x.isZero().bool():
+    return SecretBool(true)
+
+  var xCopy: Fp = x
+  var candidate, rootOfUnity: Fp
+  sqrtAlg_ComputerRelevantPowers(xCopy, candidate, rootOfUnity)
+  if not invSqrtEqDyadic(rootOfUnity):
+    return SecretBool(false)
+
+  dst.prod(candidate, rootOfUnity)
+  return SecretBool(true)
+