Skip to content

Commit

Permalink
feat: precomp sqrt optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
advaita-saha committed Feb 2, 2024
1 parent 58d8d2c commit 2bfd4e6
Show file tree
Hide file tree
Showing 5 changed files with 1,694 additions and 2 deletions.
144 changes: 144 additions & 0 deletions benchmarks/bench_verkle_primitives.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

# ############################################################
#
# Summary of the performance of a curve
#
# ############################################################

import
# Internals
../constantine/platforms/abstractions,
../constantine/math/config/curves,
../constantine/math/[arithmetic, extension_fields],
../constantine/math/elliptic/[
ec_shortweierstrass_affine,
ec_shortweierstrass_projective,
ec_shortweierstrass_jacobian,
ec_scalar_mul, ec_endomorphism_accel,
ec_twistededwards_affine,
ec_twistededwards_projective],
../constantine/math/constants/zoo_generators,
../constantine/math/pairings/[
cyclotomic_subgroups,
pairings_bls12,
pairings_bn
],
../constantine/math/io/io_fields,
../constantine/math/constants/zoo_pairings,
../constantine/hashes,
../constantine/hash_to_curve/hash_to_curve,
../constantine/serialization/[
codecs_status_codes,
codecs_banderwagon
],
# Helpers
../helpers/prng_unsafe,
./bench_blueprint

export
ec_shortweierstrass_projective,
ec_shortweierstrass_jacobian

export abstractions # generic sandwich on SecretBool and SecretBool in Jacobian sum
export zoo_pairings # generic sandwich https://github.com/nim-lang/Nim/issues/11225
export notes

type
Prj* = ECP_TwEdwards_Prj[Fp[Banderwagon]]
Aff* = ECP_TwEdwards_Prj[Fp[Banderwagon]]

const Iters = 10000

proc separator*() = separator(152)

proc report(op, domain: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
let ns = inNanoseconds((stop-start) div iters)
let throughput = 1e9 / float64(ns)
when SupportsGetTicks:
echo &"{op:<35} {domain:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
else:
echo &"{op:<35} {domain:<40} {throughput:>15.3f} ops/s {ns:>9} ns/op"

macro fixEllipticDisplay(T: typedesc): untyped =
# At compile-time, enums are integers and their display is buggy
# we get the Curve ID instead of the curve name.
let instantiated = T.getTypeInst()
var name = $instantiated[1][0] # EllipticEquationFormCoordinates
let fieldName = $instantiated[1][1][0]
let curveName = $Curve(instantiated[1][1][1].intVal)
name.add "[" & fieldName & "[" & curveName & "]]"
result = newLit name

macro fixFieldDisplay(T: typedesc): untyped =
# At compile-time, enums are integers and their display is buggy
# we get the Curve ID instead of the curve name.
let instantiated = T.getTypeInst()
var name = $instantiated[1][0] # Fp
name.add "[" & $Curve(instantiated[1][1].intVal) & "]"
result = newLit name

func fixDisplay(T: typedesc): string =
when T is (ECP_ShortW_Prj or ECP_ShortW_Jac or ECP_ShortW_Aff):
fixEllipticDisplay(T)
elif T is (Fp or Fp2 or Fp4 or Fp6 or Fp12):
fixFieldDisplay(T)
else:
$T

func fixDisplay(T: Curve): string =
$T

template bench(op: string, T: typed, iters: int, body: untyped): untyped =
measure(iters, startTime, stopTime, startClk, stopClk, body)
report(op, fixDisplay(T), startTime, stopTime, startClk, stopClk, iters)

# proc equalityBench*(T: typedesc, iters: int) =
# when T is Aff:
# let P = Banderwagon.getGenerator()
# let Q = Banderwagon.getGenerator()
# else:
# var P, Q: Prj
# P.fromAffine(Banderwagon.getGenerator())
# Q.fromAffine(Banderwagon.getGenerator())
# bench("Banderwagon Equality ", T, iters):
# assert (P == Q).bool()



proc serializaBench*(T: typedesc, iters: int) =
var bytes: array[32, byte]
var P: Prj
P.fromAffine(Banderwagon.getGenerator())
for i in 0 ..< 9:
P.double()
bench("Banderwagon Serialization", T, iters):
discard bytes.serialize(P)

proc deserializeBench*(T: typedesc, iters: int) =
var bytes: array[32, byte]
var P: Prj
P.fromAffine(Banderwagon.getGenerator())
for i in 0 ..< 6:
P.double()
discard bytes.serialize(P)
bench("Banderwagon Deserialization", T, iters):
discard P.deserialize(bytes)


proc main() =
# separator()
# equalityBench(Aff, Iters)
# equalityBench(Prj, Iters)
separator()
serializaBench(Prj, Iters)
deserializeBench(Prj, Iters)

main()
notes()
6 changes: 5 additions & 1 deletion constantine.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -613,7 +613,8 @@ const benchDesc = [
"bench_ethereum_eip4844_kzg",
"bench_evm_modexp_dos",
"bench_gmp_modexp",
"bench_gmp_modmul"
"bench_gmp_modmul",
"bench_verkle_primitives"
]

# For temporary (hopefully) investigation that can only be reproduced in CI
Expand Down Expand Up @@ -1036,3 +1037,6 @@ task bench_ethereum_bls_signatures, "Run Ethereum BLS signatures benchmarks - CC
# ------------------------------------------
task bench_ethereum_eip4844_kzg, "Run Ethereum EIP4844 KZG Polynomial commitment - CC compiler":
runBench("bench_ethereum_eip4844_kzg")

task bench_verkle, "Run benchmarks for Banderwagon":
runBench("bench_verkle_primitives")
198 changes: 198 additions & 0 deletions constantine/math/arithmetic/finite_fields_precomp_square_root.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
std/tables,
../../platforms/abstractions,
../constants/zoo_square_roots,
./bigints, ./finite_fields

# ############################################################
#
# Only For Bandersnatch/Banderwagon for now
#
# ############################################################

# sqrtAlg_NegDlogInSmallDyadicSubgroup takes a (not necessarily primitive) root of unity x of order 2^sqrtParam_BlockSize.
# x has the form sqrtPrecomp_ReconstructionDyadicRoot^a and returns its negative dlog -a.
#
# The returned value is only meaningful modulo 1<<sqrtParam_BlockSize and is fully reduced, i.e. in [0, 1<<sqrtParam_BlockSize )
#
# NOTE: If x is not a root of unity as asserted, the behaviour is undefined.
func sqrtAlg_NegDlogInSmallDyadicSubgroup*(x: Fp): int =
let key = cast[int](x.mres.limbs[0] and SecretWord 0xFFFF)
if key in Fp.C.tonelliShanks(sqrtPrecomp_dlogLUT):
return Fp.C.tonelliShanks(sqrtPrecomp_dlogLUT)[key]
return 0

# sqrtAlg_GetPrecomputedRootOfUnity sets target to g^(multiplier << (order * sqrtParam_BlockSize)), where g is the fixed primitive 2^32th root of unity.
#
# We assume that order 0 <= order*sqrtParam_BlockSize <= 32 and that multiplier is in [0, 1 <<sqrtParam_BlockSize)
func sqrtAlg_GetPrecomputedRootOfUnity*(target: var Fp, multiplier: int, order: uint) =
target = Fp.C.tonelliShanks(sqrtPrecomp_PrecomputedBlocks)[order][multiplier]


func sqrtAlg_ComputerRelevantPowers*(z: var Fp, squareRootCandidate: var Fp, rootOfUnity: var Fp) =
## sliding window-type algorithm with window-size 5
## Note that we precompute and use z^255 multiple times (even though it's not size 5)
## and some windows actually overlap
var z2, z3, z7, z6, z9, z11, z13, z19, z21, z25, z27, z29, z31, z255 {.noInit.} : Fp
var acc: Fp
z2.square(z)
z3.prod(z2, z)
z6.prod(z3, z3)
z7.prod(z6, z)
z9.prod(z7, z2)
z11.prod(z9, z2)
z13.prod(z11, z2)
z19.prod(z13, z6)
z21.prod(z19, z2)
z25.prod(z19, z6)
z27.prod(z25, z2)
z29.prod(z27, z2)
z31.prod(z29, z2)
acc.prod(z27, z29)
acc.prod(acc, acc)
acc.prod(acc, acc)
z255.prod(acc, z31)
acc.prod(acc, acc)
acc.prod(acc, acc)
acc.prod(acc, z31)
square_repeated(acc, 6)
acc.prod(acc, z27)
square_repeated(acc, 6)
acc.prod(acc, z19)
square_repeated(acc, 5)
acc.prod(acc, z21)
square_repeated(acc, 7)
acc.prod(acc, z25)
square_repeated(acc, 6)
acc.prod(acc, z19)
square_repeated(acc, 5)
acc.prod(acc, z7)
square_repeated(acc, 5)
acc.prod(acc, z11)
square_repeated(acc, 5)
acc.prod(acc, z29)
square_repeated(acc, 5)
acc.prod(acc, z9)
square_repeated(acc, 7)
acc.prod(acc, z3)
square_repeated(acc, 7)
acc.prod(acc, z25)
square_repeated(acc, 5)
acc.prod(acc, z25)
square_repeated(acc, 5)
acc.prod(acc, z27)
square_repeated(acc, 8)
acc.prod(acc, z)
square_repeated(acc, 8)
acc.prod(acc, z)
square_repeated(acc, 6)
acc.prod(acc, z13)
square_repeated(acc, 7)
acc.prod(acc, z7)
square_repeated(acc, 3)
acc.prod(acc, z3)
square_repeated(acc, 13)
acc.prod(acc, z21)
square_repeated(acc, 5)
acc.prod(acc, z9)
square_repeated(acc, 5)
acc.prod(acc, z27)
square_repeated(acc, 5)
acc.prod(acc, z27)
square_repeated(acc, 5)
acc.prod(acc, z9)
square_repeated(acc, 10)
acc.prod(acc, z)
square_repeated(acc, 7)
acc.prod(acc, z255)
square_repeated(acc, 8)
acc.prod(acc, z255)
square_repeated(acc, 6)
acc.prod(acc, z11)
square_repeated(acc, 9)
acc.prod(acc, z255)
square_repeated(acc, 2)
acc.prod(acc, z)
square_repeated(acc, 7)
acc.prod(acc, z255)
square_repeated(acc, 8)
acc.prod(acc, z255)
square_repeated(acc, 8)
acc.prod(acc, z255)
square_repeated(acc, 8)
acc.prod(acc, z255)
# acc is now z^((BaseFieldMultiplicativeOddOrder - 1)/2)
rootOfUnity.square(acc)
rootOfUnity.prod(rootOfUnity ,z)
squareRootCandidate.prod(acc, z)


func invSqrtEqDyadic*(z: var Fp): bool =
## The algorithm works by essentially computing the dlog of z and then halving it.
## negExponent is intended to hold the negative of the dlog of z.
## We determine this 32-bit value (usually) _sqrtBlockSize many bits at a time, starting with the least-significant bits.
##
## If _sqrtBlockSize does not divide 32, the *first* iteration will determine fewer bits.

var negExponent: int
var temp, temp2: Fp

# set powers[i] to z^(1<< (i*blocksize))
var powers: array[4, Fp]
powers[0] = z
for i in 1..<Fp.C.tonelliShanks(sqrtParam_Blocks):
powers[i] = powers[i - 1]
for j in 0..<Fp.C.tonelliShanks(sqrtParam_BlockSize):
powers[i].square(powers[i])

## looking at the dlogs, powers[i] is essentially the wanted exponent, left-shifted by i*_sqrtBlockSize and taken mod 1<<32
## dlogHighDyadicRootNeg essentially (up to sign) reads off the _sqrtBlockSize many most significant bits. (returned as low-order bits)
##
## first iteration may be slightly special if BlockSize does not divide 32
negExponent = sqrtAlg_NegDlogInSmallDyadicSubgroup(powers[Fp.C.tonelliShanks(sqrtParam_Blocks) - 1])
negExponent = negExponent shr Fp.C.tonelliShanks(sqrtParam_FirstBlockUnusedBits)

# if the exponent we just got is odd, there is no square root, no point in determining the other bits
if (negExponent and 1) == 1:
return false

for i in 1..<Fp.C.tonelliShanks(sqrtParam_Blocks):
temp2 = powers[Fp.C.tonelliShanks(sqrtParam_Blocks) - 1 - i]
for j in 0..<i:
sqrtAlg_GetPrecomputedRootOfUnity(temp, int( (negExponent shr (j*Fp.C.tonelliShanks(sqrtParam_BlockSize))) and Fp.C.tonelliShanks(sqrtParam_BitMask) ), uint(j + Fp.C.tonelliShanks(sqrtParam_Blocks) - 1 - i))
temp2.prod(temp2, temp)

var newBits = sqrtAlg_NegDlogInSmallDyadicSubgroup(temp2)
negExponent = negExponent or (newBits shl ((i*Fp.C.tonelliShanks(sqrtParam_BlockSize)) - Fp.C.tonelliShanks(sqrtParam_FirstBlockUnusedBits)))

negExponent = negExponent shr 1
z.setOne()

for i in 0..<Fp.C.tonelliShanks(sqrtParam_Blocks):
sqrtAlg_GetPrecomputedRootOfUnity(temp, int((negExponent shr (i*Fp.C.tonelliShanks(sqrtParam_BlockSize))) and Fp.C.tonelliShanks(sqrtParam_BitMask)), uint(i))
z.prod(z, temp)

return true

func sqrtPrecomp*(dst: var Fp, x: Fp): SecretBool {.inline.} =
dst.setZero()
if x.isZero().bool():
return SecretBool(true)

var xCopy: Fp = x
var candidate, rootOfUnity: Fp
sqrtAlg_ComputerRelevantPowers(xCopy, candidate, rootOfUnity)
if not invSqrtEqDyadic(rootOfUnity):
return SecretBool(false)

dst.prod(candidate, rootOfUnity)
return SecretBool(true)

Loading

0 comments on commit 2bfd4e6

Please sign in to comment.