Skip to content

Commit

Permalink
Nvidia GPU: add field substraction
Browse files Browse the repository at this point in the history
  • Loading branch information
mratsim committed Jan 20, 2024
1 parent f0dd144 commit bd0948c
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 3 deletions.
63 changes: 62 additions & 1 deletion constantine/math_codegen/fields_nvidia.nim
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,21 @@ import

# ############################################################
#
# Field arithmetic on Nvidia GPU
# Field arithmetic on Nvidia GPUs
#
# ############################################################

# Loads from global (kernel params) take over 100 cycles
# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#operand-costs

# Instructions cycle count:
# - Dissecting the NVIDIA Volta GPU Architecture via Microbenchmarking
# Zhe Jia, Marco Maggioni, Benjamin Staiger, Daniele P. Scarpazza
# https://arxiv.org/pdf/1804.06826.pdf
# - Demystifying the Nvidia Ampere Architecture through Microbenchmarking
# and Instruction-level Analysis
# https://arxiv.org/pdf/2208.11174.pdf

proc finalSubMayOverflow*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field, r, a: Array) =
## If a >= Modulus: r <- a-M
## else: r <- a
Expand Down Expand Up @@ -115,3 +123,56 @@ proc field_add_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef
bld.retVoid()

return (addModTy, addModKernel)

proc field_sub_gen*(asy: Assembler_LLVM, cm: CurveMetadata, field: Field): FnDef =
  ## Generate a modular subtraction kernel for `field` (fp or fr).
  ##
  ## The generated kernel returns void and takes three pointers
  ## to field elements, in this order: `r` (result), `a`, `b`.
  ## The modulus is not a kernel parameter, it is obtained
  ## from `cm.getModulus(field)` at code generation time.
  ##
  ## Returns the pair (function type, function value) of the generated kernel.
  ##
  ## NOTE(review): the statement order below looks significant.
  ## The `sub_*` / `add_*` builder calls presumably form a borrow/carry chain,
  ## confirm before reordering or inserting code between them.

  # Kernel symbol name is derived from the opcode matching the requested field
  let procName = cm.genSymbol(block:
    case field
    of fp: opFpSub
    of fr: opFrSub)
  let fieldTy = cm.getFieldType(field)
  let pFieldTy = pointer_t(fieldTy)

  # void kernel(r, a, b), all three are pointers to a field element
  let subModTy = function_t(asy.void_t, [pFieldTy, pFieldTy, pFieldTy])
  let subModKernel = asy.module.addFunction(cstring procName, subModTy)
  let blck = asy.ctx.appendBasicBlock(subModKernel, "subModBody")
  asy.builder.positionAtEnd(blck)

  let bld = asy.builder

  # View the kernel parameters as arrays of words
  let r = bld.asArray(subModKernel.getParam(0), fieldTy)
  let a = bld.asArray(subModKernel.getParam(1), fieldTy)
  let b = bld.asArray(subModKernel.getParam(2), fieldTy)

  # Scratch array receiving the word-by-word difference
  let t = bld.makeArray(fieldTy)
  let N = cm.getNumWords(field)

  # t <- a - b, word by word.
  # Presumably `sub_bo` produces a borrow and `sub_bio` consumes
  # and produces one, confirm against the builder definitions.
  t[0] = bld.sub_bo(a[0], b[0])
  for i in 1 ..< N:
    t[i] = bld.sub_bio(a[i], b[i])

  # Presumably 0 - 0 - borrow: all bits set if a < b, zero otherwise.
  # The literal width follows the word size of the curve metadata.
  let underflowMask = case cm.wordSize
    of size32: bld.sub_bi(0'u32, 0'u32)
    of size64: bld.sub_bi(0'u64, 0'u64)

  # If underflow, the modulus has to be added back.
  # This is done without branching: the modulus is and-ed with the mask
  # so that it is added as-is or as zero.
  # TODO: predicated mov instead?
  #       The number of cycles is not available in https://arxiv.org/pdf/2208.11174.pdf
  let M = (seq[ValueRef])(cm.getModulus(field))
  let maskedM = bld.makeArray(fieldTy)
  for i in 0 ..< N:
    maskedM[i] = bld.`and`(M[i], underflowMask)

  # t <- t + maskedM, word by word.
  # Presumably `add_co` produces a carry, `add_cio` consumes and produces one
  # and `add_ci` only consumes it, confirm against the builder definitions.
  block:
    t[0] = bld.add_co(t[0], maskedM[0])
    for i in 1 ..< N-1:
      t[i] = bld.add_cio(t[i], maskedM[i])
    # With a single word there is no last word distinct from t[0]
    if N > 1:
      t[N-1] = bld.add_ci(t[N-1], maskedM[N-1])

  # Write the result through the `r` pointer
  bld.store(r, t)
  bld.retVoid()

  return (subModTy, subModKernel)
2 changes: 2 additions & 0 deletions constantine/platforms/code_generator/ir.nim
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,8 @@ type
Opcode* = enum
opFpAdd = "fp_add"
opFrAdd = "fr_add"
opFpSub = "fp_sub"
opFrSub = "fr_sub"

proc setFieldConst(fc: var FieldConst, ctx: ContextRef, wordSize: WordSize, modBits: uint32, modulus: string) =
let wordTy = case wordSize
Expand Down
66 changes: 64 additions & 2 deletions tests/gpu/t_nvidia_fp.nim
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ proc genFieldAddPTX(asy: Assembler_LLVM, cm: CurveMetadata) =
let frAdd = asy.field_add_gen(cm, fr)
asy.module.setCallableCudaKernel(frAdd)

proc genFieldSubPTX(asy: Assembler_LLVM, cm: CurveMetadata) =
  ## Generate the field subtraction kernels for `fp` then `fr`
  ## and flag each of them as a callable CUDA kernel in the module of `asy`.
  setCallableCudaKernel(asy.module, field_sub_gen(asy, cm, fp))
  setCallableCudaKernel(asy.module, field_sub_gen(asy, cm, fr))

# Init LLVM
# -------------------------
initializeFullNVPTXTarget()
Expand Down Expand Up @@ -109,6 +115,60 @@ proc t_field_add(curve: static Curve) =
doAssert bool(rCPU == rGPU_32)
doAssert bool(rCPU == rGPU_64)

proc t_field_sub(curve: static Curve) =
  ## Compare the generated GPU subtraction kernels (32-bit and 64-bit words)
  ## with the CPU `diff` on `Iters` random pairs, for Fp then Fr of `curve`.

  # Code generation: both word sizes are emitted with the same assembler
  # -------------------------
  let assembler = Assembler_LLVM.new(bkNvidiaPTX, cstring("t_nvidia_" & $curve))
  let meta32 = CurveMetadata.init(assembler, curve, size32)
  genFieldSubPTX(assembler, meta32)
  let meta64 = CurveMetadata.init(assembler, curve, size64)
  genFieldSubPTX(assembler, meta64)

  let ptxCode = codegenNvidiaPTX(assembler, sm)

  # GPU setup: load the PTX, cleanup is deferred to scope exit
  # -------------------------
  var
    cuCtx: CUcontext
    cuMod: CUmodule
  check cuCtxCreate(cuCtx, 0, cudaDevice)
  check cuModuleLoadData(cuMod, ptxCode)
  defer:
    check cuModuleUnload(cuMod)
    check cuCtxDestroy(cuCtx)

  let kernelFp32 = getCudaKernel(cuMod, meta32, opFpSub)
  let kernelFp64 = getCudaKernel(cuMod, meta64, opFpSub)
  let kernelFr32 = getCudaKernel(cuMod, meta32, opFrSub)
  let kernelFr64 = getCudaKernel(cuMod, meta64, opFrSub)

  # Fp: CPU reference vs both GPU kernels
  for _ in 0 ..< Iters:
    let lhs = rng.random_long01Seq(Fp[curve])
    let rhs = rng.random_long01Seq(Fp[curve])

    var rCPU, rGPU_32, rGPU_64: Fp[curve]

    diff(rCPU, lhs, rhs)
    exec(kernelFp32, rGPU_32, lhs, rhs)
    exec(kernelFp64, rGPU_64, lhs, rhs)

    doAssert bool(rCPU == rGPU_32)
    doAssert bool(rCPU == rGPU_64)

  # Fr: CPU reference vs both GPU kernels
  for _ in 0 ..< Iters:
    let lhs = rng.random_long01Seq(Fr[curve])
    let rhs = rng.random_long01Seq(Fr[curve])

    var rCPU, rGPU_32, rGPU_64: Fr[curve]

    diff(rCPU, lhs, rhs)
    exec(kernelFr32, rGPU_32, lhs, rhs)
    exec(kernelFr64, rGPU_64, lhs, rhs)

    doAssert bool(rCPU == rGPU_32)
    doAssert bool(rCPU == rGPU_64)

proc main() =
const curves = [
P224,
Expand All @@ -125,10 +185,12 @@ proc main() =
BW6_761
]

suite "[Nvidia GPU] Field Addition":
suite "[Nvidia GPU] Field Arithmetic":
staticFor i, 0, curves.len:
const curve = curves[i]
test "Nvidia GPU field addition (𝔽p, 𝔽r) for " & $curve:
test "Nvidia GPU field addition (𝔽p, 𝔽r) for " & $curve:
t_field_add(curve)
test "Nvidia GPU field substraction (𝔽p, 𝔽r) for " & $curve:
t_field_sub(curve)

main()

0 comments on commit bd0948c

Please sign in to comment.