mratsim · mratsim · Aug 27, 2024 · Aug 27, 2024 · Aug 27, 2024 · Aug 27, 2024
diff --git a/constantine/math_compiler/codegen_amdgpu.nim b/constantine/math_compiler/codegen_amdgpu.nim
@@ -67,45 +67,6 @@ proc hipDeviceInit*(deviceID = 0'i32): HipDevice =
 
   return hipDevice
 
-# ############################################################
-#
-#                   LLVM IR for AMD GPUs
-#
-# ############################################################
-#
-# Note:
-#   __device__ functions for field and elliptic curve arithmetic
-#   might be compiled by default with scalar codegen
-#
-#   We will need to either:
-#   - Derive explicitly a vectorized version of the warp/wave size (32)
-#   - Derive implicitly a vectorized version, probably with __forceinline__
-
-proc wrapInCallableHipKernel*(module: ModuleRef, fn: FnDef) =
-  ## Create a public wrapper of a Hip device function
-  ##
-  ## A function named `addmod` can be found by appending _public
-  ##   check hipModuleGetFunction(fnPointer, cuModule, "addmod_public")
-
-  let pubName = fn.fnImpl.getName() & "_public"
-  let pubFn = module.addFunction(cstring(pubName), fn.fnTy)
-
-  let ctx = module.getContext()
-  let builder = ctx.createBuilder()
-  defer: builder.dispose()
-
-  let blck = ctx.appendBasicBlock(pubFn, "publicKernelBody")
-  builder.positionAtEnd(blck)
-
-  var args = newSeq[ValueRef](fn.fnTy.countParamTypes())
-  for i, arg in mpairs(args):
-    arg = pubFn.getParam(i.uint32)
-  discard builder.call2(fn.fnTy, fn.fnImpl, args)
-
-  # A public kernel must return void
-  builder.retVoid()
-  pubFn.setCallingConvention(AMDGPU_KERNEL)
-
 # ############################################################
 #
 #                      Code generation

diff --git a/constantine/math_compiler/codegen_nvidia.nim b/constantine/math_compiler/codegen_nvidia.nim
@@ -109,37 +109,6 @@ proc cudaDeviceInit*(deviceID = 0'i32): CUdevice =
 
   return cuDevice
 
-# ############################################################
-#
-#                   LLVM IR for Nvidia GPUs
-#
-# ############################################################
-
-proc wrapInCallableCudaKernel*(module: ModuleRef, fn: FnDef) =
-  ## Create a public wrapper of a cuda device function
-  ##
-  ## A function named `addmod` can be found by appending _public
-  ##   check cuModuleGetFunction(fnPointer, cuModule, "addmod_public")
-
-  let pubName = fn.fnImpl.getName() & "_public"
-  let pubFn = module.addFunction(cstring(pubName), fn.fnTy)
-
-  let ctx = module.getContext()
-  let builder = ctx.createBuilder()
-  defer: builder.dispose()
-
-  let blck = ctx.appendBasicBlock(pubFn, "publicKernelBody")
-  builder.positionAtEnd(blck)
-
-  var args = newSeq[ValueRef](fn.fnTy.countParamTypes())
-  for i, arg in mpairs(args):
-    arg = pubFn.getParam(i.uint32)
-  discard builder.call2(fn.fnTy, fn.fnImpl, args)
-
-  # A public kernel must return void
-  builder.retVoid()
-  module.tagCudaKernel((fn.fnTy, pubFn))
-
 # ############################################################
 #
 #                      Code generation
@@ -194,9 +163,7 @@ proc codegenNvidiaPTX*(asy: Assembler_LLVM, sm: tuple[major, minor: int32]): str
 #
 # ############################################################
 
-proc getCudaKernel*(cuMod: CUmodule, cm: CurveMetadata, opcode: Opcode): CUfunction =
-  # Public kernels are appended _public
-  let fnName = cm.genSymbol(opcode) & "_public"
+proc getCudaKernel*(cuMod: CUmodule, fnName: string): CUfunction =
   check cuModuleGetFunction(result, cuMod, fnName)
 
 proc exec*[T](jitFn: CUfunction, r: var T, a, b: T) =

diff --git a/constantine/math_compiler/impl_fields_dispatch.nim b/constantine/math_compiler/impl_fields_dispatch.nim
@@ -0,0 +1,34 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  constantine/platforms/llvm/llvm,
+  ./ir,
+  ./impl_fields_sat {.all.},
+  ./impl_fields_nvidia {.all.}
+
+proc modadd*(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) =
+  case asy.backend
+  of {bkX86_64_Linux, bkAmdGpu}:
+    asy.modadd_sat(fd, r, a, b, M)
+  of bkNvidiaPTX:
+    asy.modadd_nvidia(fd, r, a, b, M)
+
+proc modsub*(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) =
+  case asy.backend
+  of bkNvidiaPTX:
+    asy.modsub_nvidia(fd, r, a, b, M)
+  else:
+    doAssert false, "Unimplemented"
+
+proc mtymul*(asy: Assembler_LLVM, fd: FieldDescriptor, r, a, b, M: ValueRef) =
+  case asy.backend
+  of bkNvidiaPTX:
+    asy.mtymul_nvidia(fd, r, a, b, M)
+  else:
+    doAssert false, "Unimplemented"
diff --git a/constantine/math_compiler/impl_fields_globals.nim b/constantine/math_compiler/impl_fields_globals.nim
@@ -157,8 +157,7 @@ proc getM0ninv*(asy: Assembler_LLVM, fd: FieldDescriptor): ValueRef =
         fd.wordTy
       )
 
-
-  return m0ninv
+  return asy.load2(fd.wordTy, m0ninv, "m0ninv")
 
 when isMainModule:
   let asy = Assembler_LLVM.new("test_module", bkX86_64_Linux)