NVIDIA-Merlin · jiashuy · Aug 21, 2024 · Aug 20, 2024
diff --git a/include/merlin/core_kernels/group_lock_kernels.cuh b/include/merlin/core_kernels/group_lock_kernels.cuh
@@ -21,15 +21,15 @@ namespace nv {
 namespace merlin {
 namespace group_lock {
 
-static __global__ void init_kernel(
+__global__ void init_kernel(
     cuda::atomic<int, cuda::thread_scope_device>* update_count,
     cuda::atomic<int, cuda::thread_scope_device>* read_count,
     cuda::atomic<bool, cuda::thread_scope_device>* unique_flag) {
   new (update_count) cuda::atomic<int, cuda::thread_scope_device>{0};
   new (read_count) cuda::atomic<int, cuda::thread_scope_device>{0};
   new (unique_flag) cuda::atomic<bool, cuda::thread_scope_device>{false};
 }
-static __global__ void lock_read_kernel(
+__global__ void lock_read_kernel(
     cuda::atomic<int, cuda::thread_scope_device>* update_count,
     cuda::atomic<int, cuda::thread_scope_device>* read_count) {
   for (;;) {
@@ -43,12 +43,12 @@ static __global__ void lock_read_kernel(
   }
 }
 
-static __global__ void unlock_read_kernel(
+__global__ void unlock_read_kernel(
     cuda::atomic<int, cuda::thread_scope_device>* read_count) {
   read_count->fetch_sub(1, cuda::std::memory_order_relaxed);
 }
 
-static __global__ void lock_update_kernel(
+__global__ void lock_update_kernel(
     cuda::atomic<int, cuda::thread_scope_device>* update_count,
     cuda::atomic<int, cuda::thread_scope_device>* read_count) {
   for (;;) {
@@ -62,12 +62,12 @@ static __global__ void lock_update_kernel(
   }
 }
 
-static __global__ void unlock_update_kernel(
+__global__ void unlock_update_kernel(
     cuda::atomic<int, cuda::thread_scope_device>* update_count) {
   update_count->fetch_sub(1, cuda::std::memory_order_relaxed);
 }
 
-static __global__ void lock_update_read_kernel(
+__global__ void lock_update_read_kernel(
     cuda::atomic<int, cuda::thread_scope_device>* update_count,
     cuda::atomic<int, cuda::thread_scope_device>* read_count,
     cuda::atomic<bool, cuda::thread_scope_device>* unique_flag) {
@@ -101,7 +101,7 @@ static __global__ void lock_update_read_kernel(
   }
 }
 
-static __global__ void unlock_update_read_kernel(
+__global__ void unlock_update_read_kernel(
     cuda::atomic<int, cuda::thread_scope_device>* update_count,
     cuda::atomic<int, cuda::thread_scope_device>* read_count,
     cuda::atomic<bool, cuda::thread_scope_device>* unique_flag) {
@@ -110,12 +110,12 @@ static __global__ void unlock_update_read_kernel(
   unique_flag->store(false, cuda::std::memory_order_relaxed);
 }
 
-static __global__ void update_count_kernel(
+__global__ void update_count_kernel(
     int* counter, cuda::atomic<int, cuda::thread_scope_device>* update_count) {
   *counter = update_count->load(cuda::std::memory_order_relaxed);
 }
 
-static __global__ void read_count_kernel(
+__global__ void read_count_kernel(
     int* counter, cuda::atomic<int, cuda::thread_scope_device>* read_count) {
   *counter = read_count->load(cuda::std::memory_order_relaxed);
 }

diff --git a/include/merlin/types.cuh b/include/merlin/types.cuh
@@ -54,7 +54,7 @@ constexpr uint64_t MAX_SCORE = UINT64_C(0xFFFFFFFFFFFFFFFF);
 constexpr uint64_t EMPTY_SCORE = UINT64_C(0);
 constexpr uint64_t IGNORED_GLOBAL_EPOCH = UINT64_C(0xFFFFFFFFFFFFFFFF);
 
-static uint64_t EMPTY_KEY_CPU = DEFAULT_EMPTY_KEY;
+uint64_t EMPTY_KEY_CPU = DEFAULT_EMPTY_KEY;
 __constant__ uint64_t EMPTY_KEY = DEFAULT_EMPTY_KEY;
 __constant__ uint64_t RECLAIM_KEY = DEFAULT_RECLAIM_KEY;
 __constant__ uint64_t LOCKED_KEY = DEFAULT_LOCKED_KEY;
@@ -76,7 +76,7 @@ __forceinline__ __device__ bool IS_VACANT_KEY(K key) {
   return (VACANT_KEY_MASK_1 & key) == VACANT_KEY_MASK_2;
 }
 
-static cudaError_t init_reserved_keys(int index) {
+cudaError_t init_reserved_keys(int index) {
   if (index < 1 || index > MAX_RESERVED_KEY_BIT) {
     // index = 0 is the default,
     // index = 62 is the maximum index can be set for reserved keys.

diff --git a/include/merlin/utils.cuh b/include/merlin/utils.cuh
@@ -314,8 +314,7 @@ inline void free_pointers(cudaStream_t stream, int n, ...) {
   va_end(args);
 }
 
-static __global__ void memset64bitKernel(void* devPtr, uint64_t value,
-                                         size_t count) {
+__global__ void memset64bitKernel(void* devPtr, uint64_t value, size_t count) {
   size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < count) {
     static_cast<uint64_t*>(devPtr)[idx] = value;