From efbf8ce3cc96512edab8db1e8e7d239f2ddd125e Mon Sep 17 00:00:00 2001 From: Jiashu Yao Date: Tue, 20 Aug 2024 01:53:44 +0000 Subject: [PATCH] Revert the commit 9e02f53 to make it a standalone PR. --- .../merlin/core_kernels/group_lock_kernels.cuh | 18 +++++++++--------- include/merlin/types.cuh | 4 ++-- include/merlin/utils.cuh | 3 +-- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/include/merlin/core_kernels/group_lock_kernels.cuh b/include/merlin/core_kernels/group_lock_kernels.cuh index ae2f38466..d42f208cd 100644 --- a/include/merlin/core_kernels/group_lock_kernels.cuh +++ b/include/merlin/core_kernels/group_lock_kernels.cuh @@ -21,7 +21,7 @@ namespace nv { namespace merlin { namespace group_lock { -static __global__ void init_kernel( +__global__ void init_kernel( cuda::atomic* update_count, cuda::atomic* read_count, cuda::atomic* unique_flag) { @@ -29,7 +29,7 @@ static __global__ void init_kernel( new (read_count) cuda::atomic{0}; new (unique_flag) cuda::atomic{false}; } -static __global__ void lock_read_kernel( +__global__ void lock_read_kernel( cuda::atomic* update_count, cuda::atomic* read_count) { for (;;) { @@ -43,12 +43,12 @@ static __global__ void lock_read_kernel( } } -static __global__ void unlock_read_kernel( +__global__ void unlock_read_kernel( cuda::atomic* read_count) { read_count->fetch_sub(1, cuda::std::memory_order_relaxed); } -static __global__ void lock_update_kernel( +__global__ void lock_update_kernel( cuda::atomic* update_count, cuda::atomic* read_count) { for (;;) { @@ -62,12 +62,12 @@ static __global__ void lock_update_kernel( } } -static __global__ void unlock_update_kernel( +__global__ void unlock_update_kernel( cuda::atomic* update_count) { update_count->fetch_sub(1, cuda::std::memory_order_relaxed); } -static __global__ void lock_update_read_kernel( +__global__ void lock_update_read_kernel( cuda::atomic* update_count, cuda::atomic* read_count, cuda::atomic* unique_flag) { @@ -101,7 +101,7 @@ static __global__ void lock_update_read_kernel( } } -static __global__ void unlock_update_read_kernel( +__global__ void unlock_update_read_kernel( cuda::atomic* update_count, cuda::atomic* read_count, cuda::atomic* unique_flag) { @@ -110,12 +110,12 @@ static __global__ void unlock_update_read_kernel( unique_flag->store(false, cuda::std::memory_order_relaxed); } -static __global__ void update_count_kernel( +__global__ void update_count_kernel( int* counter, cuda::atomic* update_count) { *counter = update_count->load(cuda::std::memory_order_relaxed); } -static __global__ void read_count_kernel( +__global__ void read_count_kernel( int* counter, cuda::atomic* read_count) { *counter = read_count->load(cuda::std::memory_order_relaxed); } diff --git a/include/merlin/types.cuh b/include/merlin/types.cuh index ebbe1bffd..cc266604f 100644 --- a/include/merlin/types.cuh +++ b/include/merlin/types.cuh @@ -54,7 +54,7 @@ constexpr uint64_t MAX_SCORE = UINT64_C(0xFFFFFFFFFFFFFFFF); constexpr uint64_t EMPTY_SCORE = UINT64_C(0); constexpr uint64_t IGNORED_GLOBAL_EPOCH = UINT64_C(0xFFFFFFFFFFFFFFFF); -static uint64_t EMPTY_KEY_CPU = DEFAULT_EMPTY_KEY; +uint64_t EMPTY_KEY_CPU = DEFAULT_EMPTY_KEY; __constant__ uint64_t EMPTY_KEY = DEFAULT_EMPTY_KEY; __constant__ uint64_t RECLAIM_KEY = DEFAULT_RECLAIM_KEY; __constant__ uint64_t LOCKED_KEY = DEFAULT_LOCKED_KEY; @@ -76,7 +76,7 @@ __forceinline__ __device__ bool IS_VACANT_KEY(K key) { return (VACANT_KEY_MASK_1 & key) == VACANT_KEY_MASK_2; } -static cudaError_t init_reserved_keys(int index) { +cudaError_t init_reserved_keys(int index) { if (index < 1 || index > MAX_RESERVED_KEY_BIT) { // index = 0 is the default, // index = 62 is the maximum index can be set for reserved keys. diff --git a/include/merlin/utils.cuh b/include/merlin/utils.cuh index 885682bb8..d33ee50ae 100644 --- a/include/merlin/utils.cuh +++ b/include/merlin/utils.cuh @@ -314,8 +314,7 @@ inline void free_pointers(cudaStream_t stream, int n, ...) { va_end(args); } -static __global__ void memset64bitKernel(void* devPtr, uint64_t value, - size_t count) { +__global__ void memset64bitKernel(void* devPtr, uint64_t value, size_t count) { size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { static_cast(devPtr)[idx] = value;