diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 5dc75b1a3fb..a7efb4e6e93 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -44,10 +44,11 @@ __launch_bounds__(block_size) CUDF_KERNEL mutable_column_device_view out, size_type* __restrict__ const valid_count) { - auto tidx = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); - int const warp_id = tidx / cudf::detail::warp_size; - size_type const warps_per_grid = gridDim.x * block_size / cudf::detail::warp_size; + auto tidx = cudf::detail::grid_1d::global_thread_id(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + auto const warp_id = tidx / cudf::detail::warp_size; + auto const warps_per_grid = stride / cudf::detail::warp_size; // begin/end indices for the column data size_type const begin = 0; @@ -60,7 +61,7 @@ __launch_bounds__(block_size) CUDF_KERNEL // lane id within the current warp constexpr size_type leader_lane{0}; - int const lane_id = threadIdx.x % cudf::detail::warp_size; + auto const lane_id = threadIdx.x % cudf::detail::warp_size; size_type warp_valid_count{0}; diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index ebab3beb08f..d6b85db3f0f 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -138,7 +138,7 @@ CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher, auto const stride = cudf::detail::grid_1d::grid_stride(); // Initialize local histogram - size_type partition_number = threadIdx.x; + thread_index_type partition_number = threadIdx.x; while (partition_number < num_partitions) { shared_partition_sizes[partition_number] = 0; partition_number += blockDim.x; @@ -207,7 +207,7 @@ CUDF_KERNEL void compute_row_output_locations(size_type* __restrict__ row_partit extern __shared__ size_type shared_partition_offsets[]; // Initialize 
array of this blocks offsets from global array - size_type partition_number = threadIdx.x; + thread_index_type partition_number = threadIdx.x; while (partition_number < num_partitions) { shared_partition_offsets[partition_number] = block_partition_offsets[partition_number * gridDim.x + blockIdx.x]; @@ -303,7 +303,8 @@ CUDF_KERNEL void copy_block_partitions(InputIter input_iter, // Fetch the offset in the output buffer of each partition in this thread // block - for (size_type ipartition = threadIdx.x; ipartition < num_partitions; ipartition += blockDim.x) { + for (thread_index_type ipartition = threadIdx.x; ipartition < num_partitions; + ipartition += blockDim.x) { partition_offset_global[ipartition] = scanned_block_partition_sizes[ipartition * gridDim.x + blockIdx.x]; } diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index d27420658d6..2128bacff80 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -385,7 +385,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta, size_type const* group_cluster_offsets, bool has_nulls) { - int const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const tid = cudf::detail::grid_1d::global_thread_id(); auto const group_index = tid; if (group_index >= num_groups) { return; } diff --git a/cpp/src/transform/jit/kernel.cu b/cpp/src/transform/jit/kernel.cu index 4fd0369c26b..9d96c11c3f2 100644 --- a/cpp/src/transform/jit/kernel.cu +++ b/cpp/src/transform/jit/kernel.cu @@ -38,8 +38,9 @@ CUDF_KERNEL void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data { // cannot use global_thread_id utility due to a JIT build issue by including // the `cudf/detail/utilities/cuda.cuh` header - thread_index_type const start = threadIdx.x + blockIdx.x * blockDim.x; - thread_index_type const stride = blockDim.x * gridDim.x; + auto const block_size = static_cast<thread_index_type>(blockDim.x); + thread_index_type const 
start = threadIdx.x + blockIdx.x * block_size; thread_index_type const stride = block_size * gridDim.x; for (auto i = start; i < static_cast<thread_index_type>(size); i += stride) { GENERIC_UNARY_OP(&out_data[i], in_data[i]); diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 66bbe532e46..39c11295fbd 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -413,7 +413,7 @@ CUDF_KERNEL void compute_segment_sizes(device_span<column_device_view const> col size_type max_branch_depth) { extern __shared__ row_span thread_branch_stacks[]; - int const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const tid = static_cast<size_type>(cudf::detail::grid_1d::global_thread_id()); auto const num_segments = static_cast<size_type>(output.size()); if (tid >= num_segments) { return; }