[ROCm] add support for ROCm/HIP device #6086

Open · wants to merge 21 commits into base: master
16 changes: 16 additions & 0 deletions .ci/hipify.sh
@@ -0,0 +1,16 @@
#!/bin/bash
Collaborator: I think this file should be added in a follow-up PR, in which we'll either enable hipifying in our CI or ask users to hipify locally before submitting CUDA-related PRs.

Collaborator: I agree. We can postpone this to the next PR for ROCm.

# Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.

for DIR in ./src ./include
do
  for EXT in cpp h hpp cu
  do
    find ${DIR} -name "*.${EXT}" -exec sh -c '
      echo "hipifying $1 in-place"
      hipify-perl "$1" -inplace &
Collaborator: Where do we get the hipify-perl script?

Collaborator: It is installed when installing HIP. See https://github.com/ROCm/HIP/blob/master/INSTALL.md

    ' sh {} \;
  done
done

echo "waiting for all hipify-perl invocations to finish"
wait
3 changes: 3 additions & 0 deletions .gitignore
@@ -463,3 +463,6 @@ dask-worker-space/
*.pub
*.rdp
*_rsa

# hipify-perl -inplace leaves behind *.prehip files
*.prehip
99 changes: 99 additions & 0 deletions CMakeLists.txt
@@ -4,6 +4,7 @@ option(USE_GPU "Enable GPU-accelerated training" OFF)
option(USE_SWIG "Enable SWIG to generate Java API" OFF)
option(USE_TIMETAG "Set to ON to output time costs" OFF)
option(USE_CUDA "Enable CUDA-accelerated training " OFF)
option(USE_ROCM "Enable ROCM-accelerated training " OFF)
option(USE_DEBUG "Set to ON for Debug mode" OFF)
option(USE_SANITIZER "Use sanitizer flags" OFF)
set(
@@ -160,6 +161,11 @@ if(USE_CUDA)
set(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE)
endif()

if(USE_ROCM)
enable_language(HIP)
set(USE_OPENMP ON CACHE BOOL "ROCM requires OpenMP" FORCE)
endif()

if(USE_OPENMP)
if(APPLE)
find_package(OpenMP)
@@ -302,6 +308,85 @@ if(USE_CUDA)
endforeach()
endif()

if(USE_ROCM)
find_package(HIP)
include_directories(${HIP_INCLUDE_DIRS})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_AMD__")
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${OpenMP_CXX_FLAGS} -fPIC -Wall")

# avoid warning: unused variable 'mask' due to __shfl_down_sync work-around
set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-unused-variable")
# avoid warning: 'hipHostAlloc' is deprecated: use hipHostMalloc instead
set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-deprecated-declarations")
# avoid many warnings about missing overrides
set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-inconsistent-missing-override")
# avoid warning: shift count >= width of type in feature_histogram.hpp
set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-shift-count-overflow")

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${DISABLED_WARNINGS}")
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${DISABLED_WARNINGS}")

if(USE_DEBUG)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -g -O0")
else()
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -O3")
endif()
message(STATUS "CMAKE_HIP_FLAGS: ${CMAKE_HIP_FLAGS}")

add_definitions(-DUSE_CUDA)

set(
BASE_DEFINES
-DPOWER_FEATURE_WORKGROUPS=12
-DUSE_CONSTANT_BUF=0
)
set(
ALLFEATS_DEFINES
${BASE_DEFINES}
-DENABLE_ALL_FEATURES
)
set(
FULLDATA_DEFINES
${ALLFEATS_DEFINES}
-DIGNORE_INDICES
)

message(STATUS "ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}")
message(STATUS "FULLDATA_DEFINES: ${FULLDATA_DEFINES}")

function(add_histogram hsize hname hadd hconst hdir)
Collaborator (@StrikerRUS), Dec 17, 2024: How does this function differ from the existing one for CUDA? Can we reuse it or merge these two functions into one?
(Referenced: function(add_histogram hsize hname hadd hconst hdir))

add_library(histo${hsize}${hname} OBJECT src/treelearner/kernels/histogram${hsize}.cu)
if(hadd)
list(APPEND histograms histo${hsize}${hname})
set(histograms ${histograms} PARENT_SCOPE)
endif()
target_compile_definitions(
histo${hsize}${hname}
PRIVATE
-DCONST_HESSIAN=${hconst}
${hdir}
)
endfunction()

foreach(hsize _16_64_256)
Collaborator: Same question as for the add_histogram function. Can we [encapsulate this for-loop into a function and] reuse it with CUDA and HIP?

add_histogram("${hsize}" "_sp_const" "True" "1" "${BASE_DEFINES}")
add_histogram("${hsize}" "_sp" "True" "0" "${BASE_DEFINES}")
add_histogram("${hsize}" "-allfeats_sp_const" "False" "1" "${ALLFEATS_DEFINES}")
add_histogram("${hsize}" "-allfeats_sp" "False" "0" "${ALLFEATS_DEFINES}")
add_histogram("${hsize}" "-fulldata_sp_const" "True" "1" "${FULLDATA_DEFINES}")
add_histogram("${hsize}" "-fulldata_sp" "True" "0" "${FULLDATA_DEFINES}")
endforeach()
endif()

if(USE_HDFS)
Collaborator: HDFS support was dropped some time ago. This if block should be removed.

find_package(JNI REQUIRED)
find_path(HDFS_INCLUDE_DIR hdfs.h REQUIRED)
find_library(HDFS_LIB NAMES hdfs REQUIRED)
include_directories(${HDFS_INCLUDE_DIR})
add_definitions(-DUSE_HDFS)
set(HDFS_CXX_LIBRARIES ${HDFS_LIB} ${JAVA_JVM_LIBRARY})
endif()

include(CheckCXXSourceCompiles)
check_cxx_source_compiles("
#include <xmmintrin.h>
@@ -644,6 +729,20 @@ if(USE_CUDA)
target_link_libraries(_lightgbm PRIVATE ${histograms})
endif()

if(USE_ROCM)
Collaborator: Can we merge CUDA and HIP with if(USE_CUDA OR USE_ROCM) here?

# histograms are list of object libraries. Linking object library to other
# object libraries only gets usage requirements, the linked objects won't be
# used. Thus we have to call target_link_libraries on final targets here.
if(BUILD_CLI)
target_link_libraries(lightgbm PRIVATE ${histograms})
endif()
target_link_libraries(_lightgbm PRIVATE ${histograms})
endif()

if(USE_HDFS)
Collaborator: Remove this.

target_link_libraries(lightgbm_objs PUBLIC ${HDFS_CXX_LIBRARIES})
endif()

if(WIN32)
if(MINGW OR CYGWIN)
target_link_libraries(lightgbm_objs PUBLIC ws2_32 iphlpapi)
50 changes: 46 additions & 4 deletions include/LightGBM/cuda/cuda_algorithms.hpp
@@ -1,6 +1,7 @@
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/

#ifndef LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_
@@ -14,6 +15,7 @@

#include <LightGBM/bin.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/cuda/cuda_rocm_interop.h>
#include <LightGBM/utils/log.h>

#include <algorithm>
@@ -174,7 +176,7 @@ __device__ __forceinline__ void GlobalMemoryPrefixSum(T* array, const size_t len
for (size_t index = start; index < end; ++index) {
thread_sum += array[index];
}
__shared__ T shared_mem[32];
__shared__ T shared_mem[WARPSIZE];
const T thread_base = ShufflePrefixSumExclusive<T>(thread_sum, shared_mem);
if (start < end) {
array[start] += thread_base;
@@ -483,7 +485,7 @@ __device__ void ShuffleSortedPrefixSumDevice(const VAL_T* in_values,
const INDEX_T* sorted_indices,
REDUCE_VAL_T* out_values,
const INDEX_T num_data) {
__shared__ REDUCE_VAL_T shared_buffer[32];
__shared__ REDUCE_VAL_T shared_buffer[WARPSIZE];
const INDEX_T num_data_per_thread = (num_data + static_cast<INDEX_T>(blockDim.x) - 1) / static_cast<INDEX_T>(blockDim.x);
const INDEX_T start = num_data_per_thread * static_cast<INDEX_T>(threadIdx.x);
const INDEX_T end = min(start + num_data_per_thread, num_data);
@@ -572,8 +574,48 @@ __device__ VAL_T PercentileDevice(const VAL_T* values,
INDEX_T* indices,
REDUCE_WEIGHT_T* weights_prefix_sum,
const double alpha,
const INDEX_T len);

const INDEX_T len) {
  if (len <= 1) {
    return values[0];
  }
  if (!USE_WEIGHT) {
    BitonicArgSortDevice<VAL_T, INDEX_T, ASCENDING, BITONIC_SORT_NUM_ELEMENTS / 2, 10>(values, indices, len);
    const double float_pos = (1.0f - alpha) * len;
    const INDEX_T pos = static_cast<INDEX_T>(float_pos);
    if (pos < 1) {
      return values[indices[0]];
    } else if (pos >= len) {
      return values[indices[len - 1]];
    } else {
      const double bias = float_pos - pos;
      const VAL_T v1 = values[indices[pos - 1]];
      const VAL_T v2 = values[indices[pos]];
      return static_cast<VAL_T>(v1 - (v1 - v2) * bias);
    }
  } else {
    BitonicArgSortDevice<VAL_T, INDEX_T, ASCENDING, BITONIC_SORT_NUM_ELEMENTS / 4, 9>(values, indices, len);
    ShuffleSortedPrefixSumDevice<WEIGHT_T, REDUCE_WEIGHT_T, INDEX_T>(weights, indices, weights_prefix_sum, len);
    const REDUCE_WEIGHT_T threshold = weights_prefix_sum[len - 1] * (1.0f - alpha);
    __shared__ INDEX_T pos;
    if (threadIdx.x == 0) {
      pos = len;
    }
    __syncthreads();
    for (INDEX_T index = static_cast<INDEX_T>(threadIdx.x); index < len; index += static_cast<INDEX_T>(blockDim.x)) {
      if (weights_prefix_sum[index] > threshold && (index == 0 || weights_prefix_sum[index - 1] <= threshold)) {
        pos = index;
      }
    }
    __syncthreads();
    pos = min(pos, len - 1);
    if (pos == 0 || pos == len - 1) {
      return values[pos];
    }
    const VAL_T v1 = values[indices[pos - 1]];
    const VAL_T v2 = values[indices[pos]];
    return static_cast<VAL_T>(v1 - (v1 - v2) * (threshold - weights_prefix_sum[pos - 1]) / (weights_prefix_sum[pos] - weights_prefix_sum[pos - 1]));
  }
}

} // namespace LightGBM

20 changes: 20 additions & 0 deletions include/LightGBM/cuda/cuda_rocm_interop.h
@@ -0,0 +1,20 @@
/*!
* Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifdef USE_CUDA

#if defined(__HIP_PLATFORM_AMD__) || defined(__HIP__)
// ROCm doesn't have __shfl_down_sync, only __shfl_down without mask.
// Since mask is full 0xffffffff, we can use __shfl_down instead.
#define __shfl_down_sync(mask, val, offset) __shfl_down(val, offset)
#define __shfl_up_sync(mask, val, offset) __shfl_up(val, offset)
// ROCm warpSize is constexpr and is either 32 or 64 depending on gfx arch.
#define WARPSIZE warpSize
// ROCm doesn't have atomicAdd_block, but it should be semantically the same as atomicAdd
#define atomicAdd_block atomicAdd
#else
// CUDA warpSize is not a constexpr, but always 32
#define WARPSIZE 32
#endif

#endif
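
For illustration only (a minimal sketch, not part of this PR's diff): device code written against __shfl_down_sync and WARPSIZE compiles unchanged on both back ends once this header is included, because the ROCm branch rewrites the masked shuffle to __shfl_down and resolves WARPSIZE to the constexpr warpSize. The function name WarpReduceSum is invented for the example, and the sketch assumes USE_CUDA is also defined in ROCm builds, as the CMakeLists.txt changes above arrange via add_definitions(-DUSE_CUDA).

#include <LightGBM/cuda/cuda_rocm_interop.h>

// Warp-level sum: every lane contributes `val`; after the loop, lane 0 holds
// the total. WARPSIZE is 32 on CUDA and the 32- or 64-lane warpSize on ROCm.
__device__ __forceinline__ float WarpReduceSum(float val) {
  for (int offset = WARPSIZE / 2; offset > 0; offset >>= 1) {
    // The mask is always the full 0xffffffff, so the ROCm mapping to the
    // maskless __shfl_down is semantically equivalent.
    val += __shfl_down_sync(0xffffffff, val, offset);
  }
  return val;
}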
11 changes: 6 additions & 5 deletions include/LightGBM/cuda/cuda_split_info.hpp
@@ -2,6 +2,7 @@
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/

#ifdef USE_CUDA
@@ -40,24 +41,24 @@ class CUDASplitInfo {
uint32_t* cat_threshold = nullptr;
int* cat_threshold_real = nullptr;

__device__ CUDASplitInfo() {
__host__ __device__ CUDASplitInfo() {
num_cat_threshold = 0;
cat_threshold = nullptr;
cat_threshold_real = nullptr;
}

__device__ ~CUDASplitInfo() {
__host__ __device__ ~CUDASplitInfo() {
if (num_cat_threshold > 0) {
if (cat_threshold != nullptr) {
cudaFree(cat_threshold);
CUDASUCCESS_OR_FATAL(cudaFree(cat_threshold));
}
if (cat_threshold_real != nullptr) {
cudaFree(cat_threshold_real);
CUDASUCCESS_OR_FATAL(cudaFree(cat_threshold_real));
}
}
}

__device__ CUDASplitInfo& operator=(const CUDASplitInfo& other) {
__host__ __device__ CUDASplitInfo& operator=(const CUDASplitInfo& other) {
is_valid = other.is_valid;
leaf_index = other.leaf_index;
gain = other.gain;
3 changes: 2 additions & 1 deletion include/LightGBM/cuda/vector_cudahost.h
@@ -1,6 +1,7 @@
/*!
* Copyright (c) 2020 IBM Corporation, Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifndef LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_
#define LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_
@@ -45,7 +46,7 @@ struct CHAllocator {
n = SIZE_ALIGNED(n);
#ifdef USE_CUDA
if (LGBM_config_::current_device == lgbm_device_cuda) {
cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable);
cudaError_t ret = cudaHostAlloc(reinterpret_cast<void**>(&ptr), n*sizeof(T), cudaHostAllocPortable);
if (ret != cudaSuccess) {
Log::Warning("Defaulting to malloc in CHAllocator!!!");
ptr = reinterpret_cast<T*>(_mm_malloc(n*sizeof(T), 16));
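
As context for this last hunk (an illustrative sketch, not part of the diff): CHAllocator is used as a drop-in std::vector allocator whose allocate() prefers pinned host memory via cudaHostAlloc and falls back to aligned malloc, so the reinterpret_cast here only makes the existing call explicit. Assuming CHAllocator lives in namespace LightGBM as the header suggests, usage looks roughly like this; the variable name and size are invented for the example.

#include <vector>
#include <LightGBM/cuda/vector_cudahost.h>

// Element storage comes from cudaHostAlloc(..., cudaHostAllocPortable) when a
// CUDA/HIP device is active (pinned memory, faster host<->device copies), and
// falls back to 16-byte-aligned malloc otherwise, per allocate() above.
std::vector<float, LightGBM::CHAllocator<float>> gradients(1 << 20);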