[ROCm] add support for ROCm/HIP device #6086

Open · wants to merge 21 commits into base: master
16 changes: 16 additions & 0 deletions .ci/hipify.sh
@@ -0,0 +1,16 @@
#!/bin/bash
Collaborator: I think this file should be added in a follow-up PR, in which we'll either enable hipifying in our CI or ask users to hipify locally before submitting CUDA-related PRs.

Collaborator: I agree. We can postpone this to the next PR for ROCm.

# Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.

for DIR in ./src ./include
do
  for EXT in cpp h hpp cu
  do
    find ${DIR} -name "*.${EXT}" -exec sh -c '
      echo "hipifying $1 in-place"
      hipify-perl "$1" -inplace &
Collaborator: Where do we get the hipify-perl script?

Collaborator: It is installed when installing HIP. See https://github.com/ROCm/HIP/blob/master/INSTALL.md

    ' sh {} \;
  done
done

echo "waiting for all hipify-perl invocations to finish"
wait
3 changes: 3 additions & 0 deletions .gitignore
@@ -463,3 +463,6 @@ dask-worker-space/
*.pub
*.rdp
*_rsa

# hipify-perl -inplace leaves behind *.prehip files
*.prehip
99 changes: 99 additions & 0 deletions CMakeLists.txt
@@ -4,6 +4,7 @@ option(USE_GPU "Enable GPU-accelerated training" OFF)
option(USE_SWIG "Enable SWIG to generate Java API" OFF)
option(USE_TIMETAG "Set to ON to output time costs" OFF)
option(USE_CUDA "Enable CUDA-accelerated training " OFF)
option(USE_ROCM "Enable ROCM-accelerated training " OFF)
option(USE_DEBUG "Set to ON for Debug mode" OFF)
option(USE_SANITIZER "Use sanitizer flags" OFF)
set(
@@ -160,6 +161,11 @@ if(USE_CUDA)
set(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE)
endif()

if(USE_ROCM)
enable_language(HIP)
set(USE_OPENMP ON CACHE BOOL "ROCM requires OpenMP" FORCE)
endif()

if(USE_OPENMP)
if(APPLE)
find_package(OpenMP)
@@ -302,6 +308,85 @@ if(USE_CUDA)
endforeach()
endif()

if(USE_ROCM)
find_package(HIP)
include_directories(${HIP_INCLUDE_DIRS})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_AMD__")
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${OpenMP_CXX_FLAGS} -fPIC -Wall")

# avoid warning: unused variable 'mask' due to __shfl_down_sync work-around
set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-unused-variable")
# avoid warning: 'hipHostAlloc' is deprecated: use hipHostMalloc instead
set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-deprecated-declarations")
# avoid many warnings about missing overrides
set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-inconsistent-missing-override")
# avoid warning: shift count >= width of type in feature_histogram.hpp
set(DISABLED_WARNINGS "${DISABLED_WARNINGS} -Wno-shift-count-overflow")

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${DISABLED_WARNINGS}")
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${DISABLED_WARNINGS}")

if(USE_DEBUG)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -g -O0")
else()
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -O3")
endif()
message(STATUS "CMAKE_HIP_FLAGS: ${CMAKE_HIP_FLAGS}")

add_definitions(-DUSE_CUDA)

set(
BASE_DEFINES
-DPOWER_FEATURE_WORKGROUPS=12
-DUSE_CONSTANT_BUF=0
)
set(
ALLFEATS_DEFINES
${BASE_DEFINES}
-DENABLE_ALL_FEATURES
)
set(
FULLDATA_DEFINES
${ALLFEATS_DEFINES}
-DIGNORE_INDICES
)

message(STATUS "ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}")
message(STATUS "FULLDATA_DEFINES: ${FULLDATA_DEFINES}")

function(add_histogram hsize hname hadd hconst hdir)
Collaborator (@StrikerRUS), Dec 17, 2024: How does this function differ from the existing one for CUDA? Can we reuse it or merge these two functions into one?
(Referenced: function(add_histogram hsize hname hadd hconst hdir))

add_library(histo${hsize}${hname} OBJECT src/treelearner/kernels/histogram${hsize}.cu)
if(hadd)
list(APPEND histograms histo${hsize}${hname})
set(histograms ${histograms} PARENT_SCOPE)
endif()
target_compile_definitions(
histo${hsize}${hname}
PRIVATE
-DCONST_HESSIAN=${hconst}
${hdir}
)
endfunction()

foreach(hsize _16_64_256)
Collaborator: Same question as for the add_histogram function. Can we [encapsulate this for-loop into a function and] reuse it with CUDA and HIP?

add_histogram("${hsize}" "_sp_const" "True" "1" "${BASE_DEFINES}")
add_histogram("${hsize}" "_sp" "True" "0" "${BASE_DEFINES}")
add_histogram("${hsize}" "-allfeats_sp_const" "False" "1" "${ALLFEATS_DEFINES}")
add_histogram("${hsize}" "-allfeats_sp" "False" "0" "${ALLFEATS_DEFINES}")
add_histogram("${hsize}" "-fulldata_sp_const" "True" "1" "${FULLDATA_DEFINES}")
add_histogram("${hsize}" "-fulldata_sp" "True" "0" "${FULLDATA_DEFINES}")
endforeach()
endif()

if(USE_HDFS)
Collaborator: HDFS support was dropped some time ago. This if block should be removed.

find_package(JNI REQUIRED)
find_path(HDFS_INCLUDE_DIR hdfs.h REQUIRED)
find_library(HDFS_LIB NAMES hdfs REQUIRED)
include_directories(${HDFS_INCLUDE_DIR})
add_definitions(-DUSE_HDFS)
set(HDFS_CXX_LIBRARIES ${HDFS_LIB} ${JAVA_JVM_LIBRARY})
endif()

include(CheckCXXSourceCompiles)
check_cxx_source_compiles("
#include <xmmintrin.h>
@@ -644,6 +729,20 @@ if(USE_CUDA)
target_link_libraries(_lightgbm PRIVATE ${histograms})
endif()

if(USE_ROCM)
Collaborator: Can we merge CUDA and HIP with if(USE_CUDA OR USE_ROCM) here?

# histograms are list of object libraries. Linking object library to other
# object libraries only gets usage requirements, the linked objects won't be
# used. Thus we have to call target_link_libraries on final targets here.
if(BUILD_CLI)
target_link_libraries(lightgbm PRIVATE ${histograms})
endif()
target_link_libraries(_lightgbm PRIVATE ${histograms})
endif()

if(USE_HDFS)
Collaborator: Remove this.

target_link_libraries(lightgbm_objs PUBLIC ${HDFS_CXX_LIBRARIES})
endif()

if(WIN32)
if(MINGW OR CYGWIN)
target_link_libraries(lightgbm_objs PUBLIC ws2_32 iphlpapi)
50 changes: 46 additions & 4 deletions include/LightGBM/cuda/cuda_algorithms.hpp
@@ -1,6 +1,7 @@
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/

#ifndef LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_
@@ -14,6 +15,7 @@

#include <LightGBM/bin.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/cuda/cuda_rocm_interop.h>
#include <LightGBM/utils/log.h>

#include <algorithm>
@@ -174,7 +176,7 @@ __device__ __forceinline__ void GlobalMemoryPrefixSum(T* array, const size_t len
for (size_t index = start; index < end; ++index) {
thread_sum += array[index];
}
__shared__ T shared_mem[32];
__shared__ T shared_mem[WARPSIZE];
const T thread_base = ShufflePrefixSumExclusive<T>(thread_sum, shared_mem);
if (start < end) {
array[start] += thread_base;
@@ -483,7 +485,7 @@ __device__ void ShuffleSortedPrefixSumDevice(const VAL_T* in_values,
const INDEX_T* sorted_indices,
REDUCE_VAL_T* out_values,
const INDEX_T num_data) {
__shared__ REDUCE_VAL_T shared_buffer[32];
__shared__ REDUCE_VAL_T shared_buffer[WARPSIZE];
const INDEX_T num_data_per_thread = (num_data + static_cast<INDEX_T>(blockDim.x) - 1) / static_cast<INDEX_T>(blockDim.x);
const INDEX_T start = num_data_per_thread * static_cast<INDEX_T>(threadIdx.x);
const INDEX_T end = min(start + num_data_per_thread, num_data);
@@ -572,8 +574,48 @@ __device__ VAL_T PercentileDevice(const VAL_T* values,
INDEX_T* indices,
REDUCE_WEIGHT_T* weights_prefix_sum,
const double alpha,
const INDEX_T len);

const INDEX_T len) {
  if (len <= 1) {
    return values[0];
  }
  if (!USE_WEIGHT) {
    BitonicArgSortDevice<VAL_T, INDEX_T, ASCENDING, BITONIC_SORT_NUM_ELEMENTS / 2, 10>(values, indices, len);
    const double float_pos = (1.0f - alpha) * len;
    const INDEX_T pos = static_cast<INDEX_T>(float_pos);
    if (pos < 1) {
      return values[indices[0]];
    } else if (pos >= len) {
      return values[indices[len - 1]];
    } else {
      const double bias = float_pos - pos;
      const VAL_T v1 = values[indices[pos - 1]];
      const VAL_T v2 = values[indices[pos]];
      return static_cast<VAL_T>(v1 - (v1 - v2) * bias);
    }
  } else {
    BitonicArgSortDevice<VAL_T, INDEX_T, ASCENDING, BITONIC_SORT_NUM_ELEMENTS / 4, 9>(values, indices, len);
    ShuffleSortedPrefixSumDevice<WEIGHT_T, REDUCE_WEIGHT_T, INDEX_T>(weights, indices, weights_prefix_sum, len);
    const REDUCE_WEIGHT_T threshold = weights_prefix_sum[len - 1] * (1.0f - alpha);
    __shared__ INDEX_T pos;
    if (threadIdx.x == 0) {
      pos = len;
    }
    __syncthreads();
    for (INDEX_T index = static_cast<INDEX_T>(threadIdx.x); index < len; index += static_cast<INDEX_T>(blockDim.x)) {
      if (weights_prefix_sum[index] > threshold && (index == 0 || weights_prefix_sum[index - 1] <= threshold)) {
        pos = index;
      }
    }
    __syncthreads();
    pos = min(pos, len - 1);
    if (pos == 0 || pos == len - 1) {
      return values[pos];
    }
    const VAL_T v1 = values[indices[pos - 1]];
    const VAL_T v2 = values[indices[pos]];
    return static_cast<VAL_T>(v1 - (v1 - v2) * (threshold - weights_prefix_sum[pos - 1]) / (weights_prefix_sum[pos] - weights_prefix_sum[pos - 1]));
  }
}

} // namespace LightGBM

20 changes: 20 additions & 0 deletions include/LightGBM/cuda/cuda_rocm_interop.h
@@ -0,0 +1,20 @@
/*!
* Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifdef USE_CUDA

#if defined(__HIP_PLATFORM_AMD__) || defined(__HIP__)
// ROCm doesn't have __shfl_down_sync, only __shfl_down without mask.
// Since mask is full 0xffffffff, we can use __shfl_down instead.
#define __shfl_down_sync(mask, val, offset) __shfl_down(val, offset)
#define __shfl_up_sync(mask, val, offset) __shfl_up(val, offset)
// ROCm warpSize is constexpr and is either 32 or 64 depending on gfx arch.
#define WARPSIZE warpSize
// ROCm doesn't have atomicAdd_block, but it should be semantically the same as atomicAdd
#define atomicAdd_block atomicAdd
#else
// CUDA warpSize is not a constexpr, but always 32
#define WARPSIZE 32
#endif

#endif
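
For illustration only (a minimal sketch, not part of this PR's diff): device code written against __shfl_down_sync and WARPSIZE compiles unchanged on both back ends once this header is included, because the ROCm branch rewrites the masked shuffle to __shfl_down and resolves WARPSIZE to the constexpr warpSize. The function name WarpReduceSum is invented for the example, and the sketch assumes USE_CUDA is also defined in ROCm builds, as the CMakeLists.txt changes above arrange via add_definitions(-DUSE_CUDA).

#include <LightGBM/cuda/cuda_rocm_interop.h>

// Warp-level sum: every lane contributes `val`; after the loop, lane 0 holds
// the total. WARPSIZE is 32 on CUDA and the 32- or 64-lane warpSize on ROCm.
__device__ __forceinline__ float WarpReduceSum(float val) {
  for (int offset = WARPSIZE / 2; offset > 0; offset >>= 1) {
    // The mask is always the full 0xffffffff, so the ROCm mapping to the
    // maskless __shfl_down is semantically equivalent.
    val += __shfl_down_sync(0xffffffff, val, offset);
  }
  return val;
}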
11 changes: 6 additions & 5 deletions include/LightGBM/cuda/cuda_split_info.hpp
@@ -2,6 +2,7 @@
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/

#ifdef USE_CUDA
@@ -40,24 +41,24 @@ class CUDASplitInfo {
uint32_t* cat_threshold = nullptr;
int* cat_threshold_real = nullptr;

__device__ CUDASplitInfo() {
__host__ __device__ CUDASplitInfo() {
num_cat_threshold = 0;
cat_threshold = nullptr;
cat_threshold_real = nullptr;
}

__device__ ~CUDASplitInfo() {
__host__ __device__ ~CUDASplitInfo() {
if (num_cat_threshold > 0) {
if (cat_threshold != nullptr) {
cudaFree(cat_threshold);
CUDASUCCESS_OR_FATAL(cudaFree(cat_threshold));
}
if (cat_threshold_real != nullptr) {
cudaFree(cat_threshold_real);
CUDASUCCESS_OR_FATAL(cudaFree(cat_threshold_real));
}
}
}

__device__ CUDASplitInfo& operator=(const CUDASplitInfo& other) {
__host__ __device__ CUDASplitInfo& operator=(const CUDASplitInfo& other) {
is_valid = other.is_valid;
leaf_index = other.leaf_index;
gain = other.gain;
3 changes: 2 additions & 1 deletion include/LightGBM/cuda/vector_cudahost.h
@@ -1,6 +1,7 @@
/*!
* Copyright (c) 2020 IBM Corporation, Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
* Modifications Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*/
#ifndef LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_
#define LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_
@@ -45,7 +46,7 @@ struct CHAllocator {
n = SIZE_ALIGNED(n);
#ifdef USE_CUDA
if (LGBM_config_::current_device == lgbm_device_cuda) {
cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable);
cudaError_t ret = cudaHostAlloc(reinterpret_cast<void**>(&ptr), n*sizeof(T), cudaHostAllocPortable);
if (ret != cudaSuccess) {
Log::Warning("Defaulting to malloc in CHAllocator!!!");
ptr = reinterpret_cast<T*>(_mm_malloc(n*sizeof(T), 16));
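
As context for this last hunk (an illustrative sketch, not part of the diff): CHAllocator is used as a drop-in std::vector allocator whose allocate() prefers pinned host memory via cudaHostAlloc and falls back to aligned malloc, so the reinterpret_cast here only makes the existing call explicit. Assuming CHAllocator lives in namespace LightGBM as the header suggests, usage looks roughly like this; the variable name and size are invented for the example.

#include <vector>
#include <LightGBM/cuda/vector_cudahost.h>

// Element storage comes from cudaHostAlloc(..., cudaHostAllocPortable) when a
// CUDA/HIP device is active (pinned memory, faster host<->device copies), and
// falls back to 16-byte-aligned malloc otherwise, per allocate() above.
std::vector<float, LightGBM::CHAllocator<float>> gradients(1 << 20);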