Prepare 0.3 release #10

Merged · 26 commits · Apr 11, 2025

Commits
5490ea7
Make code compile with HIP on Lumi
stijnh Nov 1, 2024
9385655
Changes to make code compile under HIPRTC
stijnh Nov 18, 2024
de62ad0
Merge branch 'hip-support' into dev
stijnh Nov 18, 2024
4fca9cb
Add approximation functions
stijnh Nov 18, 2024
ae0e6b1
Simplify how policies are implemented internally
stijnh Nov 18, 2024
f89cf98
Rename FP16 primitive names: `__half` to `half_t` and `__nv_bfloat16`…
stijnh Nov 18, 2024
014e32f
Implement approximation for `pow`
stijnh Nov 18, 2024
003ce36
Add `apply_fallback_impl` struct
stijnh Nov 18, 2024
76501fd
Add `approx_*` functions
stijnh Nov 18, 2024
ba7356a
Fix incorrect definition of `KERNEL_FLOAT_POLICY`
stijnh Nov 20, 2024
4231f44
Add `Accuracy` parameter to `zip_common`
stijnh Nov 20, 2024
f5edbc8
Add `add_mul` to `vector`
stijnh Nov 20, 2024
e6c8a7c
Overwrite `fast_policy` for FP16 and BF16
stijnh Nov 20, 2024
5c859b9
`kernel_float::approx::sqrt(0)` now returns 0
stijnh Nov 20, 2024
76c695a
Fix compilation error on HIP due to `KERNEL_FLOAT_FAST_F32_MAP`
stijnh Nov 26, 2024
d8a53a3
Remove call to `__exp2f` since it does not exist
stijnh Nov 26, 2024
a2b08a5
Change github workflow to compile for all architectures
stijnh Nov 26, 2024
846de1f
Fix bug in `approx::exp(bfloat16)` for HIP
stijnh Dec 2, 2024
f94bd10
Fix several issues related to HIP compilation for bfloat16
stijnh Dec 2, 2024
2730789
`vector_ptr` now requires alignment in bytes instead of elements
stijnh Jan 27, 2025
1611258
Remove `apply_fallback_impl`
stijnh Jan 27, 2025
c44c6ed
Fix incorrect type name in `approx.h`
stijnh Jan 27, 2025
212efee
Fix incorrect type name in `binops.h`
stijnh Jan 27, 2025
09dc820
Change `AssignConversionProxy` to also accept rvalues
stijnh Jan 27, 2025
126737c
Change `ops::cast` to get rid of `cast_float_fallback`
stijnh Jan 27, 2025
4d18563
Update `Jimver/cuda-toolkit` workflow action to newer version
stijnh Apr 11, 2025
4 changes: 2 additions & 2 deletions .github/workflows/cmake-action.yml
@@ -19,7 +19,7 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: Jimver/cuda-toolkit@v0.2.11
- uses: Jimver/cuda-toolkit@v0.2.22
id: cuda-toolkit
with:
method: network
@@ -33,7 +33,7 @@ jobs:
- name: Configure CMake
# Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
# See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DKERNEL_FLOAT_BUILD_TEST=1 -DKERNEL_FLOAT_BUILD_EXAMPLE=1
run: CUDAARCHS=all cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DKERNEL_FLOAT_BUILD_TEST=1 -DKERNEL_FLOAT_BUILD_EXAMPLE=1

- name: Build
# Build your program with the given configuration
10 changes: 5 additions & 5 deletions .github/workflows/cmake.yml
@@ -13,16 +13,16 @@ jobs:
build-cuda:
uses: ./.github/workflows/cmake-action.yml
with:
cuda-version: "12.2.0"
cuda-version: "12.8.0"

build-cuda-11-7:
build-cuda-12-6:
needs: build-cuda
uses: ./.github/workflows/cmake-action.yml
with:
cuda-version: "11.7.0"
cuda-version: "12.6.0"

build-cuda-12-0:
build-cuda-12-5:
needs: build-cuda
uses: ./.github/workflows/cmake-action.yml
with:
cuda-version: "12.0.0"
cuda-version: "12.5.0"
31 changes: 29 additions & 2 deletions CMakeLists.txt
@@ -1,17 +1,44 @@
cmake_minimum_required(VERSION 3.20)

set (PROJECT_NAME kernel_float)
project(${PROJECT_NAME} CXX CUDA)
project(${PROJECT_NAME} LANGUAGES CXX)

set(CMAKE_C_STANDARD 11)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Validate and enable the appropriate language
if (NOT DEFINED KERNEL_FLOAT_LANGUAGE)
set(KERNEL_FLOAT_LANGUAGE "CUDA")
endif()

if (KERNEL_FLOAT_LANGUAGE STREQUAL "CUDA")
enable_language(CUDA)
set(KERNEL_FLOAT_LANGUAGE_CUDA ON)
elseif (KERNEL_FLOAT_LANGUAGE STREQUAL "HIP")
enable_language(HIP)
set(KERNEL_FLOAT_LANGUAGE_HIP ON)
else()
message(FATAL_ERROR "KERNEL_FLOAT_LANGUAGE must be either 'HIP' or 'CUDA'")
endif()

# Create an interface library for kernel_float
add_library(${PROJECT_NAME} INTERFACE)
target_include_directories(${PROJECT_NAME} INTERFACE "${PROJECT_SOURCE_DIR}/include")

# Optionally build tests and examples if the corresponding flags are set
option(KERNEL_FLOAT_BUILD_TEST "Build kernel float tests" OFF)
option(KERNEL_FLOAT_BUILD_EXAMPLE "Build kernel float examples" OFF)

if (KERNEL_FLOAT_BUILD_TEST)
add_subdirectory(tests)
endif()

if (KERNEL_FLOAT_BUILD_EXAMPLE)
add_subdirectory(examples)
endif()

# Display configuration
message(STATUS "=== Kernel Float ===")
message(STATUS "Using GPU Language: ${KERNEL_FLOAT_LANGUAGE}")
message(STATUS "Building Tests: ${KERNEL_FLOAT_BUILD_TEST}")
message(STATUS "Building Examples: ${KERNEL_FLOAT_BUILD_EXAMPLE}")
15 changes: 8 additions & 7 deletions README.md
@@ -9,12 +9,12 @@
![GitHub Repo stars](https://img.shields.io/github/stars/KernelTuner/kernel_float?style=social)


_Kernel Float_ is a header-only library for CUDA that simplifies working with vector types and reduced precision floating-point arithmetic in GPU code.
_Kernel Float_ is a header-only library for CUDA/HIP that simplifies working with vector types and reduced precision floating-point arithmetic in GPU code.


## Summary

CUDA natively offers several reduced precision floating-point types (`__half`, `__nv_bfloat16`, `__nv_fp8_e4m3`, `__nv_fp8_e5m2`)
CUDA/HIP natively offers several reduced precision floating-point types (`__half`, `__nv_bfloat16`, `__nv_fp8_e4m3`, `__nv_fp8_e5m2`)
and vector types (e.g., `__half2`, `__nv_fp8x4_e4m3`, `float3`).
However, working with these types is cumbersome:
mathematical operations require intrinsics (e.g., `__hadd2` performs addition for `__half2`),
@@ -24,9 +24,9 @@ and some functionality is missing (e.g., one cannot convert a `__half` to `__nv_
_Kernel Float_ resolves this by offering a single data type `kernel_float::vec<T, N>` that stores `N` elements of type `T`.
Internally, the data is stored as a fixed-sized array of elements.
Operator overloading (like `+`, `*`, `&&`) has been implemented such that the most optimal intrinsic for the available types is selected automatically.
Many mathetical functions (like `log`, `exp`, `sin`) and common operations (such as `sum`, `range`, `for_each`) are also available.
Many mathematical functions (like `log`, `exp`, `sin`) and common operations (such as `sum`, `range`, `for_each`) are also available.

By using this library, developers can avoid the complexity of working with reduced precision floating-point types in CUDA and focus on their applications.
Using Kernel Float, developers avoid the complexity of reduced precision floating-point types in CUDA and can focus on their applications.
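To give a feel for the API (a minimal sketch: `exp` and `sum` are taken from the function families listed above, the kernel itself is purely illustrative):

```cpp
#include "kernel_float.h"
namespace kf = kernel_float;

__global__ void demo(const kf::vec<float, 4>* input, float* totals) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    kf::vec<float, 4> x = input[i];
    // Overloaded operators and math functions apply element-wise to the vector.
    kf::vec<float, 4> y = kf::exp(x) + x * x;
    // Reductions such as sum collapse the vector into a single scalar.
    totals[i] = kf::sum(y);
}
```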


## Features
@@ -40,6 +40,7 @@ In a nutshell, _Kernel Float_ offers the following features:
* Easy integration as a single header file.
* Written for C++17.
* Compatible with NVCC (NVIDIA Compiler) and NVRTC (NVIDIA Runtime Compilation).
* Compatible with HIPCC (AMD HIP Compiler).


## Example
@@ -49,7 +50,7 @@ Check out the [examples](https://github.com/KernelTuner/kernel_float/tree/master

Below shows a simple example of a CUDA kernel that adds a `constant` to the `input` array and writes the results to the `output` array.
Each thread processes two elements.
Notice how easy it would be change the precision (for example, `double` to `half`) or the vector size (for example, 4 instead of 2 items per thread).
Notice how easy it would be to change the precision (for example, `double` to `half`) or the vector size (for example, 4 instead of 2 items per thread).


```cpp
@@ -63,14 +64,14 @@ __global__ void kernel(const kf::vec<half, 2>* input, float constant, kf::vec<fl

```

Here is how the same kernel would like without Kernel Float.
Here is how the same kernel would look for CUDA without Kernel Float.

```cpp
__global__ void kernel(const __half* input, float constant, float* output) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
__half in0 = input[2 * i + 0];
__half in1 = input[2 * i + 1];
__half2 a = __halves2half2(in0, int1);
__half2 a = __halves2half2(in0, in1);
float b = float(constant);
__half c = __float2half(b);
__half2 d = __half2half2(c);
21 changes: 13 additions & 8 deletions docs/guides/accuracy.md
@@ -25,13 +25,13 @@ kf::vec<float, 4> c = kf::fast_rcp(x);
kf::vec<float, 4> d = kf::fast_div(a, b);
```

These functions are only functional for 32-bit and 16-bit floats.
These functions are only functional for 32-bit and 16-bit floats.
For other input types, the operation falls back to the regular version.

## Approximate Math

For 16-bit floats, several approximate functions are provided.
These use approximations (typically low-degree polynomials) to calculate rough estimates of the functions.
For 16-bit floats, several approximate functions are provided.
These use approximations (typically low-degree polynomials) to calculate rough estimates of the functions.
This can be very fast but also less accurate.


@@ -69,14 +69,15 @@ kf::vec<half, 4> a = kf::approx_sin<3>(x);

## Tuning Accuracy Level

Many functions in Kernel Float accept an additional Accuracy option as a template parameter.
Many functions in Kernel Float accept an additional `Accuracy` option as a template parameter.
This allows you to tune the accuracy level without changing the function name.

There are four possible values for this parameter:
There are five possible values for this parameter:

- `kf::accurate_policy`: Use the most accurate version of the function available.
- `kf::fast_policy`: Use the "fast math" version.
- `kf::approx_policy<N>`: Use the approximate version with degree `N`.
- `kf::approx_level_policy<N>`: Use the approximate version with accuracy level `N` (higher is more accurate).
- `kf::approx_policy`: Use the approximate version with a default accuracy level.
- `kf::default_policy`: Use a global default policy (see the next section).

For example, consider this code:
@@ -97,15 +98,19 @@ kf::vec<float, 2> c = kf::cos<kf::accurate_policy>(input);
kf::vec<float, 2> d = kf::cos<kf::fast_policy>(input);

// Use the approximate policy
kf::vec<float, 2> e = kf::cos<kf::approx_policy<3>>(input);
kf::vec<float, 2> e = kf::cos<kf::approx_policy>(input);

// Use the approximate policy with degree 3 polynomial.
kf::vec<float, 2> f = kf::cos<kf::approx_level_policy<3>>(input);

// You can use aliases to define your own policy
using my_own_policy = kf::fast_policy;
kf::vec<float, 2> f = kf::cos<my_own_policy>(input);
kf::vec<float, 2> g = kf::cos<my_own_policy>(input);
```

## Setting `default_policy`

If no policy is explicitly set, any function uses `kf::default_policy`.
By default, `kf::default_policy` is set to `kf::accurate_policy`.

Set the preprocessor option `KERNEL_FLOAT_FAST_MATH=1` to change the default policy to `kf::fast_policy`.
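For instance, a minimal sketch (assuming the macro is defined before the header is first included; the kernel itself is illustrative):

```cpp
// Make kf::default_policy resolve to kf::fast_policy for this translation unit,
// e.g. instead of passing -DKERNEL_FLOAT_FAST_MATH=1 on the command line.
#define KERNEL_FLOAT_FAST_MATH 1
#include "kernel_float.h"
namespace kf = kernel_float;

__global__ void fast_cos(const kf::vec<float, 2>* input, kf::vec<float, 2>* output) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // No explicit policy, so the (now fast) default policy applies.
    output[i] = kf::cos(input[i]);
}
```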
13 changes: 13 additions & 0 deletions example.cu
@@ -0,0 +1,13 @@
#include "kernel_float.h"
#include <cuda_fp16.h>

namespace kf = kernel_float;

__global__ void kernel(
kf::vec_ptr<half, 4, const __nv_fp8_e5m2> input,
float constant,
kf::vec_ptr<half, 4> output
) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
output(i) = input[i] + kf::cast<half>(constant);
}
22 changes: 22 additions & 0 deletions examples/hip_compat.h
@@ -0,0 +1,22 @@
#pragma once

/**
* This header file provides a mapping from CUDA-specific function names and types to their equivalent HIP
* counterparts, allowing for cross-platform development between CUDA and HIP. By including this header, code
* originally written for CUDA can be compiled with the HIP compiler (hipcc) by automatically replacing CUDA API
* calls with their HIP equivalents.
*/
#ifdef __HIPCC__
#define cudaError_t hipError_t
#define cudaSuccess hipSuccess
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaMalloc hipMalloc
#define cudaFree hipFree
#define cudaMemcpy hipMemcpy
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyDefault hipMemcpyDefault
#define cudaMemset hipMemset
#define cudaSetDevice hipSetDevice
#define cudaDeviceSynchronize hipDeviceSynchronize
#endif
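For example, a small host-side snippet written against the CUDA runtime API can then also build with hipcc (a sketch; it assumes the CUDA or HIP runtime header is made available by the compiler driver, as in the examples in this PR):

```cpp
#include "hip_compat.h"
#include <cstdio>
#include <cstdlib>

int main() {
    float* buffer = nullptr;
    // Under hipcc these names expand to hipMalloc / hipGetErrorString / hipFree;
    // under nvcc they are the regular CUDA runtime calls.
    cudaError_t err = cudaMalloc(&buffer, 1024 * sizeof(float));
    if (err != cudaSuccess) {
        fprintf(stderr, "allocation failed: %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }
    cudaFree(buffer);
    return 0;
}
```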
18 changes: 12 additions & 6 deletions examples/pi/CMakeLists.txt
@@ -1,12 +1,18 @@
cmake_minimum_required(VERSION 3.17)
cmake_minimum_required(VERSION 3.20)

set (PROJECT_NAME kernel_float_pi)
project(${PROJECT_NAME} LANGUAGES CXX CUDA)
set (CMAKE_CXX_STANDARD 17)
project(${PROJECT_NAME} LANGUAGES CXX)

set (CMAKE_CXX_STANDARD 17)
add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
target_link_libraries(${PROJECT_NAME} kernel_float)
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")

find_package(CUDA REQUIRED)
target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
if(${KERNEL_FLOAT_LANGUAGE_CUDA})
find_package(CUDA REQUIRED)
target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
endif()

if(${KERNEL_FLOAT_LANGUAGE_HIP})
set_source_files_properties("${PROJECT_SOURCE_DIR}/main.cu" PROPERTIES LANGUAGE HIP)
endif()
7 changes: 4 additions & 3 deletions examples/pi/main.cu
@@ -1,6 +1,7 @@
#include <stdio.h>
#include <stdlib.h>

#include "../hip_compat.h"
#include "kernel_float.h"

#define CUDA_CHECK(call) \
@@ -9,12 +9,12 @@
if (__err != cudaSuccess) { \
fprintf( \
stderr, \
"CUDA error at %s:%d code=%d(%s) \"%s\" \n", \
"CUDA error at %s:%d (%s): %s (code %d) \n", \
__FILE__, \
__LINE__, \
__err, \
#call, \
cudaGetErrorString(__err), \
#call); \
__err); \
exit(EXIT_FAILURE); \
} \
} while (0)
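For context, a typical call site wraps a runtime API call in the macro (illustrative usage, not part of this diff):

```cpp
// On failure, CUDA_CHECK prints the file, line, stringified call, error string
// and error code, then terminates the program.
void allocate_device_buffer(float** d_data, size_t n) {
    CUDA_CHECK(cudaMalloc(d_data, n * sizeof(float)));
    CUDA_CHECK(cudaMemset(*d_data, 0, n * sizeof(float)));
}
```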
16 changes: 11 additions & 5 deletions examples/vector_add/CMakeLists.txt
@@ -1,12 +1,18 @@
cmake_minimum_required(VERSION 3.17)

set (PROJECT_NAME kernel_float_vecadd)
project(${PROJECT_NAME} LANGUAGES CXX CUDA)
set (CMAKE_CXX_STANDARD 17)
project(${PROJECT_NAME} LANGUAGES CXX)

set (CMAKE_CXX_STANDARD 17)
add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
target_link_libraries(${PROJECT_NAME} kernel_float)
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")

find_package(CUDA REQUIRED)
target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
if(${KERNEL_FLOAT_LANGUAGE_HIP})
set_source_files_properties("${PROJECT_SOURCE_DIR}/main.cu" PROPERTIES LANGUAGE HIP)
endif()

if(${KERNEL_FLOAT_LANGUAGE_CUDA})
find_package(CUDA REQUIRED)
target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
endif()
3 changes: 2 additions & 1 deletion examples/vector_add/main.cu
@@ -3,6 +3,7 @@
#include <stdexcept>
#include <vector>

#include "../hip_compat.h"
#include "kernel_float.h"
namespace kf = kernel_float;

@@ -21,7 +22,7 @@ __global__ void my_kernel(
int i = blockIdx.x * blockDim.x + threadIdx.x;

if (i * N < length) {
output(i) = kf::fma(input[i], input[i], kf::cast<__half>(constant));
output[i] = kf::fma(input[i], input[i], kf::cast<half>(constant));
}
}

16 changes: 11 additions & 5 deletions examples/vector_add_tiling/CMakeLists.txt
@@ -1,12 +1,18 @@
cmake_minimum_required(VERSION 3.17)

set (PROJECT_NAME kernel_float_vecadd_tiling)
project(${PROJECT_NAME} LANGUAGES CXX CUDA)
set (CMAKE_CXX_STANDARD 17)
project(${PROJECT_NAME} LANGUAGES CXX)

set (CMAKE_CXX_STANDARD 17)
add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
target_link_libraries(${PROJECT_NAME} kernel_float)
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")

find_package(CUDA REQUIRED)
target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
if(${KERNEL_FLOAT_LANGUAGE_HIP})
set_source_files_properties("${PROJECT_SOURCE_DIR}/main.cu" PROPERTIES LANGUAGE HIP)
endif()

if(${KERNEL_FLOAT_LANGUAGE_CUDA})
find_package(CUDA REQUIRED)
target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
endif()
1 change: 1 addition & 0 deletions examples/vector_add_tiling/main.cu
@@ -3,6 +3,7 @@
#include <stdexcept>
#include <vector>

#include "../hip_compat.h"
#include "kernel_float.h"
#include "kernel_float/tiling.h"
namespace kf = kernel_float;
1 change: 1 addition & 0 deletions include/kernel_float.h
@@ -1,6 +1,7 @@
#ifndef KERNEL_FLOAT_H
#define KERNEL_FLOAT_H

#include "kernel_float/approx.h"
#include "kernel_float/base.h"
#include "kernel_float/bf16.h"
#include "kernel_float/binops.h"