Prepare 0.3 release #10

Merged · 26 commits · Apr 11, 2025

Commits
5490ea7
Make code compile with HIP on Lumi
stijnh Nov 1, 2024
9385655
Changes to make code compile under HIPRTC
stijnh Nov 18, 2024
de62ad0
Merge branch 'hip-support' into dev
stijnh Nov 18, 2024
4fca9cb
Add approximation functions
stijnh Nov 18, 2024
ae0e6b1
Simplify how policies are implemented internally
stijnh Nov 18, 2024
f89cf98
Rename FP16 primitive names: `__half` to `half_t` and `__nv_bfloat16`…
stijnh Nov 18, 2024
014e32f
Implement approximation for `pow`
stijnh Nov 18, 2024
003ce36
Add `apply_fallback_impl` struct
stijnh Nov 18, 2024
76501fd
Add `approx_*` functions
stijnh Nov 18, 2024
ba7356a
Fix incorrect definition of `KERNEL_FLOAT_POLICY`
stijnh Nov 20, 2024
4231f44
Add `Accuracy` parameter to `zip_common`
stijnh Nov 20, 2024
f5edbc8
Add `add_mul` to `vector`
stijnh Nov 20, 2024
e6c8a7c
Overwrite `fast_policy` for FP16 and BF16
stijnh Nov 20, 2024
5c859b9
`kernel_float::approx::sqrt(0)` now returns 0
stijnh Nov 20, 2024
76c695a
Fix compilation error on HIP due to `KERNEL_FLOAT_FAST_F32_MAP`
stijnh Nov 26, 2024
d8a53a3
Remove call to `__exp2f` since it does not exist
stijnh Nov 26, 2024
a2b08a5
Change github workflow to compile for all architectures
stijnh Nov 26, 2024
846de1f
Fix bug in `approx::exp(bfloat16)` for HIP
stijnh Dec 2, 2024
f94bd10
Fix several issues related to HIP compilation for bfloat16
stijnh Dec 2, 2024
2730789
`vector_ptr` now requires alignment in bytes instead of elements
stijnh Jan 27, 2025
1611258
Remove `apply_fallback_impl`
stijnh Jan 27, 2025
c44c6ed
Fix incorrect type name in `approx.h`
stijnh Jan 27, 2025
212efee
Fix incorrect type name in `binops.h`
stijnh Jan 27, 2025
09dc820
Change `AssignConversionProxy` to also accept rvalues
stijnh Jan 27, 2025
126737c
Change `ops::cast` to get rid of `cast_float_fallback`
stijnh Jan 27, 2025
4d18563
Update `Jimver/cuda-toolkit` workflow action to newer version
stijnh Apr 11, 2025
4 changes: 2 additions & 2 deletions .github/workflows/cmake-action.yml
@@ -19,7 +19,7 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: Jimver/cuda-toolkit@v0.2.11
- uses: Jimver/cuda-toolkit@v0.2.22
id: cuda-toolkit
with:
method: network
@@ -33,7 +33,7 @@ jobs:
- name: Configure CMake
# Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
# See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DKERNEL_FLOAT_BUILD_TEST=1 -DKERNEL_FLOAT_BUILD_EXAMPLE=1
run: CUDAARCHS=all cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DKERNEL_FLOAT_BUILD_TEST=1 -DKERNEL_FLOAT_BUILD_EXAMPLE=1

- name: Build
# Build your program with the given configuration
10 changes: 5 additions & 5 deletions .github/workflows/cmake.yml
@@ -13,16 +13,16 @@ jobs:
build-cuda:
uses: ./.github/workflows/cmake-action.yml
with:
cuda-version: "12.2.0"
cuda-version: "12.8.0"

build-cuda-11-7:
build-cuda-12-6:
needs: build-cuda
uses: ./.github/workflows/cmake-action.yml
with:
cuda-version: "11.7.0"
cuda-version: "12.6.0"

build-cuda-12-0:
build-cuda-12-5:
needs: build-cuda
uses: ./.github/workflows/cmake-action.yml
with:
cuda-version: "12.0.0"
cuda-version: "12.5.0"
31 changes: 29 additions & 2 deletions CMakeLists.txt
@@ -1,17 +1,44 @@
cmake_minimum_required(VERSION 3.20)

set (PROJECT_NAME kernel_float)
project(${PROJECT_NAME} CXX CUDA)
project(${PROJECT_NAME} LANGUAGES CXX)

set(CMAKE_C_STANDARD 11)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Validate and enable the appropriate language
if (NOT DEFINED KERNEL_FLOAT_LANGUAGE)
set(KERNEL_FLOAT_LANGUAGE "CUDA")
endif()

if (KERNEL_FLOAT_LANGUAGE STREQUAL "CUDA")
enable_language(CUDA)
set(KERNEL_FLOAT_LANGUAGE_CUDA ON)
elseif (KERNEL_FLOAT_LANGUAGE STREQUAL "HIP")
enable_language(HIP)
set(KERNEL_FLOAT_LANGUAGE_HIP ON)
else()
message(FATAL_ERROR "KERNEL_FLOAT_LANGUAGE must be either 'HIP' or 'CUDA'")
endif()

# Create an interface library for kernel_float
add_library(${PROJECT_NAME} INTERFACE)
target_include_directories(${PROJECT_NAME} INTERFACE "${PROJECT_SOURCE_DIR}/include")

# Optionally build tests and examples if the corresponding flags are set
option(KERNEL_FLOAT_BUILD_TEST "Build kernel float tests" OFF)
option(KERNEL_FLOAT_BUILD_EXAMPLE "Build kernel float examples" OFF)

if (KERNEL_FLOAT_BUILD_TEST)
add_subdirectory(tests)
endif()

if (KERNEL_FLOAT_BUILD_EXAMPLE)
add_subdirectory(examples)
endif()

# Display configuration
message(STATUS "=== Kernel Float ===")
message(STATUS "Using GPU Language: ${KERNEL_FLOAT_LANGUAGE}")
message(STATUS "Building Tests: ${KERNEL_FLOAT_BUILD_TEST}")
message(STATUS "Building Examples: ${KERNEL_FLOAT_BUILD_EXAMPLE}")
15 changes: 8 additions & 7 deletions README.md
@@ -9,12 +9,12 @@
![GitHub Repo stars](https://img.shields.io/github/stars/KernelTuner/kernel_float?style=social)


_Kernel Float_ is a header-only library for CUDA that simplifies working with vector types and reduced precision floating-point arithmetic in GPU code.
_Kernel Float_ is a header-only library for CUDA/HIP that simplifies working with vector types and reduced precision floating-point arithmetic in GPU code.


## Summary

CUDA natively offers several reduced precision floating-point types (`__half`, `__nv_bfloat16`, `__nv_fp8_e4m3`, `__nv_fp8_e5m2`)
CUDA/HIP natively offers several reduced precision floating-point types (`__half`, `__nv_bfloat16`, `__nv_fp8_e4m3`, `__nv_fp8_e5m2`)
and vector types (e.g., `__half2`, `__nv_fp8x4_e4m3`, `float3`).
However, working with these types is cumbersome:
mathematical operations require intrinsics (e.g., `__hadd2` performs addition for `__half2`),
@@ -24,9 +24,9 @@ and some functionality is missing (e.g., one cannot convert a `__half` to `__nv_
_Kernel Float_ resolves this by offering a single data type `kernel_float::vec<T, N>` that stores `N` elements of type `T`.
Internally, the data is stored as a fixed-sized array of elements.
Operator overloading (like `+`, `*`, `&&`) has been implemented such that the most optimal intrinsic for the available types is selected automatically.
Many mathetical functions (like `log`, `exp`, `sin`) and common operations (such as `sum`, `range`, `for_each`) are also available.
Many mathematical functions (like `log`, `exp`, `sin`) and common operations (such as `sum`, `range`, `for_each`) are also available.

By using this library, developers can avoid the complexity of working with reduced precision floating-point types in CUDA and focus on their applications.
Using Kernel Float, developers avoid the complexity of reduced precision floating-point types in CUDA and can focus on their applications.
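To give a feel for the API (a minimal sketch: `exp` and `sum` are taken from the function families listed above, the kernel itself is purely illustrative):

```cpp
#include "kernel_float.h"
namespace kf = kernel_float;

__global__ void demo(const kf::vec<float, 4>* input, float* totals) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    kf::vec<float, 4> x = input[i];
    // Overloaded operators and math functions apply element-wise to the vector.
    kf::vec<float, 4> y = kf::exp(x) + x * x;
    // Reductions such as sum collapse the vector into a single scalar.
    totals[i] = kf::sum(y);
}
```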


## Features
@@ -40,6 +40,7 @@ In a nutshell, _Kernel Float_ offers the following features:
* Easy integration as a single header file.
* Written for C++17.
* Compatible with NVCC (NVIDIA Compiler) and NVRTC (NVIDIA Runtime Compilation).
* Compatible with HIPCC (AMD HIP Compiler).


## Example
@@ -49,7 +50,7 @@ Check out the [examples](https://github.com/KernelTuner/kernel_float/tree/master

Below shows a simple example of a CUDA kernel that adds a `constant` to the `input` array and writes the results to the `output` array.
Each thread processes two elements.
Notice how easy it would be change the precision (for example, `double` to `half`) or the vector size (for example, 4 instead of 2 items per thread).
Notice how easy it would be to change the precision (for example, `double` to `half`) or the vector size (for example, 4 instead of 2 items per thread).


```cpp
@@ -63,14 +64,14 @@ __global__ void kernel(const kf::vec<half, 2>* input, float constant, kf::vec<fl

```

Here is how the same kernel would like without Kernel Float.
Here is how the same kernel would look for CUDA without Kernel Float.

```cpp
__global__ void kernel(const __half* input, float constant, float* output) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
__half in0 = input[2 * i + 0];
__half in1 = input[2 * i + 1];
__half2 a = __halves2half2(in0, int1);
__half2 a = __halves2half2(in0, in1);
float b = float(constant);
__half c = __float2half(b);
__half2 d = __half2half2(c);
21 changes: 13 additions & 8 deletions docs/guides/accuracy.md
@@ -25,13 +25,13 @@ kf::vec<float, 4> c = kf::fast_rcp(x);
kf::vec<float, 4> d = kf::fast_div(a, b);
```

These functions are only functional for 32-bit and 16-bit floats.
These functions are only functional for 32-bit and 16-bit floats.
For other input types, the operation falls back to the regular version.

## Approximate Math

For 16-bit floats, several approximate functions are provided.
These use approximations (typically low-degree polynomials) to calculate rough estimates of the functions.
For 16-bit floats, several approximate functions are provided.
These use approximations (typically low-degree polynomials) to calculate rough estimates of the functions.
This can be very fast but also less accurate.


@@ -69,14 +69,15 @@ kf::vec<half, 4> a = kf::approx_sin<3>(x);

## Tuning Accuracy Level

Many functions in Kernel Float accept an additional Accuracy option as a template parameter.
Many functions in Kernel Float accept an additional `Accuracy` option as a template parameter.
This allows you to tune the accuracy level without changing the function name.

There are four possible values for this parameter:
There are five possible values for this parameter:

- `kf::accurate_policy`: Use the most accurate version of the function available.
- `kf::fast_policy`: Use the "fast math" version.
- `kf::approx_policy<N>`: Use the approximate version with degree `N`.
- `kf::approx_level_policy<N>`: Use the approximate version with accuracy level `N` (higher is more accurate).
- `kf::approx_policy`: Use the approximate version with a default accuracy level.
- `kf::default_policy`: Use a global default policy (see the next section).

For example, consider this code:
@@ -97,15 +98,19 @@ kf::vec<float, 2> c = kf::cos<kf::accurate_policy>(input);
kf::vec<float, 2> d = kf::cos<kf::fast_policy>(input);

// Use the approximate policy
kf::vec<float, 2> e = kf::cos<kf::approx_policy<3>>(input);
kf::vec<float, 2> e = kf::cos<kf::approx_policy>(input);

// Use the approximate policy with degree 3 polynomial.
kf::vec<float, 2> f = kf::cos<kf::approx_level_policy<3>>(input);

// You can use aliases to define your own policy
using my_own_policy = kf::fast_policy;
kf::vec<float, 2> f = kf::cos<my_own_policy>(input);
kf::vec<float, 2> g = kf::cos<my_own_policy>(input);
```

## Setting `default_policy`

If no policy is explicitly set, any function uses `kf::default_policy`.
By default, `kf::default_policy` is set to `kf::accurate_policy`.

Set the preprocessor option `KERNEL_FLOAT_FAST_MATH=1` to change the default policy to `kf::fast_policy`.
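For instance, a minimal sketch (assuming the macro is defined before the header is first included; the kernel itself is illustrative):

```cpp
// Make kf::default_policy resolve to kf::fast_policy for this translation unit,
// e.g. instead of passing -DKERNEL_FLOAT_FAST_MATH=1 on the command line.
#define KERNEL_FLOAT_FAST_MATH 1
#include "kernel_float.h"
namespace kf = kernel_float;

__global__ void fast_cos(const kf::vec<float, 2>* input, kf::vec<float, 2>* output) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // No explicit policy, so the (now fast) default policy applies.
    output[i] = kf::cos(input[i]);
}
```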
13 changes: 13 additions & 0 deletions example.cu
@@ -0,0 +1,13 @@
#include "kernel_float.h"
#include <cuda_fp16.h>

namespace kf = kernel_float;

__global__ void kernel(
kf::vec_ptr<half, 4, const __nv_fp8_e5m2> input,
float constant,
kf::vec_ptr<half, 4> output
) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
output(i) = input[i] + kf::cast<half>(constant);
}
22 changes: 22 additions & 0 deletions examples/hip_compat.h
@@ -0,0 +1,22 @@
#pragma once

/**
* This header file provides a mapping from CUDA-specific function names and types to their equivalent HIP
* counterparts, allowing for cross-platform development between CUDA and HIP. By including this header, code
* originally written for CUDA can be compiled with the HIP compiler (hipcc) by automatically replacing CUDA API
* calls with their HIP equivalents.
*/
#ifdef __HIPCC__
#define cudaError_t hipError_t
#define cudaSuccess hipSuccess
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaMalloc hipMalloc
#define cudaFree hipFree
#define cudaMemcpy hipMemcpy
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyDefault hipMemcpyDefault
#define cudaMemset hipMemset
#define cudaSetDevice hipSetDevice
#define cudaDeviceSynchronize hipDeviceSynchronize
#endif
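For example, a small host-side snippet written against the CUDA runtime API can then also build with hipcc (a sketch; it assumes the CUDA or HIP runtime header is made available by the compiler driver, as in the examples in this PR):

```cpp
#include "hip_compat.h"
#include <cstdio>
#include <cstdlib>

int main() {
    float* buffer = nullptr;
    // Under hipcc these names expand to hipMalloc / hipGetErrorString / hipFree;
    // under nvcc they are the regular CUDA runtime calls.
    cudaError_t err = cudaMalloc(&buffer, 1024 * sizeof(float));
    if (err != cudaSuccess) {
        fprintf(stderr, "allocation failed: %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }
    cudaFree(buffer);
    return 0;
}
```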
18 changes: 12 additions & 6 deletions examples/pi/CMakeLists.txt
@@ -1,12 +1,18 @@
cmake_minimum_required(VERSION 3.17)
cmake_minimum_required(VERSION 3.20)

set (PROJECT_NAME kernel_float_pi)
project(${PROJECT_NAME} LANGUAGES CXX CUDA)
set (CMAKE_CXX_STANDARD 17)
project(${PROJECT_NAME} LANGUAGES CXX)

set (CMAKE_CXX_STANDARD 17)
add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
target_link_libraries(${PROJECT_NAME} kernel_float)
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")

find_package(CUDA REQUIRED)
target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
if(${KERNEL_FLOAT_LANGUAGE_CUDA})
find_package(CUDA REQUIRED)
target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
endif()

if(${KERNEL_FLOAT_LANGUAGE_HIP})
set_source_files_properties("${PROJECT_SOURCE_DIR}/main.cu" PROPERTIES LANGUAGE HIP)
endif()
7 changes: 4 additions & 3 deletions examples/pi/main.cu
@@ -1,6 +1,7 @@
#include <stdio.h>
#include <stdlib.h>

#include "../hip_compat.h"
#include "kernel_float.h"

#define CUDA_CHECK(call) \
@@ -9,12 +9,12 @@
if (__err != cudaSuccess) { \
fprintf( \
stderr, \
"CUDA error at %s:%d code=%d(%s) \"%s\" \n", \
"CUDA error at %s:%d (%s): %s (code %d) \n", \
__FILE__, \
__LINE__, \
__err, \
#call, \
cudaGetErrorString(__err), \
#call); \
__err); \
exit(EXIT_FAILURE); \
} \
} while (0)
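For context, a typical call site wraps a runtime API call in the macro (illustrative usage, not part of this diff):

```cpp
// On failure, CUDA_CHECK prints the file, line, stringified call, error string
// and error code, then terminates the program.
void allocate_device_buffer(float** d_data, size_t n) {
    CUDA_CHECK(cudaMalloc(d_data, n * sizeof(float)));
    CUDA_CHECK(cudaMemset(*d_data, 0, n * sizeof(float)));
}
```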
16 changes: 11 additions & 5 deletions examples/vector_add/CMakeLists.txt
@@ -1,12 +1,18 @@
cmake_minimum_required(VERSION 3.17)

set (PROJECT_NAME kernel_float_vecadd)
project(${PROJECT_NAME} LANGUAGES CXX CUDA)
set (CMAKE_CXX_STANDARD 17)
project(${PROJECT_NAME} LANGUAGES CXX)

set (CMAKE_CXX_STANDARD 17)
add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
target_link_libraries(${PROJECT_NAME} kernel_float)
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")

find_package(CUDA REQUIRED)
target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
if(${KERNEL_FLOAT_LANGUAGE_HIP})
set_source_files_properties("${PROJECT_SOURCE_DIR}/main.cu" PROPERTIES LANGUAGE HIP)
endif()

if(${KERNEL_FLOAT_LANGUAGE_CUDA})
find_package(CUDA REQUIRED)
target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
endif()
3 changes: 2 additions & 1 deletion examples/vector_add/main.cu
@@ -3,6 +3,7 @@
#include <stdexcept>
#include <vector>

#include "../hip_compat.h"
#include "kernel_float.h"
namespace kf = kernel_float;

@@ -21,7 +22,7 @@ __global__ void my_kernel(
int i = blockIdx.x * blockDim.x + threadIdx.x;

if (i * N < length) {
output(i) = kf::fma(input[i], input[i], kf::cast<__half>(constant));
output[i] = kf::fma(input[i], input[i], kf::cast<half>(constant));
}
}

16 changes: 11 additions & 5 deletions examples/vector_add_tiling/CMakeLists.txt
@@ -1,12 +1,18 @@
cmake_minimum_required(VERSION 3.17)

set (PROJECT_NAME kernel_float_vecadd_tiling)
project(${PROJECT_NAME} LANGUAGES CXX CUDA)
set (CMAKE_CXX_STANDARD 17)
project(${PROJECT_NAME} LANGUAGES CXX)

set (CMAKE_CXX_STANDARD 17)
add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
target_link_libraries(${PROJECT_NAME} kernel_float)
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")

find_package(CUDA REQUIRED)
target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
if(${KERNEL_FLOAT_LANGUAGE_HIP})
set_source_files_properties("${PROJECT_SOURCE_DIR}/main.cu" PROPERTIES LANGUAGE HIP)
endif()

if(${KERNEL_FLOAT_LANGUAGE_CUDA})
find_package(CUDA REQUIRED)
target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
endif()
1 change: 1 addition & 0 deletions examples/vector_add_tiling/main.cu
@@ -3,6 +3,7 @@
#include <stdexcept>
#include <vector>

#include "../hip_compat.h"
#include "kernel_float.h"
#include "kernel_float/tiling.h"
namespace kf = kernel_float;
1 change: 1 addition & 0 deletions include/kernel_float.h
@@ -1,6 +1,7 @@
#ifndef KERNEL_FLOAT_H
#define KERNEL_FLOAT_H

#include "kernel_float/approx.h"
#include "kernel_float/base.h"
#include "kernel_float/bf16.h"
#include "kernel_float/binops.h"