Workflow to test SPIRVRunner (#3400)
Showing 4 changed files with 191 additions and 5 deletions.
@@ -0,0 +1,58 @@
name: Test SPIRVRunner

on:
  workflow_dispatch:

  pull_request:
    branches:
      - main
  push:
    branches:
      - main

permissions: read-all

env:
  PYTHON_VERSION: '3.9'

jobs:
  tests:
    name: Tests
    runs-on:
      - rolling
      - runner-0.0.22
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Setup PyTorch
        uses: ./.github/actions/setup-pytorch

      - name: Setup Triton
        uses: ./.github/actions/setup-triton

      - name: Build SPIRVRunner
        run: |
          source /opt/intel/oneapi/setvars.sh
          set -x
          export LLVM_DIR="$HOME/.triton/llvm/llvm-ubuntu-x64"
          export CMAKE_PREFIX_PATH="$(python scripts/torch_cmake.py)"
          cd utils/SPIRVRunner
          mkdir build
          cd build
          cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo ..
          make -j
      - name: Test SPIRVRunner
        run: |
          source /opt/intel/oneapi/setvars.sh
          set -x
          export SPIRV_RUNNER_PATH="$GITHUB_WORKSPACE/utils/SPIRVRunner/build/SPIRVRunner"
          export SPIRV_RUNNER_TESTS="$GITHUB_WORKSPACE/utils/SPIRVRunner/tests"
          cd utils/SPIRVRunner
          pytest tests/test_spirv_runner.py
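The same build-and-test sequence can be reproduced locally outside CI. The following is a minimal illustrative sketch, not part of the commit, assuming the repository root as the working directory and an already-sourced oneAPI environment; the paths and environment variable names are taken from the workflow above:

import os
import pathlib
import subprocess

# Assumed layout: run from the repository root, with oneAPI already sourced.
repo = pathlib.Path.cwd()
runner_dir = repo / 'utils' / 'SPIRVRunner'
build_dir = runner_dir / 'build'
build_dir.mkdir(exist_ok=True)

# Mirror the 'Build SPIRVRunner' step: LLVM_DIR and CMAKE_PREFIX_PATH come
# from the same sources the workflow uses.
build_env = dict(
    os.environ,
    LLVM_DIR=str(pathlib.Path.home() / '.triton' / 'llvm' / 'llvm-ubuntu-x64'),
    CMAKE_PREFIX_PATH=subprocess.check_output(
        ['python', 'scripts/torch_cmake.py'], text=True).strip(),
)
subprocess.run(['cmake', '-DCMAKE_BUILD_TYPE=RelWithDebInfo', '..'],
               cwd=build_dir, env=build_env, check=True)
subprocess.run(['make', '-j'], cwd=build_dir, env=build_env, check=True)

# Mirror the 'Test SPIRVRunner' step: point pytest at the built binary.
test_env = dict(
    os.environ,
    SPIRV_RUNNER_PATH=str(build_dir / 'SPIRVRunner'),
    SPIRV_RUNNER_TESTS=str(runner_dir / 'tests'),
)
subprocess.run(['pytest', 'tests/test_spirv_runner.py'],
               cwd=runner_dir, env=test_env, check=True)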
scripts/torch_cmake.py
@@ -0,0 +1,22 @@
"""Prints cmake directory for PyTorch."""

import importlib.metadata
import pathlib


def get_torch_cmake_path() -> pathlib.Path:
    """Returns directory that contains TorchConfig.cmake.

    Raises:
        importlib.metadata.PackageNotFoundError: if torch not installed.
        AssertionError: if TorchConfig.cmake not found.
    """
    files = importlib.metadata.files('torch') or []
    for f in files:
        if f.name == 'TorchConfig.cmake':
            return pathlib.Path(f.locate()).parent.resolve()
    raise AssertionError('TorchConfig.cmake not found')


if __name__ == '__main__':
    print(get_torch_cmake_path())
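The workflow consumes this script via command substitution (export CMAKE_PREFIX_PATH="$(python scripts/torch_cmake.py)"), which lets CMake's find_package(Torch) locate the installed PyTorch. As a quick sanity check, one might verify the returned directory directly; this snippet is illustrative and assumes the module is importable, e.g. when run from the scripts/ directory:

# Illustrative sanity check; assumes torch_cmake is importable (e.g. run
# from the scripts/ directory).
from torch_cmake import get_torch_cmake_path

cmake_dir = get_torch_cmake_path()
# The directory the script prints should contain TorchConfig.cmake itself,
# which is what CMake needs to see on CMAKE_PREFIX_PATH.
assert (cmake_dir / 'TorchConfig.cmake').is_file()
print(cmake_dir)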
@@ -0,0 +1,93 @@
"""
Vector Addition
===============
In this tutorial, you will write a simple vector addition using Triton.
In doing so, you will learn about:

* The basic programming model of Triton.
* The `triton.jit` decorator, which is used to define Triton kernels.
* The best practices for validating and benchmarking your custom ops against native reference implementations.
"""

# %%
# Compute Kernel
# --------------

import torch

import triton
import triton.language as tl

DEVICE = triton.runtime.driver.active.get_active_torch_device()


@triton.jit
def add_kernel(x_ptr,  # *Pointer* to first input vector.
               y_ptr,  # *Pointer* to second input vector.
               output_ptr,  # *Pointer* to output vector.
               n_elements,  # Size of the vector.
               BLOCK_SIZE: tl.constexpr,  # Number of elements each program should process.
               # NOTE: `constexpr` so it can be used as a shape value.
               ):
    # There are multiple 'programs' processing different data. We identify which program
    # we are here:
    pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0.
    # This program will process inputs that are offset from the initial data.
    # For instance, if you had a vector of length 256 and block_size of 64, the programs
    # would each access the elements [0:64, 64:128, 128:192, 192:256].
    # Note that offsets is a list of pointers:
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    # Create a mask to guard memory operations against out-of-bounds accesses.
    mask = offsets < n_elements
    # Load x and y from DRAM, masking out any extra elements in case the input is not a
    # multiple of the block size.
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    output = x + y
    # Write x + y back to DRAM.
    tl.store(output_ptr + offsets, output, mask=mask)

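# %%
# To build intuition for the masking above, here is a small pure-PyTorch
# emulation (an illustrative sketch, not a Triton API) of what a single
# program instance does when the vector length is not a multiple of
# BLOCK_SIZE:


def emulate_one_program(x, y, out, pid, BLOCK_SIZE):
    # Same index arithmetic as `add_kernel`, in plain PyTorch.
    block_start = pid * BLOCK_SIZE
    offsets = block_start + torch.arange(0, BLOCK_SIZE, device=x.device)
    mask = offsets < x.numel()
    valid = offsets[mask]
    # The masked load reads, and the masked store writes, only in-bounds
    # elements: with a length-100 vector and BLOCK_SIZE=64, pid=1 touches
    # offsets 64..99 and skips 100..127.
    out[valid] = x[valid] + y[valid]
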
# %%
# Let's also declare a helper function to (1) allocate the `output` tensor
# and (2) enqueue the above kernel with appropriate grid/block sizes:


def add(x: torch.Tensor, y: torch.Tensor):
    # We need to preallocate the output.
    output = torch.empty_like(x)
    assert x.device == DEVICE and y.device == DEVICE and output.device == DEVICE
    n_elements = output.numel()
    # The SPMD launch grid denotes the number of kernel instances that run in parallel.
    # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int].
    # In this case, we use a 1D grid where the size is the number of blocks:
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']), )
    # NOTE:
    # - Each torch.tensor object is implicitly converted into a pointer to its first element.
    # - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel.
    # - Don't forget to pass meta-parameters as keyword arguments.
    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
    # We return a handle to `output` but, since `torch.cuda.synchronize()` hasn't been called, the kernel
    # may still be running asynchronously at this point.
    return output


# %%
# We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:

torch.manual_seed(0)
size = 98432
x = torch.rand(size, device=DEVICE)
y = torch.rand(size, device=DEVICE)
output_torch = x + y
output_triton = add(x, y)
print(output_torch.cpu())
print(output_triton.cpu())
print(f'The maximum difference between torch and triton is '
      f'{torch.max(torch.abs(output_torch.cpu() - output_triton.cpu()))}')
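
# %%
# Beyond eyeballing the printed maximum difference, one can also assert
# agreement programmatically (an illustrative addition using the tensors
# computed above):

assert torch.allclose(output_torch.cpu(), output_triton.cpu())
print('torch and triton results match')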