Skip to content

Commit

Permalink
[Dev] Add cuda example
Browse files Browse the repository at this point in the history
  • Loading branch information
lshmouse committed Sep 19, 2024
1 parent 9b0353b commit f3099cb
Show file tree
Hide file tree
Showing 13 changed files with 345 additions and 10 deletions.
7 changes: 6 additions & 1 deletion WORKSPACE
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,9 @@ load("@io_bazel_rules_docker//repositories:repositories.bzl", container_reposito
container_repositories()

load("@io_bazel_rules_docker//repositories:deps.bzl", container_deps = "deps")
container_deps()
container_deps()

### CUDA Setup
load("@rules_cuda//cuda:repositories.bzl", "register_detected_cuda_toolchains", "rules_cuda_dependencies")
rules_cuda_dependencies()
register_detected_cuda_toolchains()
2 changes: 2 additions & 0 deletions bazel/workspace.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ load("//third_party/pybind11:workspace.bzl", pybind11 = "repo")
load("//third_party/rules_proto:workspace.bzl", rules_proto = "repo")
load("//third_party/rules_go:workspace.bzl", rules_go = "repo")
load("//third_party/rules_rust:workspace.bzl", rules_rust = "repo")
load("//third_party/rules_cuda:workspace.bzl", rules_cuda = "repo")

load("//third_party/rules_docker:workspace.bzl", rules_docker = "repo")

Expand All @@ -32,6 +33,7 @@ def initialize_language_repos():
rules_go()

rules_rust()
rules_cuda()
rules_docker()

def initialize_third_parties():
Expand Down
39 changes: 39 additions & 0 deletions experimental/cuda_example/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
load("@rules_cc//cc:defs.bzl", "cc_library", "cc_binary")
load("@rules_cuda//cuda:defs.bzl", "cuda_library")

package(default_visibility = ["//visibility:public"])

cc_binary(
name = "cuda_example",
testonly = True,
srcs = [
"cuda_example.cc",
],
linkstatic = True,
deps = [
":mat_add_lib",
":vector_add_lib",
"@com_github_google_glog//:glog",
"@com_google_absl//absl/strings",
],
)

cuda_library(
name = "vector_add_lib",
srcs = ["vector_add.cu"],
hdrs = ["vector_add.h"],
visibility = ["//visibility:private"],
deps = [
"@com_github_google_glog//:glog",
],
)

cuda_library(
name = "mat_add_lib",
srcs = ["mat_add.cu"],
hdrs = ["mat_add.h"],
visibility = ["//visibility:private"],
deps = [
"@com_github_google_glog//:glog",
],
)
79 changes: 79 additions & 0 deletions experimental/cuda_example/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
## Cuda

### Example
```
bazel run //experimental/cuda_example:cuda_example
```

### Usage
```
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2017 NVIDIA Corporation
Built on Fri_Nov__3_21:07:56_CDT_2017
Cuda compilation tools, release 9.1, V9.1.85
```

```
watch -n 0.5 nvidia-smi
```

```
$ deviceQuery
deviceQuery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "NVIDIA GeForce RTX 3060"
CUDA Driver Version / Runtime Version 12.1 / 11.3
CUDA Capability Major/Minor version number: 8.6
Total amount of global memory: 12042 MBytes (12626493440 bytes)
(028) Multiprocessors, (128) CUDA Cores/MP: 3584 CUDA Cores
GPU Max Clock rate: 1867 MHz (1.87 GHz)
Memory Clock rate: 7501 Mhz
Memory Bus Width: 192-bit
L2 Cache Size: 2359296 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total shared memory per multiprocessor: 102400 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 1536
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
Device supports Unified Addressing (UVA): Yes
Device supports Managed Memory: Yes
Device supports Compute Preemption: Yes
Supports Cooperative Kernel Launch: Yes
```

### CUDA Samples
```
git clone [email protected]:NVIDIA/cuda-samples.git
cd cuda-samples
git checkout -b v9.2 tags/v9.2
make
```
See: https://github.com/NVIDIA/cuda-samples

### TODO
- https://developer.nvidia.com/nvidia-visual-profiler
- bandwidthTest

### References
- https://www.cs.cmu.edu/afs/cs/academic/class/15668-s11/www/cuda-doc/
- rules_cuda: https://github.com/bazel-contrib/rules_cuda
53 changes: 53 additions & 0 deletions experimental/cuda_example/cuda_example.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include <cuda.h>
#include <cuda_runtime.h>

#include "glog/logging.h"

#include "experimental/cuda_example/mat_add.h"
#include "experimental/cuda_example/vector_add.h"

namespace qcraft {

static void HandleError(cudaError_t err, const char* file, int line) {
if (err != cudaSuccess) {
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}

#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))

int Main(int argc, char* argv[]) {
cudaDeviceProp dev_prop;
int num_devices = 0;
int driver_version;
HANDLE_ERROR(cudaDriverGetVersion(&driver_version));
LOG(INFO) << "Cuda driver version: " << driver_version;

int runtime_version = 0;
HANDLE_ERROR(cudaRuntimeGetVersion(&runtime_version));
LOG(INFO) << "Cuda runtime version: " << runtime_version;

HANDLE_ERROR(cudaGetDeviceCount(&num_devices));

CHECK_GT(num_devices, 0);
LOG(INFO) << "GPU dvice num: " << num_devices;

for (uint32_t dev = 0; dev < num_devices; dev++) {
HANDLE_ERROR(cudaGetDeviceProperties(&dev_prop, dev));
LOG(INFO) << "dev: " << dev << ", name: " << dev_prop.name
<< ", totalGlobalMem: "
<< dev_prop.totalGlobalMem / 1024 / 1024 / 1024 << "GB"
<< ", sm: " << dev_prop.multiProcessorCount
<< ", block: " << dev_prop.sharedMemPerBlock
<< ", maxThreadsPerBlock: " << dev_prop.maxThreadsPerBlock
<< ", maxThreadsPerMultiProcessor: "
<< dev_prop.maxThreadsPerMultiProcessor;
}
// VectorAddDemo(1000);
MatAddDemo(100);
return 0;
}
} // namespace qcraft

int main(int argc, char* argv[]) { return qcraft::Main(argc, argv); }
73 changes: 73 additions & 0 deletions experimental/cuda_example/mat_add.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#include <cuda.h>
#include <cuda_runtime.h>

#include "glog/logging.h"

#include "experimental/cuda_example/mat_add.h"

namespace qcraft {

static void HandleError(cudaError_t err, const char* file, int line) {
if (err != cudaSuccess) {
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}

#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))

__global__ void MatAdd(int* a, int* b, int* c, int num) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < num && j < num) {
int idx = i * num + j;
c[idx] = a[idx] + b[idx];
}
}

void MatAddDemo(int num) {
int m_a[num][num], m_b[num][num], m_c[num][num];
for (int i = 0; i < num; i++) {
for (int j = 0; j < num; j++) {
m_a[i][j] = 1;
m_b[i][j] = 2;
m_c[i][j] = 0;
}
}

int total = num * num;
int *a_gpu, *b_gpu, *c_gpu;

HANDLE_ERROR(cudaMalloc((void**)&a_gpu, total * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&b_gpu, total * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&c_gpu, total * sizeof(int)));

HANDLE_ERROR(
cudaMemcpy(a_gpu, m_a, total * sizeof(int), cudaMemcpyHostToDevice));
HANDLE_ERROR(
cudaMemcpy(b_gpu, m_b, total * sizeof(int), cudaMemcpyHostToDevice));

dim3 grid_size(10, 10);
dim3 block_size((num + grid_size.x - 1) / grid_size.x,
(num + grid_size.y - 1) / grid_size.y);

MatAdd<<<grid_size, block_size>>>(a_gpu, b_gpu, c_gpu, num);
HANDLE_ERROR(
cudaMemcpy(m_c, c_gpu, total * sizeof(int), cudaMemcpyDeviceToHost));

// Release device memory
cudaFree(a_gpu);
cudaFree(b_gpu);
cudaFree(c_gpu);
cudaDeviceReset();

for (int i = 0; i < num; i++) {
for (int j = 0; j < num; j++) {
printf("%d ", m_c[i][j]);
}
printf("\n");
}
printf("\n");
}

} // namespace qcraft
8 changes: 8 additions & 0 deletions experimental/cuda_example/mat_add.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef EXPERIMENTAL_CUDA_EXAMPLE_MAT_ADD_H_
#define EXPERIMENTAL_CUDA_EXAMPLE_MAT_ADD_H_

namespace qcraft {

void MatAddDemo(int num);
}
#endif // EXPERIMENTAL_CUDA_EXAMPLE_MAT_ADD_H_
59 changes: 59 additions & 0 deletions experimental/cuda_example/vector_add.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#include <cuda.h>
#include <cuda_runtime.h>

#include "glog/logging.h"

#include "experimental/cuda_example/vector_add.h"

namespace qcraft {

static void HandleError(cudaError_t err, const char* file, int line) {
if (err != cudaSuccess) {
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit(EXIT_FAILURE);
}
}

#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))

__global__ void VectorAdd(int* a, int* b, int* c, int num) {
int i = threadIdx.x;
if (i < num) {
c[i] = a[i] + b[i];
}
}

void VectorAddDemo(int num) {
int a[num], b[num], c[num];
for (int i = 0; i < num; i++) {
a[i] = i;
b[i] = i * i;
}

int block_num = (num + 100 - 1) / 100;
int thread_num = 100;

int *a_gpu, *b_gpu, *c_gpu;
HANDLE_ERROR(cudaMalloc((void**)&a_gpu, num * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&b_gpu, num * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&c_gpu, num * sizeof(int)));

HANDLE_ERROR(cudaMemcpy(a_gpu, a, num * sizeof(int), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(b_gpu, b, num * sizeof(int), cudaMemcpyHostToDevice));

VectorAdd<<<block_num, thread_num>>>(a_gpu, b_gpu, c_gpu, num);

HANDLE_ERROR(cudaMemcpy(c, c_gpu, num * sizeof(int), cudaMemcpyDeviceToHost));

for (int i = 0; i < num; i++) {
printf("%d + %d = %d\n", a[i], b[i], c[i]);
}

// Release device memory
cudaFree(a_gpu);
cudaFree(b_gpu);
cudaFree(c_gpu);
cudaDeviceReset();
}

} // namespace qcraft
7 changes: 7 additions & 0 deletions experimental/cuda_example/vector_add.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#ifndef EXPERIMENTAL_CUDA_EXAMPLE_VECTOR_ADD_H_
#define EXPERIMENTAL_CUDA_EXAMPLE_VECTOR_ADD_H_

namespace qcraft {
void VectorAddDemo(int num);
}
#endif // EXPERIMENTAL_CUDA_EXAMPLE_VECTOR_ADD_H_
14 changes: 5 additions & 9 deletions third_party/absl/workspace.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,8 @@
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

def repo():
absl_version = "20230802.0"
http_archive(
name = "com_google_absl",
sha256 = "59d2976af9d6ecf001a81a35749a6e551a335b949d34918cfade07737b9d93c5",
strip_prefix = "abseil-cpp-{}".format(absl_version),
urls = [
"https://github.com/abseil/abseil-cpp/archive/refs/tags/{}.tar.gz".format(absl_version),
],
)
http_archive(
name = "com_google_absl",
urls = ["https://github.com/abseil/abseil-cpp/archive/273292d1cfc0a94a65082ee350509af1d113344d.zip"],
strip_prefix = "abseil-cpp-273292d1cfc0a94a65082ee350509af1d113344d",
)
1 change: 1 addition & 0 deletions third_party/rules_cuda/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
package(default_visibility = ["//visibility:public"])
2 changes: 2 additions & 0 deletions third_party/rules_cuda/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
## rules_cuda
See: https://github.com/bazel-contrib/rules_cuda
11 changes: 11 additions & 0 deletions third_party/rules_cuda/workspace.bzl
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"""Loads the rules_cuda library"""

load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

def repo():
http_archive(
name = "rules_cuda",
sha256 = "2f8c8c8c85f727bec4423efecec12d3b751cb0a98bda99f0f9d351608a23b858",
strip_prefix = "rules_cuda-v0.2.1",
urls = ["https://github.com/bazel-contrib/rules_cuda/releases/download/v0.2.1/rules_cuda-v0.2.1.tar.gz"],
)

0 comments on commit f3099cb

Please sign in to comment.