[Dev] Add cuda example

lshmouse · Sep 19, 2024 · f3099cb · f3099cb
1 parent 9b0353b
commit f3099cb
Show file tree

Hide file tree

Showing 13 changed files with 345 additions and 10 deletions.
diff --git a/WORKSPACE b/WORKSPACE
@@ -45,4 +45,9 @@ load("@io_bazel_rules_docker//repositories:repositories.bzl", container_reposito
 container_repositories()
 
 load("@io_bazel_rules_docker//repositories:deps.bzl", container_deps = "deps")
-container_deps()
+container_deps()
+
+### CUDA Setup
+load("@rules_cuda//cuda:repositories.bzl", "register_detected_cuda_toolchains", "rules_cuda_dependencies")
+rules_cuda_dependencies()
+register_detected_cuda_toolchains()
diff --git a/bazel/workspace.bzl b/bazel/workspace.bzl
@@ -8,6 +8,7 @@ load("//third_party/pybind11:workspace.bzl", pybind11 = "repo")
 load("//third_party/rules_proto:workspace.bzl", rules_proto = "repo")
 load("//third_party/rules_go:workspace.bzl", rules_go = "repo")
 load("//third_party/rules_rust:workspace.bzl", rules_rust = "repo")
+load("//third_party/rules_cuda:workspace.bzl", rules_cuda = "repo")
 
 load("//third_party/rules_docker:workspace.bzl", rules_docker = "repo")
 
@@ -32,6 +33,7 @@ def initialize_language_repos():
     rules_go()
 
     rules_rust()
+    rules_cuda()
     rules_docker()
 
 def initialize_third_parties():

diff --git a/experimental/cuda_example/BUILD b/experimental/cuda_example/BUILD
@@ -0,0 +1,39 @@
+load("@rules_cc//cc:defs.bzl", "cc_library", "cc_binary")
+load("@rules_cuda//cuda:defs.bzl", "cuda_library")
+
+package(default_visibility = ["//visibility:public"])
+
+cc_binary(
+    name = "cuda_example",
+    testonly = True,
+    srcs = [
+        "cuda_example.cc",
+    ],
+    linkstatic = True,
+    deps = [
+        ":mat_add_lib",
+        ":vector_add_lib",
+        "@com_github_google_glog//:glog",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cuda_library(
+    name = "vector_add_lib",
+    srcs = ["vector_add.cu"],
+    hdrs = ["vector_add.h"],
+    visibility = ["//visibility:private"],
+    deps = [
+        "@com_github_google_glog//:glog",
+    ],
+)
+
+cuda_library(
+    name = "mat_add_lib",
+    srcs = ["mat_add.cu"],
+    hdrs = ["mat_add.h"],
+    visibility = ["//visibility:private"],
+    deps = [
+        "@com_github_google_glog//:glog",
+    ],
+)
diff --git a/experimental/cuda_example/README.md b/experimental/cuda_example/README.md
@@ -0,0 +1,79 @@
+## Cuda
+
+### Example
+```
+bazel run //experimental/cuda_example:cuda_example
+```
+
+### Usage
+```
+$ nvcc --version
+nvcc: NVIDIA (R) Cuda compiler driver
+Copyright (c) 2005-2017 NVIDIA Corporation
+Built on Fri_Nov__3_21:07:56_CDT_2017
+Cuda compilation tools, release 9.1, V9.1.85
+```
+
+```
+watch -n 0.5 nvidia-smi
+```
+
+```
+$ deviceQuery
+deviceQuery Starting...
+
+ CUDA Device Query (Runtime API) version (CUDART static linking)
+
+Detected 1 CUDA Capable device(s)
+
+Device 0: "NVIDIA GeForce RTX 3060"
+  CUDA Driver Version / Runtime Version          12.1 / 11.3
+  CUDA Capability Major/Minor version number:    8.6
+  Total amount of global memory:                 12042 MBytes (12626493440 bytes)
+  (028) Multiprocessors, (128) CUDA Cores/MP:    3584 CUDA Cores
+  GPU Max Clock rate:                            1867 MHz (1.87 GHz)
+  Memory Clock rate:                             7501 Mhz
+  Memory Bus Width:                              192-bit
+  L2 Cache Size:                                 2359296 bytes
+  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
+  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
+  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
+  Total amount of constant memory:               65536 bytes
+  Total amount of shared memory per block:       49152 bytes
+  Total shared memory per multiprocessor:        102400 bytes
+  Total number of registers available per block: 65536
+  Warp size:                                     32
+  Maximum number of threads per multiprocessor:  1536
+  Maximum number of threads per block:           1024
+  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
+  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
+  Maximum memory pitch:                          2147483647 bytes
+  Texture alignment:                             512 bytes
+  Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
+  Run time limit on kernels:                     Yes
+  Integrated GPU sharing Host Memory:            No
+  Support host page-locked memory mapping:       Yes
+  Alignment requirement for Surfaces:            Yes
+  Device has ECC support:                        Disabled
+  Device supports Unified Addressing (UVA):      Yes
+  Device supports Managed Memory:                Yes
+  Device supports Compute Preemption:            Yes
+  Supports Cooperative Kernel Launch:            Yes
+```
+
+### CUDA Samples
+```
+git clone [email protected]:NVIDIA/cuda-samples.git
+cd cuda-samples
+git checkout -b v9.2 tags/v9.2
+make
+```
+See: https://github.com/NVIDIA/cuda-samples
+
+### TODO
+- https://developer.nvidia.com/nvidia-visual-profiler
+- bandwidthTest
+
+### References
+- https://www.cs.cmu.edu/afs/cs/academic/class/15668-s11/www/cuda-doc/
+- rules_cuda: https://github.com/bazel-contrib/rules_cuda
diff --git a/experimental/cuda_example/cuda_example.cc b/experimental/cuda_example/cuda_example.cc
@@ -0,0 +1,53 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "glog/logging.h"
+
+#include "experimental/cuda_example/mat_add.h"
+#include "experimental/cuda_example/vector_add.h"
+
+namespace qcraft {
+
+static void HandleError(cudaError_t err, const char* file, int line) {
+  if (err != cudaSuccess) {
+    printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
+    exit(EXIT_FAILURE);
+  }
+}
+
+#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
+
+int Main(int argc, char* argv[]) {
+  cudaDeviceProp dev_prop;
+  int num_devices = 0;
+  int driver_version;
+  HANDLE_ERROR(cudaDriverGetVersion(&driver_version));
+  LOG(INFO) << "Cuda driver version: " << driver_version;
+
+  int runtime_version = 0;
+  HANDLE_ERROR(cudaRuntimeGetVersion(&runtime_version));
+  LOG(INFO) << "Cuda runtime version: " << runtime_version;
+
+  HANDLE_ERROR(cudaGetDeviceCount(&num_devices));
+
+  CHECK_GT(num_devices, 0);
+  LOG(INFO) << "GPU dvice num: " << num_devices;
+
+  for (uint32_t dev = 0; dev < num_devices; dev++) {
+    HANDLE_ERROR(cudaGetDeviceProperties(&dev_prop, dev));
+    LOG(INFO) << "dev: " << dev << ", name: " << dev_prop.name
+              << ", totalGlobalMem: "
+              << dev_prop.totalGlobalMem / 1024 / 1024 / 1024 << "GB"
+              << ", sm: " << dev_prop.multiProcessorCount
+              << ", block: " << dev_prop.sharedMemPerBlock
+              << ", maxThreadsPerBlock: " << dev_prop.maxThreadsPerBlock
+              << ", maxThreadsPerMultiProcessor: "
+              << dev_prop.maxThreadsPerMultiProcessor;
+  }
+  // VectorAddDemo(1000);
+  MatAddDemo(100);
+  return 0;
+}
+}  // namespace qcraft
+
+int main(int argc, char* argv[]) { return qcraft::Main(argc, argv); }
diff --git a/experimental/cuda_example/mat_add.cu b/experimental/cuda_example/mat_add.cu
@@ -0,0 +1,73 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "glog/logging.h"
+
+#include "experimental/cuda_example/mat_add.h"
+
+namespace qcraft {
+
+static void HandleError(cudaError_t err, const char* file, int line) {
+  if (err != cudaSuccess) {
+    printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
+    exit(EXIT_FAILURE);
+  }
+}
+
+#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
+
+__global__ void MatAdd(int* a, int* b, int* c, int num) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int j = blockIdx.y * blockDim.y + threadIdx.y;
+  if (i < num && j < num) {
+    int idx = i * num + j;
+    c[idx] = a[idx] + b[idx];
+  }
+}
+
+void MatAddDemo(int num) {
+  int m_a[num][num], m_b[num][num], m_c[num][num];
+  for (int i = 0; i < num; i++) {
+    for (int j = 0; j < num; j++) {
+      m_a[i][j] = 1;
+      m_b[i][j] = 2;
+      m_c[i][j] = 0;
+    }
+  }
+
+  int total = num * num;
+  int *a_gpu, *b_gpu, *c_gpu;
+
+  HANDLE_ERROR(cudaMalloc((void**)&a_gpu, total * sizeof(int)));
+  HANDLE_ERROR(cudaMalloc((void**)&b_gpu, total * sizeof(int)));
+  HANDLE_ERROR(cudaMalloc((void**)&c_gpu, total * sizeof(int)));
+
+  HANDLE_ERROR(
+      cudaMemcpy(a_gpu, m_a, total * sizeof(int), cudaMemcpyHostToDevice));
+  HANDLE_ERROR(
+      cudaMemcpy(b_gpu, m_b, total * sizeof(int), cudaMemcpyHostToDevice));
+
+  dim3 grid_size(10, 10);
+  dim3 block_size((num + grid_size.x - 1) / grid_size.x,
+                  (num + grid_size.y - 1) / grid_size.y);
+
+  MatAdd<<<grid_size, block_size>>>(a_gpu, b_gpu, c_gpu, num);
+  HANDLE_ERROR(
+      cudaMemcpy(m_c, c_gpu, total * sizeof(int), cudaMemcpyDeviceToHost));
+
+  // Release device memory
+  cudaFree(a_gpu);
+  cudaFree(b_gpu);
+  cudaFree(c_gpu);
+  cudaDeviceReset();
+
+  for (int i = 0; i < num; i++) {
+    for (int j = 0; j < num; j++) {
+      printf("%d ", m_c[i][j]);
+    }
+    printf("\n");
+  }
+  printf("\n");
+}
+
+}  // namespace qcraft
diff --git a/experimental/cuda_example/mat_add.h b/experimental/cuda_example/mat_add.h
@@ -0,0 +1,8 @@
+#ifndef EXPERIMENTAL_CUDA_EXAMPLE_MAT_ADD_H_
+#define EXPERIMENTAL_CUDA_EXAMPLE_MAT_ADD_H_
+
+namespace qcraft {
+
+void MatAddDemo(int num);
+}
+#endif  // EXPERIMENTAL_CUDA_EXAMPLE_MAT_ADD_H_
diff --git a/experimental/cuda_example/vector_add.cu b/experimental/cuda_example/vector_add.cu
@@ -0,0 +1,59 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "glog/logging.h"
+
+#include "experimental/cuda_example/vector_add.h"
+
+namespace qcraft {
+
+static void HandleError(cudaError_t err, const char* file, int line) {
+  if (err != cudaSuccess) {
+    printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
+    exit(EXIT_FAILURE);
+  }
+}
+
+#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
+
+__global__ void VectorAdd(int* a, int* b, int* c, int num) {
+  int i = threadIdx.x;
+  if (i < num) {
+    c[i] = a[i] + b[i];
+  }
+}
+
+void VectorAddDemo(int num) {
+  int a[num], b[num], c[num];
+  for (int i = 0; i < num; i++) {
+    a[i] = i;
+    b[i] = i * i;
+  }
+
+  int block_num = (num + 100 - 1) / 100;
+  int thread_num = 100;
+
+  int *a_gpu, *b_gpu, *c_gpu;
+  HANDLE_ERROR(cudaMalloc((void**)&a_gpu, num * sizeof(int)));
+  HANDLE_ERROR(cudaMalloc((void**)&b_gpu, num * sizeof(int)));
+  HANDLE_ERROR(cudaMalloc((void**)&c_gpu, num * sizeof(int)));
+
+  HANDLE_ERROR(cudaMemcpy(a_gpu, a, num * sizeof(int), cudaMemcpyHostToDevice));
+  HANDLE_ERROR(cudaMemcpy(b_gpu, b, num * sizeof(int), cudaMemcpyHostToDevice));
+
+  VectorAdd<<<block_num, thread_num>>>(a_gpu, b_gpu, c_gpu, num);
+
+  HANDLE_ERROR(cudaMemcpy(c, c_gpu, num * sizeof(int), cudaMemcpyDeviceToHost));
+
+  for (int i = 0; i < num; i++) {
+    printf("%d + %d = %d\n", a[i], b[i], c[i]);
+  }
+
+  // Release device memory
+  cudaFree(a_gpu);
+  cudaFree(b_gpu);
+  cudaFree(c_gpu);
+  cudaDeviceReset();
+}
+
+}  // namespace qcraft
diff --git a/experimental/cuda_example/vector_add.h b/experimental/cuda_example/vector_add.h
@@ -0,0 +1,7 @@
+#ifndef EXPERIMENTAL_CUDA_EXAMPLE_VECTOR_ADD_H_
+#define EXPERIMENTAL_CUDA_EXAMPLE_VECTOR_ADD_H_
+
+namespace qcraft {
+void VectorAddDemo(int num);
+}
+#endif  // EXPERIMENTAL_CUDA_EXAMPLE_VECTOR_ADD_H_
diff --git a/third_party/absl/workspace.bzl b/third_party/absl/workspace.bzl
@@ -3,12 +3,8 @@
 load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 
 def repo():
-    absl_version = "20230802.0"
-    http_archive(
-        name = "com_google_absl",
-        sha256 = "59d2976af9d6ecf001a81a35749a6e551a335b949d34918cfade07737b9d93c5",
-        strip_prefix = "abseil-cpp-{}".format(absl_version),
-        urls = [
-            "https://github.com/abseil/abseil-cpp/archive/refs/tags/{}.tar.gz".format(absl_version),
-        ],
-    )
+   http_archive(
+      name = "com_google_absl",
+      urls = ["https://github.com/abseil/abseil-cpp/archive/273292d1cfc0a94a65082ee350509af1d113344d.zip"],
+      strip_prefix = "abseil-cpp-273292d1cfc0a94a65082ee350509af1d113344d",
+    )
diff --git a/third_party/rules_cuda/BUILD b/third_party/rules_cuda/BUILD
@@ -0,0 +1 @@
+package(default_visibility = ["//visibility:public"])
diff --git a/third_party/rules_cuda/README.md b/third_party/rules_cuda/README.md
@@ -0,0 +1,2 @@
+## rules_cuda
+See: https://github.com/bazel-contrib/rules_cuda
diff --git a/third_party/rules_cuda/workspace.bzl b/third_party/rules_cuda/workspace.bzl
@@ -0,0 +1,11 @@
+"""Loads the rules_cuda library"""
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+def repo():
+    http_archive(
+        name = "rules_cuda",
+        sha256 = "2f8c8c8c85f727bec4423efecec12d3b751cb0a98bda99f0f9d351608a23b858",
+        strip_prefix = "rules_cuda-v0.2.1",
+        urls = ["https://github.com/bazel-contrib/rules_cuda/releases/download/v0.2.1/rules_cuda-v0.2.1.tar.gz"],
+    )
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		package(default_visibility = ["//visibility:public"])
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		## rules_cuda
		See: https://github.com/bazel-contrib/rules_cuda