-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
13 changed files
with
345 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
load("@rules_cc//cc:defs.bzl", "cc_library", "cc_binary") | ||
load("@rules_cuda//cuda:defs.bzl", "cuda_library") | ||
|
||
package(default_visibility = ["//visibility:public"]) | ||
|
||
cc_binary( | ||
name = "cuda_example", | ||
testonly = True, | ||
srcs = [ | ||
"cuda_example.cc", | ||
], | ||
linkstatic = True, | ||
deps = [ | ||
":mat_add_lib", | ||
":vector_add_lib", | ||
"@com_github_google_glog//:glog", | ||
"@com_google_absl//absl/strings", | ||
], | ||
) | ||
|
||
cuda_library( | ||
name = "vector_add_lib", | ||
srcs = ["vector_add.cu"], | ||
hdrs = ["vector_add.h"], | ||
visibility = ["//visibility:private"], | ||
deps = [ | ||
"@com_github_google_glog//:glog", | ||
], | ||
) | ||
|
||
cuda_library( | ||
name = "mat_add_lib", | ||
srcs = ["mat_add.cu"], | ||
hdrs = ["mat_add.h"], | ||
visibility = ["//visibility:private"], | ||
deps = [ | ||
"@com_github_google_glog//:glog", | ||
], | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
## Cuda | ||
|
||
### Example | ||
``` | ||
bazel run //experimental/cuda_example:cuda_example | ||
``` | ||
|
||
### Usage | ||
``` | ||
$ nvcc --version | ||
nvcc: NVIDIA (R) Cuda compiler driver | ||
Copyright (c) 2005-2017 NVIDIA Corporation | ||
Built on Fri_Nov__3_21:07:56_CDT_2017 | ||
Cuda compilation tools, release 9.1, V9.1.85 | ||
``` | ||
|
||
``` | ||
watch -n 0.5 nvidia-smi | ||
``` | ||
|
||
``` | ||
$ deviceQuery | ||
deviceQuery Starting... | ||
CUDA Device Query (Runtime API) version (CUDART static linking) | ||
Detected 1 CUDA Capable device(s) | ||
Device 0: "NVIDIA GeForce RTX 3060" | ||
CUDA Driver Version / Runtime Version 12.1 / 11.3 | ||
CUDA Capability Major/Minor version number: 8.6 | ||
Total amount of global memory: 12042 MBytes (12626493440 bytes) | ||
(028) Multiprocessors, (128) CUDA Cores/MP: 3584 CUDA Cores | ||
GPU Max Clock rate: 1867 MHz (1.87 GHz) | ||
Memory Clock rate: 7501 Mhz | ||
Memory Bus Width: 192-bit | ||
L2 Cache Size: 2359296 bytes | ||
Maximum Texture Dimension Size (x,y,z) 1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384) | ||
Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers | ||
Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers | ||
Total amount of constant memory: 65536 bytes | ||
Total amount of shared memory per block: 49152 bytes | ||
Total shared memory per multiprocessor: 102400 bytes | ||
Total number of registers available per block: 65536 | ||
Warp size: 32 | ||
Maximum number of threads per multiprocessor: 1536 | ||
Maximum number of threads per block: 1024 | ||
Max dimension size of a thread block (x,y,z): (1024, 1024, 64) | ||
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535) | ||
Maximum memory pitch: 2147483647 bytes | ||
Texture alignment: 512 bytes | ||
Concurrent copy and kernel execution: Yes with 2 copy engine(s) | ||
Run time limit on kernels: Yes | ||
Integrated GPU sharing Host Memory: No | ||
Support host page-locked memory mapping: Yes | ||
Alignment requirement for Surfaces: Yes | ||
Device has ECC support: Disabled | ||
Device supports Unified Addressing (UVA): Yes | ||
Device supports Managed Memory: Yes | ||
Device supports Compute Preemption: Yes | ||
Supports Cooperative Kernel Launch: Yes | ||
``` | ||
|
||
### CUDA Samples | ||
``` | ||
git clone [email protected]:NVIDIA/cuda-samples.git | ||
cd cuda-samples | ||
git checkout -b v9.2 tags/v9.2 | ||
make | ||
``` | ||
See: https://github.com/NVIDIA/cuda-samples | ||
|
||
### TODO | ||
- https://developer.nvidia.com/nvidia-visual-profiler | ||
- bandwidthTest | ||
|
||
### References | ||
- https://www.cs.cmu.edu/afs/cs/academic/class/15668-s11/www/cuda-doc/ | ||
- rules_cuda: https://github.com/bazel-contrib/rules_cuda |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#include <cuda.h> | ||
#include <cuda_runtime.h> | ||
|
||
#include "glog/logging.h" | ||
|
||
#include "experimental/cuda_example/mat_add.h" | ||
#include "experimental/cuda_example/vector_add.h" | ||
|
||
namespace qcraft { | ||
|
||
static void HandleError(cudaError_t err, const char* file, int line) { | ||
if (err != cudaSuccess) { | ||
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line); | ||
exit(EXIT_FAILURE); | ||
} | ||
} | ||
|
||
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) | ||
|
||
int Main(int argc, char* argv[]) { | ||
cudaDeviceProp dev_prop; | ||
int num_devices = 0; | ||
int driver_version; | ||
HANDLE_ERROR(cudaDriverGetVersion(&driver_version)); | ||
LOG(INFO) << "Cuda driver version: " << driver_version; | ||
|
||
int runtime_version = 0; | ||
HANDLE_ERROR(cudaRuntimeGetVersion(&runtime_version)); | ||
LOG(INFO) << "Cuda runtime version: " << runtime_version; | ||
|
||
HANDLE_ERROR(cudaGetDeviceCount(&num_devices)); | ||
|
||
CHECK_GT(num_devices, 0); | ||
LOG(INFO) << "GPU dvice num: " << num_devices; | ||
|
||
for (uint32_t dev = 0; dev < num_devices; dev++) { | ||
HANDLE_ERROR(cudaGetDeviceProperties(&dev_prop, dev)); | ||
LOG(INFO) << "dev: " << dev << ", name: " << dev_prop.name | ||
<< ", totalGlobalMem: " | ||
<< dev_prop.totalGlobalMem / 1024 / 1024 / 1024 << "GB" | ||
<< ", sm: " << dev_prop.multiProcessorCount | ||
<< ", block: " << dev_prop.sharedMemPerBlock | ||
<< ", maxThreadsPerBlock: " << dev_prop.maxThreadsPerBlock | ||
<< ", maxThreadsPerMultiProcessor: " | ||
<< dev_prop.maxThreadsPerMultiProcessor; | ||
} | ||
// VectorAddDemo(1000); | ||
MatAddDemo(100); | ||
return 0; | ||
} | ||
} // namespace qcraft | ||
|
||
int main(int argc, char* argv[]) { return qcraft::Main(argc, argv); } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
#include <cuda.h> | ||
#include <cuda_runtime.h> | ||
|
||
#include "glog/logging.h" | ||
|
||
#include "experimental/cuda_example/mat_add.h" | ||
|
||
namespace qcraft { | ||
|
||
static void HandleError(cudaError_t err, const char* file, int line) { | ||
if (err != cudaSuccess) { | ||
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line); | ||
exit(EXIT_FAILURE); | ||
} | ||
} | ||
|
||
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) | ||
|
||
__global__ void MatAdd(int* a, int* b, int* c, int num) { | ||
int i = blockIdx.x * blockDim.x + threadIdx.x; | ||
int j = blockIdx.y * blockDim.y + threadIdx.y; | ||
if (i < num && j < num) { | ||
int idx = i * num + j; | ||
c[idx] = a[idx] + b[idx]; | ||
} | ||
} | ||
|
||
void MatAddDemo(int num) { | ||
int m_a[num][num], m_b[num][num], m_c[num][num]; | ||
for (int i = 0; i < num; i++) { | ||
for (int j = 0; j < num; j++) { | ||
m_a[i][j] = 1; | ||
m_b[i][j] = 2; | ||
m_c[i][j] = 0; | ||
} | ||
} | ||
|
||
int total = num * num; | ||
int *a_gpu, *b_gpu, *c_gpu; | ||
|
||
HANDLE_ERROR(cudaMalloc((void**)&a_gpu, total * sizeof(int))); | ||
HANDLE_ERROR(cudaMalloc((void**)&b_gpu, total * sizeof(int))); | ||
HANDLE_ERROR(cudaMalloc((void**)&c_gpu, total * sizeof(int))); | ||
|
||
HANDLE_ERROR( | ||
cudaMemcpy(a_gpu, m_a, total * sizeof(int), cudaMemcpyHostToDevice)); | ||
HANDLE_ERROR( | ||
cudaMemcpy(b_gpu, m_b, total * sizeof(int), cudaMemcpyHostToDevice)); | ||
|
||
dim3 grid_size(10, 10); | ||
dim3 block_size((num + grid_size.x - 1) / grid_size.x, | ||
(num + grid_size.y - 1) / grid_size.y); | ||
|
||
MatAdd<<<grid_size, block_size>>>(a_gpu, b_gpu, c_gpu, num); | ||
HANDLE_ERROR( | ||
cudaMemcpy(m_c, c_gpu, total * sizeof(int), cudaMemcpyDeviceToHost)); | ||
|
||
// Release device memory | ||
cudaFree(a_gpu); | ||
cudaFree(b_gpu); | ||
cudaFree(c_gpu); | ||
cudaDeviceReset(); | ||
|
||
for (int i = 0; i < num; i++) { | ||
for (int j = 0; j < num; j++) { | ||
printf("%d ", m_c[i][j]); | ||
} | ||
printf("\n"); | ||
} | ||
printf("\n"); | ||
} | ||
|
||
} // namespace qcraft |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#ifndef EXPERIMENTAL_CUDA_EXAMPLE_MAT_ADD_H_ | ||
#define EXPERIMENTAL_CUDA_EXAMPLE_MAT_ADD_H_ | ||
|
||
namespace qcraft { | ||
|
||
void MatAddDemo(int num); | ||
} | ||
#endif // EXPERIMENTAL_CUDA_EXAMPLE_MAT_ADD_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
#include <cuda.h> | ||
#include <cuda_runtime.h> | ||
|
||
#include "glog/logging.h" | ||
|
||
#include "experimental/cuda_example/vector_add.h" | ||
|
||
namespace qcraft { | ||
|
||
static void HandleError(cudaError_t err, const char* file, int line) { | ||
if (err != cudaSuccess) { | ||
printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line); | ||
exit(EXIT_FAILURE); | ||
} | ||
} | ||
|
||
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) | ||
|
||
__global__ void VectorAdd(int* a, int* b, int* c, int num) { | ||
int i = threadIdx.x; | ||
if (i < num) { | ||
c[i] = a[i] + b[i]; | ||
} | ||
} | ||
|
||
void VectorAddDemo(int num) { | ||
int a[num], b[num], c[num]; | ||
for (int i = 0; i < num; i++) { | ||
a[i] = i; | ||
b[i] = i * i; | ||
} | ||
|
||
int block_num = (num + 100 - 1) / 100; | ||
int thread_num = 100; | ||
|
||
int *a_gpu, *b_gpu, *c_gpu; | ||
HANDLE_ERROR(cudaMalloc((void**)&a_gpu, num * sizeof(int))); | ||
HANDLE_ERROR(cudaMalloc((void**)&b_gpu, num * sizeof(int))); | ||
HANDLE_ERROR(cudaMalloc((void**)&c_gpu, num * sizeof(int))); | ||
|
||
HANDLE_ERROR(cudaMemcpy(a_gpu, a, num * sizeof(int), cudaMemcpyHostToDevice)); | ||
HANDLE_ERROR(cudaMemcpy(b_gpu, b, num * sizeof(int), cudaMemcpyHostToDevice)); | ||
|
||
VectorAdd<<<block_num, thread_num>>>(a_gpu, b_gpu, c_gpu, num); | ||
|
||
HANDLE_ERROR(cudaMemcpy(c, c_gpu, num * sizeof(int), cudaMemcpyDeviceToHost)); | ||
|
||
for (int i = 0; i < num; i++) { | ||
printf("%d + %d = %d\n", a[i], b[i], c[i]); | ||
} | ||
|
||
// Release device memory | ||
cudaFree(a_gpu); | ||
cudaFree(b_gpu); | ||
cudaFree(c_gpu); | ||
cudaDeviceReset(); | ||
} | ||
|
||
} // namespace qcraft |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
#ifndef EXPERIMENTAL_CUDA_EXAMPLE_VECTOR_ADD_H_ | ||
#define EXPERIMENTAL_CUDA_EXAMPLE_VECTOR_ADD_H_ | ||
|
||
namespace qcraft { | ||
void VectorAddDemo(int num); | ||
} | ||
#endif // EXPERIMENTAL_CUDA_EXAMPLE_VECTOR_ADD_H_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
package(default_visibility = ["//visibility:public"]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
## rules_cuda | ||
See: https://github.com/bazel-contrib/rules_cuda |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
"""Loads the rules_cuda library""" | ||
|
||
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") | ||
|
||
def repo(): | ||
http_archive( | ||
name = "rules_cuda", | ||
sha256 = "2f8c8c8c85f727bec4423efecec12d3b751cb0a98bda99f0f9d351608a23b858", | ||
strip_prefix = "rules_cuda-v0.2.1", | ||
urls = ["https://github.com/bazel-contrib/rules_cuda/releases/download/v0.2.1/rules_cuda-v0.2.1.tar.gz"], | ||
) |