From 0ab44bdde46ac828f154168deb123095d915f9f3 Mon Sep 17 00:00:00 2001 From: kwoodle Date: Sun, 10 May 2020 23:13:31 -0400 Subject: [PATCH] Initial commit --- .idea/.gitignore | 8 + .idea/CudaSamples.iml | 2 + .idea/misc.xml | 7 + .idea/modules.xml | 8 + .idea/runConfigurations/CudaSamples.xml | 7 + .idea/runConfigurations/asyncAPI.xml | 7 + .../runConfigurations/cudaTensorCoreGemm.xml | 7 + .idea/runConfigurations/simpleMPI.xml | 7 + .idea/vcs.xml | 6 + CMakeLists.txt | 32 + asyncAPI.cu | 127 +++ bandwidthTest.cu | 926 ++++++++++++++++++ cudaTensorCoreGemm.cu | 586 +++++++++++ main.cu | 6 + mpirun.sh | 2 + run_bandwidthTest.sh | 2 + simpleMPI.cpp | 122 +++ simpleMPI.cu | 91 ++ simpleMPI.h | 32 + 19 files changed, 1985 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/CudaSamples.iml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/runConfigurations/CudaSamples.xml create mode 100644 .idea/runConfigurations/asyncAPI.xml create mode 100644 .idea/runConfigurations/cudaTensorCoreGemm.xml create mode 100644 .idea/runConfigurations/simpleMPI.xml create mode 100644 .idea/vcs.xml create mode 100644 CMakeLists.txt create mode 100644 asyncAPI.cu create mode 100644 bandwidthTest.cu create mode 100644 cudaTensorCoreGemm.cu create mode 100644 main.cu create mode 100755 mpirun.sh create mode 100755 run_bandwidthTest.sh create mode 100644 simpleMPI.cpp create mode 100644 simpleMPI.cu create mode 100644 simpleMPI.h diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..73f69e0 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/.idea/CudaSamples.iml b/.idea/CudaSamples.iml new file mode 100644 index 0000000..f08604b --- /dev/null +++ b/.idea/CudaSamples.iml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..8822db8 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..5217dda --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/runConfigurations/CudaSamples.xml b/.idea/runConfigurations/CudaSamples.xml new file mode 100644 index 0000000..04419df --- /dev/null +++ b/.idea/runConfigurations/CudaSamples.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/runConfigurations/asyncAPI.xml b/.idea/runConfigurations/asyncAPI.xml new file mode 100644 index 0000000..db32816 --- /dev/null +++ b/.idea/runConfigurations/asyncAPI.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/runConfigurations/cudaTensorCoreGemm.xml b/.idea/runConfigurations/cudaTensorCoreGemm.xml new file mode 100644 index 0000000..57250fd --- /dev/null +++ b/.idea/runConfigurations/cudaTensorCoreGemm.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/runConfigurations/simpleMPI.xml b/.idea/runConfigurations/simpleMPI.xml new file mode 100644 index 0000000..f1766a5 --- /dev/null +++ b/.idea/runConfigurations/simpleMPI.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ 
+ + + + + + \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..c97a31a --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,32 @@ +# Needs CUDAHOSTCXX=/usr/bin/cuda-g++;CUDA_LIB_PATH=/usr/local/cuda/lib64 in +# CLion Settings | Build, Execution, Deployment | CMake > Environment + +cmake_minimum_required(VERSION 3.16) +project(CudaSamples LANGUAGES CXX CUDA) + +set(CMAKE_CUDA_STANDARD 14) +set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-arch=sm_75") +include_directories(/usr/local/cuda/samples/common/inc) + +set(CUDA_VERBOSE_BUILD ON) +set(CUDA_SEPARABLE_COMPILATION ON) # Allow multiple CUDA files compilation + +add_executable(CudaSamples main.cu) + +set_target_properties( + CudaSamples + PROPERTIES + CUDA_SEPARABLE_COMPILATION ON) + +add_executable(asyncAPI asyncAPI.cu) + +find_package(MPI REQUIRED) +include_directories(${MPI_CXX_INCLUDE_DIRS}) + +add_executable(simpleMPI simpleMPI.cpp simpleMPI.h simpleMPI.cu) +target_link_libraries(simpleMPI ${MPI_CXX_LIBRARIES}) + +add_executable(cudaTensorCoreGemm cudaTensorCoreGemm.cu) +target_link_libraries(cudaTensorCoreGemm) + +add_executable(bandwidthTest bandwidthTest.cu) diff --git a/asyncAPI.cu b/asyncAPI.cu new file mode 100644 index 0000000..dec2def --- /dev/null +++ b/asyncAPI.cu @@ -0,0 +1,127 @@ +// +// Created by kwoodle on 5/10/20. +// + +//////////////////////////////////////////////////////////////////////////// +// +// Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +// +// Please refer to the NVIDIA end user license agreement (EULA) associated +// with this source code for terms and conditions that govern your use of +// this software. Any use, reproduction, disclosure, or distribution of +// this software and related documentation outside the terms of the EULA +// is strictly prohibited. +// +//////////////////////////////////////////////////////////////////////////// + +// +// This sample illustrates the usage of CUDA events for both GPU timing and +// overlapping CPU and GPU execution. Events are inserted into a stream +// of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can +// perform computations while GPU is executing (including DMA memcopies +// between the host and device). CPU can query CUDA events to determine +// whether GPU has completed tasks. +// + +// includes, system +#include + +// includes CUDA Runtime +#include + +// includes, project +#include +#include // helper utility functions + +__global__ void increment_kernel(int *g_data, int inc_value) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + g_data[idx] = g_data[idx] + inc_value; +} + +bool correct_output(int *data, const int n, const int x) { + for (int i = 0; i < n; i++) + if (data[i] != x) { + printf("Error! 
data[%d] = %d, ref = %d\n", i, data[i], x); + return false; + } + + return true; +} + +int main(int argc, char *argv[]) { + int devID; + cudaDeviceProp deviceProps; + + printf("[%s] - Starting...\n", argv[0]); + + // This will pick the best possible CUDA capable device + devID = findCudaDevice(argc, (const char **) argv); + + // get device name + checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); + printf("CUDA device [%s]\n", deviceProps.name); + + int n = 16 * 1024 * 1024; + int nbytes = n * sizeof(int); + int value = 26; + + // allocate host memory + int *a = 0; + checkCudaErrors(cudaMallocHost((void **) &a, nbytes)); + memset(a, 0, nbytes); + + // allocate device memory + int *d_a = 0; + checkCudaErrors(cudaMalloc((void **) &d_a, nbytes)); + checkCudaErrors(cudaMemset(d_a, 255, nbytes)); + + // set kernel launch configuration + dim3 threads = dim3(512, 1); + dim3 blocks = dim3(n / threads.x, 1); + + // create cuda event handles + cudaEvent_t start, stop; + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + + StopWatchInterface *timer = NULL; + sdkCreateTimer(&timer); + sdkResetTimer(&timer); + + checkCudaErrors(cudaDeviceSynchronize()); + float gpu_time = 0.0f; + + // asynchronously issue work to the GPU (all to stream 0) + sdkStartTimer(&timer); + cudaEventRecord(start, 0); + cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0); + increment_kernel<<>>(d_a, value); + cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0); + cudaEventRecord(stop, 0); + sdkStopTimer(&timer); + + // have CPU do some work while waiting for stage 1 to finish + unsigned long int counter = 0; + + while (cudaEventQuery(stop) == cudaErrorNotReady) { + counter++; + } + + checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop)); + + // print the cpu and gpu times + printf("time spent executing by the GPU: %.2f\n", gpu_time); + printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer)); + printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter); + + // check the output for correctness + bool bFinalResults = correct_output(a, n, value); + + // release resources + checkCudaErrors(cudaEventDestroy(start)); + checkCudaErrors(cudaEventDestroy(stop)); + checkCudaErrors(cudaFreeHost(a)); + checkCudaErrors(cudaFree(d_a)); + + exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE); +} \ No newline at end of file diff --git a/bandwidthTest.cu b/bandwidthTest.cu new file mode 100644 index 0000000..b4d7e08 --- /dev/null +++ b/bandwidthTest.cu @@ -0,0 +1,926 @@ +// +// Created by kwoodle on 5/10/20. +// + +/* + * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* + * This is a simple test program to measure the memcopy bandwidth of the GPU. + * It can measure device to device copy bandwidth, host to device copy bandwidth + * for pageable and pinned memory, and device to host copy bandwidth for pageable + * and pinned memory. + * + * Usage: + * ./bandwidthTest [option]... 
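+ *
+ * Example invocations (the first mirrors run_bandwidthTest.sh in this patch, the
+ * second is the range-mode example printed by --help):
+ *   ./bandwidthTest --mode=shmoo --memory=pinned
+ *   ./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 --increment=1024 --dtoh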
+ */ + +// CUDA runtime +#include + +// includes +#include // helper for shared functions common to CUDA Samples +#include // helper functions for CUDA error checking and initialization + +#include + +#include +#include +#include + +static const char *sSDKsample = "CUDA Bandwidth Test"; + +// defines, project +#define MEMCOPY_ITERATIONS 100 +#define DEFAULT_SIZE ( 32 * (1e6) ) //32 M +#define DEFAULT_INCREMENT ( 4 * (1e6) ) //4 M +#define CACHE_CLEAR_SIZE ( 16 * (1e6) ) //16 M + +//shmoo mode defines +#define SHMOO_MEMSIZE_MAX (64 * (1e6)) //64 M +#define SHMOO_MEMSIZE_START (1e3) //1 KB +#define SHMOO_INCREMENT_1KB (1e3) //1 KB +#define SHMOO_INCREMENT_2KB (2 * 1e3) //2 KB +#define SHMOO_INCREMENT_10KB (10 * (1e3)) //10KB +#define SHMOO_INCREMENT_100KB (100 * (1e3)) //100 KB +#define SHMOO_INCREMENT_1MB (1e6) //1 MB +#define SHMOO_INCREMENT_2MB (2 * 1e6) //2 MB +#define SHMOO_INCREMENT_4MB (4 * 1e6) //4 MB +#define SHMOO_LIMIT_20KB (20 * (1e3)) //20 KB +#define SHMOO_LIMIT_50KB (50 * (1e3)) //50 KB +#define SHMOO_LIMIT_100KB (100 * (1e3)) //100 KB +#define SHMOO_LIMIT_1MB (1e6) //1 MB +#define SHMOO_LIMIT_16MB (16 * 1e6) //16 MB +#define SHMOO_LIMIT_32MB (32 * 1e6) //32 MB + +//enums, project +enum testMode { + QUICK_MODE, RANGE_MODE, SHMOO_MODE +}; +enum memcpyKind { + DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE +}; +enum printMode { + USER_READABLE, CSV +}; +enum memoryMode { + PINNED, PAGEABLE +}; + +const char *sMemoryCopyKind[] = + { + "Device to Host", + "Host to Device", + "Device to Device", + NULL + }; + +const char *sMemoryMode[] = + { + "PINNED", + "PAGEABLE", + NULL + }; + +// if true, use CPU based timing for everything +static bool bDontUseGPUTiming; + +int *pArgc = NULL; +char **pArgv = NULL; + +//////////////////////////////////////////////////////////////////////////////// +// declaration, forward +int runTest(const int argc, const char **argv); + +void testBandwidth(unsigned int start, unsigned int end, unsigned int increment, + testMode mode, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, + int endDevice, bool wc); + +void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, + int endDevice, bool wc); + +void testBandwidthRange(unsigned int start, unsigned int end, unsigned int increment, + memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, + bool wc); + +void +testBandwidthShmoo(memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc); + +float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc); + +float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc); + +float testDeviceToDeviceTransfer(unsigned int memSize); + +void printResultsReadable(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, + memoryMode memMode, int iNumDevs, bool wc); + +void +printResultsCSV(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode, + int iNumDevs, bool wc); + +void printHelp(void); + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) { + pArgc = &argc; + pArgv = argv; + + // set logfile name and start logs + printf("[%s] - Starting...\n", sSDKsample); + + int iRetVal = runTest(argc, (const char **) argv); + + if (iRetVal < 0) { + 
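+        // runTest() returns a negative code on invalid arguments or setup failure
+        // (e.g. -1000 for a bad --memory value, -2000 when no CUDA device is found).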
checkCudaErrors(cudaSetDevice(0)); + } + + // finish + printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL"); + + printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n"); + + exit((iRetVal == 0) ? EXIT_SUCCESS : EXIT_FAILURE); +} + +/////////////////////////////////////////////////////////////////////////////// +//Parse args, run the appropriate tests +/////////////////////////////////////////////////////////////////////////////// +int runTest(const int argc, const char **argv) { + int start = DEFAULT_SIZE; + int end = DEFAULT_SIZE; + int startDevice = 0; + int endDevice = 0; + int increment = DEFAULT_INCREMENT; + testMode mode = QUICK_MODE; + bool htod = false; + bool dtoh = false; + bool dtod = false; + bool wc = false; + char *modeStr; + char *device = NULL; + printMode printmode = USER_READABLE; + char *memModeStr = NULL; + memoryMode memMode = PINNED; + + //process command line args + if (checkCmdLineFlag(argc, argv, "help")) { + printHelp(); + return 0; + } + + if (checkCmdLineFlag(argc, argv, "csv")) { + printmode = CSV; + } + + if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) { + if (strcmp(memModeStr, "pageable") == 0) { + memMode = PAGEABLE; + } else if (strcmp(memModeStr, "pinned") == 0) { + memMode = PINNED; + } else { + printf("Invalid memory mode - valid modes are pageable or pinned\n"); + printf("See --help for more information\n"); + return -1000; + } + } else { + //default - pinned memory + memMode = PINNED; + } + + if (getCmdLineArgumentString(argc, argv, "device", &device)) { + int deviceCount; + cudaError_t error_id = cudaGetDeviceCount(&deviceCount); + + if (error_id != cudaSuccess) { + printf("cudaGetDeviceCount returned %d\n-> %s\n", (int) error_id, cudaGetErrorString(error_id)); + exit(EXIT_FAILURE); + } + + if (deviceCount == 0) { + printf("!!!!!No devices found!!!!!\n"); + return -2000; + } + + if (strcmp(device, "all") == 0) { + printf("\n!!!!!Cumulative Bandwidth to be computed from all the devices !!!!!!\n\n"); + startDevice = 0; + endDevice = deviceCount - 1; + } else { + startDevice = endDevice = atoi(device); + + if (startDevice >= deviceCount || startDevice < 0) { + printf("\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", startDevice, 0); + startDevice = endDevice = 0; + } + } + } + + printf("Running on...\n\n"); + + for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) { + cudaDeviceProp deviceProp; + cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice); + + if (error_id == cudaSuccess) { + printf(" Device %d: %s\n", currentDevice, deviceProp.name); + + if (deviceProp.computeMode == cudaComputeModeProhibited) { + fprintf(stderr, + "Error: device is running in , no threads can use ::cudaSetDevice().\n"); + checkCudaErrors(cudaSetDevice(currentDevice)); + + exit(EXIT_FAILURE); + } + } else { + printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int) error_id, cudaGetErrorString(error_id)); + checkCudaErrors(cudaSetDevice(currentDevice)); + + exit(EXIT_FAILURE); + } + } + + if (getCmdLineArgumentString(argc, argv, "mode", &modeStr)) { + //figure out the mode + if (strcmp(modeStr, "quick") == 0) { + printf(" Quick Mode\n\n"); + mode = QUICK_MODE; + } else if (strcmp(modeStr, "shmoo") == 0) { + printf(" Shmoo Mode\n\n"); + mode = SHMOO_MODE; + } else if (strcmp(modeStr, "range") == 0) { + printf(" Range Mode\n\n"); + mode = RANGE_MODE; + } else { + printf("Invalid mode - valid 
modes are quick, range, or shmoo\n"); + printf("See --help for more information\n"); + return -3000; + } + } else { + //default mode - quick + printf(" Quick Mode\n\n"); + mode = QUICK_MODE; + } + + if (checkCmdLineFlag(argc, argv, "htod")) { + htod = true; + } + + if (checkCmdLineFlag(argc, argv, "dtoh")) { + dtoh = true; + } + + if (checkCmdLineFlag(argc, argv, "dtod")) { + dtod = true; + } + +#if CUDART_VERSION >= 2020 + + if (checkCmdLineFlag(argc, argv, "wc")) { + wc = true; + } + +#endif + + if (checkCmdLineFlag(argc, argv, "cputiming")) { + bDontUseGPUTiming = true; + } + + if (!htod && !dtoh && !dtod) { + //default: All + htod = true; + dtoh = true; + dtod = true; + } + + if (RANGE_MODE == mode) { + if (checkCmdLineFlag(argc, (const char **) argv, "start")) { + start = getCmdLineArgumentInt(argc, argv, "start"); + + if (start <= 0) { + printf("Illegal argument - start must be greater than zero\n"); + return -4000; + } + } else { + printf("Must specify a starting size in range mode\n"); + printf("See --help for more information\n"); + return -5000; + } + + if (checkCmdLineFlag(argc, (const char **) argv, "end")) { + end = getCmdLineArgumentInt(argc, argv, "end"); + + if (end <= 0) { + printf("Illegal argument - end must be greater than zero\n"); + return -6000; + } + + if (start > end) { + printf("Illegal argument - start is greater than end\n"); + return -7000; + } + } else { + printf("Must specify an end size in range mode.\n"); + printf("See --help for more information\n"); + return -8000; + } + + + if (checkCmdLineFlag(argc, argv, "increment")) { + increment = getCmdLineArgumentInt(argc, argv, "increment"); + + if (increment <= 0) { + printf("Illegal argument - increment must be greater than zero\n"); + return -9000; + } + } else { + printf("Must specify an increment in user mode\n"); + printf("See --help for more information\n"); + return -10000; + } + } + + if (htod) { + testBandwidth((unsigned int) start, (unsigned int) end, (unsigned int) increment, + mode, HOST_TO_DEVICE, printmode, memMode, startDevice, endDevice, wc); + } + + if (dtoh) { + testBandwidth((unsigned int) start, (unsigned int) end, (unsigned int) increment, + mode, DEVICE_TO_HOST, printmode, memMode, startDevice, endDevice, wc); + } + + if (dtod) { + testBandwidth((unsigned int) start, (unsigned int) end, (unsigned int) increment, + mode, DEVICE_TO_DEVICE, printmode, memMode, startDevice, endDevice, wc); + } + + // Ensure that we reset all CUDA Devices in question + for (int nDevice = startDevice; nDevice <= endDevice; nDevice++) { + cudaSetDevice(nDevice); + } + + return 0; +} + +/////////////////////////////////////////////////////////////////////////////// +// Run a bandwidth test +/////////////////////////////////////////////////////////////////////////////// +void +testBandwidth(unsigned int start, unsigned int end, unsigned int increment, + testMode mode, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, + bool wc) { + switch (mode) { + case QUICK_MODE: + testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, endDevice, wc); + break; + + case RANGE_MODE: + testBandwidthRange(start, end, increment, kind, printmode, memMode, startDevice, endDevice, wc); + break; + + case SHMOO_MODE: + testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc); + break; + + default: + break; + } +} + +////////////////////////////////////////////////////////////////////// +// Run a quick mode bandwidth test 
+////////////////////////////////////////////////////////////////////// +void +testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, + int endDevice, bool wc) { + testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, startDevice, endDevice, wc); +} + +/////////////////////////////////////////////////////////////////////// +// Run a range mode bandwidth test +////////////////////////////////////////////////////////////////////// +void +testBandwidthRange(unsigned int start, unsigned int end, unsigned int increment, + memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc) { + //count the number of copies we're going to run + unsigned int count = 1 + ((end - start) / increment); + + unsigned int *memSizes = (unsigned int *) malloc(count * sizeof(unsigned int)); + double *bandwidths = (double *) malloc(count * sizeof(double)); + + // Before calculating the cumulative bandwidth, initialize bandwidths array to NULL + for (unsigned int i = 0; i < count; i++) { + bandwidths[i] = 0.0; + } + + // Use the device asked by the user + for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) { + cudaSetDevice(currentDevice); + + //run each of the copies + for (unsigned int i = 0; i < count; i++) { + + memSizes[i] = start + i * increment; + + switch (kind) { + case DEVICE_TO_HOST: + bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc); + break; + + case HOST_TO_DEVICE: + bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc); + break; + + case DEVICE_TO_DEVICE: + bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]); + break; + } + } + } // Complete the bandwidth computation on all the devices + + //print results + if (printmode == CSV) { + printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); + } else { + printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); + } + + //clean up + free(memSizes); + free(bandwidths); +} + +////////////////////////////////////////////////////////////////////////////// +// Intense shmoo mode - covers a large range of values with varying increments +////////////////////////////////////////////////////////////////////////////// +void +testBandwidthShmoo(memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc) { + //count the number of copies to make + unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) + + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) + + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) + + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) + + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) + + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) + + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB); + + unsigned int *memSizes = (unsigned int *) malloc(count * sizeof(unsigned int)); + double *bandwidths = (double *) malloc(count * sizeof(double)); + + // Before calculating the cumulative bandwidth, initialize bandwidths array to NULL + for (unsigned int i = 0; i < count; i++) { + bandwidths[i] = 0.0; + } + + // Use the device asked by the user + for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) { + cudaSetDevice(currentDevice); + //Run the shmoo + int iteration = 0; + unsigned int memSize = 0; + + while (memSize <= 
SHMOO_MEMSIZE_MAX) { + if (memSize < SHMOO_LIMIT_20KB) { + memSize += SHMOO_INCREMENT_1KB; + } else if (memSize < SHMOO_LIMIT_50KB) { + memSize += SHMOO_INCREMENT_2KB; + } else if (memSize < SHMOO_LIMIT_100KB) { + memSize += SHMOO_INCREMENT_10KB; + } else if (memSize < SHMOO_LIMIT_1MB) { + memSize += SHMOO_INCREMENT_100KB; + } else if (memSize < SHMOO_LIMIT_16MB) { + memSize += SHMOO_INCREMENT_1MB; + } else if (memSize < SHMOO_LIMIT_32MB) { + memSize += SHMOO_INCREMENT_2MB; + } else { + memSize += SHMOO_INCREMENT_4MB; + } + + memSizes[iteration] = memSize; + + switch (kind) { + case DEVICE_TO_HOST: + bandwidths[iteration] += testDeviceToHostTransfer(memSizes[iteration], memMode, wc); + break; + + case HOST_TO_DEVICE: + bandwidths[iteration] += testHostToDeviceTransfer(memSizes[iteration], memMode, wc); + break; + + case DEVICE_TO_DEVICE: + bandwidths[iteration] += testDeviceToDeviceTransfer(memSizes[iteration]); + break; + } + + iteration++; + printf("."); + } + } // Complete the bandwidth computation on all the devices + + //print results + printf("\n"); + + if (CSV == printmode) { + printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); + } else { + printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc); + } + + //clean up + free(memSizes); + free(bandwidths); +} + +/////////////////////////////////////////////////////////////////////////////// +// test the bandwidth of a device to host memcopy of a specific size +/////////////////////////////////////////////////////////////////////////////// +float +testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc) { + StopWatchInterface *timer = NULL; + float elapsedTimeInMs = 0.0f; + float bandwidthInGBs = 0.0f; + unsigned char *h_idata = NULL; + unsigned char *h_odata = NULL; + cudaEvent_t start, stop; + + sdkCreateTimer(&timer); + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + + //allocate host memory + if (PINNED == memMode) { + //pinned memory mode - use special function to get OS-pinned memory +#if CUDART_VERSION >= 2020 + checkCudaErrors(cudaHostAlloc((void **) &h_idata, memSize, (wc) ? cudaHostAllocWriteCombined : 0)); + checkCudaErrors(cudaHostAlloc((void **) &h_odata, memSize, (wc) ? 
cudaHostAllocWriteCombined : 0)); +#else + checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize)); + checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); +#endif + } else { + //pageable memory mode - use malloc + h_idata = (unsigned char *) malloc(memSize); + h_odata = (unsigned char *) malloc(memSize); + + if (h_idata == 0 || h_odata == 0) { + fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); + exit(EXIT_FAILURE); + } + } + + //initialize the memory + for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { + h_idata[i] = (unsigned char) (i & 0xff); + } + + // allocate device memory + unsigned char *d_idata; + checkCudaErrors(cudaMalloc((void **) &d_idata, memSize)); + + //initialize the device memory + checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, + cudaMemcpyHostToDevice)); + + //copy data from GPU to Host + sdkStartTimer(&timer); + checkCudaErrors(cudaEventRecord(start, 0)); + + if (PINNED == memMode) { + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { + checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize, + cudaMemcpyDeviceToHost, 0)); + } + } else { + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { + checkCudaErrors(cudaMemcpy(h_odata, d_idata, memSize, + cudaMemcpyDeviceToHost)); + } + } + + checkCudaErrors(cudaEventRecord(stop, 0)); + + // make sure GPU has finished copying + checkCudaErrors(cudaDeviceSynchronize()); + //get the total elapsed time in ms + sdkStopTimer(&timer); + checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); + + if (PINNED != memMode || bDontUseGPUTiming) { + elapsedTimeInMs = sdkGetTimerValue(&timer); + } + + //calculate bandwidth in GB/s + double time_s = elapsedTimeInMs / 1e3; + bandwidthInGBs = (memSize * (float) MEMCOPY_ITERATIONS) / (double) 1e9; + bandwidthInGBs = bandwidthInGBs / time_s; + //clean up memory + checkCudaErrors(cudaEventDestroy(stop)); + checkCudaErrors(cudaEventDestroy(start)); + sdkDeleteTimer(&timer); + + if (PINNED == memMode) { + checkCudaErrors(cudaFreeHost(h_idata)); + checkCudaErrors(cudaFreeHost(h_odata)); + } else { + free(h_idata); + free(h_odata); + } + + checkCudaErrors(cudaFree(d_idata)); + + return bandwidthInGBs; +} + +/////////////////////////////////////////////////////////////////////////////// +//! test the bandwidth of a host to device memcopy of a specific size +/////////////////////////////////////////////////////////////////////////////// +float +testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc) { + StopWatchInterface *timer = NULL; + float elapsedTimeInMs = 0.0f; + float bandwidthInGBs = 0.0f; + cudaEvent_t start, stop; + sdkCreateTimer(&timer); + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + + //allocate host memory + unsigned char *h_odata = NULL; + + if (PINNED == memMode) { +#if CUDART_VERSION >= 2020 + //pinned memory mode - use special function to get OS-pinned memory + checkCudaErrors(cudaHostAlloc((void **) &h_odata, memSize, (wc) ? 
cudaHostAllocWriteCombined : 0)); +#else + //pinned memory mode - use special function to get OS-pinned memory + checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); +#endif + } else { + //pageable memory mode - use malloc + h_odata = (unsigned char *) malloc(memSize); + + if (h_odata == 0) { + fprintf(stderr, "Not enough memory available on host to run test!\n"); + exit(EXIT_FAILURE); + } + } + + unsigned char *h_cacheClear1 = (unsigned char *) malloc(CACHE_CLEAR_SIZE); + unsigned char *h_cacheClear2 = (unsigned char *) malloc(CACHE_CLEAR_SIZE); + + if (h_cacheClear1 == 0 || h_cacheClear2 == 0) { + fprintf(stderr, "Not enough memory available on host to run test!\n"); + exit(EXIT_FAILURE); + } + + //initialize the memory + for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { + h_odata[i] = (unsigned char) (i & 0xff); + } + + for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++) { + h_cacheClear1[i] = (unsigned char) (i & 0xff); + h_cacheClear2[i] = (unsigned char) (0xff - (i & 0xff)); + } + + //allocate device memory + unsigned char *d_idata; + checkCudaErrors(cudaMalloc((void **) &d_idata, memSize)); + + sdkStartTimer(&timer); + checkCudaErrors(cudaEventRecord(start, 0)); + + //copy host memory to device memory + if (PINNED == memMode) { + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { + checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize, + cudaMemcpyHostToDevice, 0)); + } + } else { + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { + checkCudaErrors(cudaMemcpy(d_idata, h_odata, memSize, + cudaMemcpyHostToDevice)); + } + } + + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaDeviceSynchronize()); + //total elapsed time in ms + sdkStopTimer(&timer); + checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); + + if (PINNED != memMode || bDontUseGPUTiming) { + elapsedTimeInMs = sdkGetTimerValue(&timer); + } + + sdkResetTimer(&timer); + + //calculate bandwidth in GB/s + double time_s = elapsedTimeInMs / 1e3; + bandwidthInGBs = (memSize * (float) MEMCOPY_ITERATIONS) / (double) 1e9; + bandwidthInGBs = bandwidthInGBs / time_s; + //clean up memory + checkCudaErrors(cudaEventDestroy(stop)); + checkCudaErrors(cudaEventDestroy(start)); + sdkDeleteTimer(&timer); + + if (PINNED == memMode) { + checkCudaErrors(cudaFreeHost(h_odata)); + } else { + free(h_odata); + } + + free(h_cacheClear1); + free(h_cacheClear2); + checkCudaErrors(cudaFree(d_idata)); + + return bandwidthInGBs; +} + +/////////////////////////////////////////////////////////////////////////////// +//! 
test the bandwidth of a device to device memcopy of a specific size +/////////////////////////////////////////////////////////////////////////////// +float +testDeviceToDeviceTransfer(unsigned int memSize) { + StopWatchInterface *timer = NULL; + float elapsedTimeInMs = 0.0f; + float bandwidthInGBs = 0.0f; + cudaEvent_t start, stop; + + sdkCreateTimer(&timer); + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + + //allocate host memory + unsigned char *h_idata = (unsigned char *) malloc(memSize); + + if (h_idata == 0) { + fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); + exit(EXIT_FAILURE); + } + + //initialize the host memory + for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { + h_idata[i] = (unsigned char) (i & 0xff); + } + + //allocate device memory + unsigned char *d_idata; + checkCudaErrors(cudaMalloc((void **) &d_idata, memSize)); + unsigned char *d_odata; + checkCudaErrors(cudaMalloc((void **) &d_odata, memSize)); + + //initialize memory + checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, + cudaMemcpyHostToDevice)); + + //run the memcopy + sdkStartTimer(&timer); + checkCudaErrors(cudaEventRecord(start, 0)); + + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { + checkCudaErrors(cudaMemcpy(d_odata, d_idata, memSize, + cudaMemcpyDeviceToDevice)); + } + + checkCudaErrors(cudaEventRecord(stop, 0)); + + //Since device to device memory copies are non-blocking, + //cudaDeviceSynchronize() is required in order to get + //proper timing. + checkCudaErrors(cudaDeviceSynchronize()); + + //get the total elapsed time in ms + sdkStopTimer(&timer); + checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); + + if (bDontUseGPUTiming) { + elapsedTimeInMs = sdkGetTimerValue(&timer); + } + + //calculate bandwidth in GB/s + double time_s = elapsedTimeInMs / 1e3; + bandwidthInGBs = (2.0f * memSize * (float) MEMCOPY_ITERATIONS) / (double) 1e9; + bandwidthInGBs = bandwidthInGBs / time_s; + + //clean up memory + sdkDeleteTimer(&timer); + free(h_idata); + checkCudaErrors(cudaEventDestroy(stop)); + checkCudaErrors(cudaEventDestroy(start)); + checkCudaErrors(cudaFree(d_idata)); + checkCudaErrors(cudaFree(d_odata)); + + return bandwidthInGBs; +} + +///////////////////////////////////////////////////////// +//print results in an easily read format +//////////////////////////////////////////////////////// +void printResultsReadable(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, + memoryMode memMode, int iNumDevs, bool wc) { + printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs); + printf(" %s Memory Transfers\n", sMemoryMode[memMode]); + + if (wc) { + printf(" Write-Combined Memory Writes are Enabled"); + } + + printf(" Transfer Size (Bytes)\tBandwidth(GB/s)\n"); + unsigned int i; + + for (i = 0; i < (count - 1); i++) { + printf(" %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]); + } + + printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000) ? 
"\t" : "", bandwidths[i]); +} + +/////////////////////////////////////////////////////////////////////////// +//print results in a database format +/////////////////////////////////////////////////////////////////////////// +void +printResultsCSV(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode, + int iNumDevs, bool wc) { + std::string sConfig; + + // log config information + if (kind == DEVICE_TO_DEVICE) { + sConfig += "D2D"; + } else { + if (kind == DEVICE_TO_HOST) { + sConfig += "D2H"; + } else if (kind == HOST_TO_DEVICE) { + sConfig += "H2D"; + } + + if (memMode == PAGEABLE) { + sConfig += "-Paged"; + } else if (memMode == PINNED) { + sConfig += "-Pinned"; + + if (wc) { + sConfig += "-WriteCombined"; + } + } + } + + unsigned int i; + double dSeconds = 0.0; + + for (i = 0; i < count; i++) { + dSeconds = (double) memSizes[i] / (bandwidths[i] * (double) (1 << 20)); + printf("bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u bytes, NumDevsUsed = %d\n", + sConfig.c_str(), bandwidths[i], dSeconds, memSizes[i], iNumDevs); + } +} + +/////////////////////////////////////////////////////////////////////////// +//Print help screen +/////////////////////////////////////////////////////////////////////////// +void printHelp(void) { + printf("Usage: bandwidthTest [OPTION]...\n"); + printf("Test the bandwidth for device to host, host to device, and device to device transfers\n"); + printf("\n"); + printf("Example: measure the bandwidth of device to host pinned memory copies in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n"); + printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 --increment=1024 --dtoh\n"); + + printf("\n"); + printf("Options:\n"); + printf("--help\tDisplay this help menu\n"); + printf("--csv\tPrint results as a CSV\n"); + printf("--device=[deviceno]\tSpecify the device device to be used\n"); + printf(" all - compute cumulative bandwidth on all the devices\n"); + printf(" 0,1,2,...,n - Specify any particular device to be used\n"); + printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n"); + printf(" pageable - pageable memory\n"); + printf(" pinned - non-pageable system memory\n"); + printf("--mode=[MODE]\tSpecify the mode to use\n"); + printf(" quick - performs a quick measurement\n"); + printf(" range - measures a user-specified range of values\n"); + printf(" shmoo - performs an intense shmoo of a large range of values\n"); + + printf("--htod\tMeasure host to device transfers\n"); + printf("--dtoh\tMeasure device to host transfers\n"); + printf("--dtod\tMeasure device to device transfers\n"); +#if CUDART_VERSION >= 2020 + printf("--wc\tAllocate pinned memory as write-combined\n"); +#endif + printf("--cputiming\tForce CPU-based timing always\n"); + + printf("Range mode options\n"); + printf("--start=[SIZE]\tStarting transfer size in bytes\n"); + printf("--end=[SIZE]\tEnding transfer size in bytes\n"); + printf("--increment=[SIZE]\tIncrement size in bytes\n"); +} diff --git a/cudaTensorCoreGemm.cu b/cudaTensorCoreGemm.cu new file mode 100644 index 0000000..c583e1b --- /dev/null +++ b/cudaTensorCoreGemm.cu @@ -0,0 +1,586 @@ +// +// Created by kwoodle on 5/10/20. +// + +// +// Created by kwoodle on 5/9/20. +// + +/* + * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. 
Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +// CUDA sample demonstrating a GEMM computation using the Warp Matrix Multiply +// and Accumulate API introduced in CUDA 9. + +// In this program, the compute_gemm kernel computes the result of a matrix multiplication +// and addition: D = alpha * A * B + beta * C. The dimensions of both C and D matrices +// are M_GLOBAL x N_GLOBAL. The A matrix is M_GLOBAL x K_GLOBAL (row-major), the B matrix +// is K_GLOBAL x N_GLOBAL (column-major). +// In that kernel, each CTA computes one 128 x 128 tile of the resulting matrix +// per iteration. When the tile is computed, the CTA stores it to the global memory +// and begins a new iteration, selecting a new 128 x 128 tile to compute. +// Each CTA consists of eight warps. For the 128 x 128 tile, each warp computes eight +// 16 x 16 subtiles, organized in a 2 x 4 two-dimensional array. +// Warps compute the 16 x 16 subtiles using nvcuda::wmma::mma_sync operations by +// moving through the K_GLOBAL dimension of the A and B matrices and accumulating +// the intermediate result in the local thread state. + +// There are a number of simple optimizations used in the algorithm: +// - The CTA copies the 128 x 128 tile of the C matrix from the global memory to +// shared memory. After that is done, each warp loads the C matrix fragments from +// shared memory, thus avoiding a random global memory access. +// - On each internal iteration, the CTA copies a portion of the A and B matrices from +// global memory to shared memory. After that, all warps in the CTA reuse the A and B +// data from shared memory, thus reducing the number of data copies from global memory. +// - The portions of the A and B matrices are stored in shared memory with an additional +// padding (skew) to reduce the number of shared memory access bank conflicts. +// (See a detailed explanation near the SKEW_HALF macro definition.) +// - When the CTA finishes computing the tiles of the resulting matrix, each warp stores +// its subtiles to shared memory. The CTA then copies the shared memory contents to +// global memory, again avoiding redundant random global memory accesses. +// - Note that the CTA tile size is chosen to maximize the GPU register utilization, +// but carefully enough to avoid local memory use. + +#include +#include +#include +#include + +// helper functions and utilities to work with CUDA +#include +#include + +// Externally configurable parameters. + +#ifndef CPU_DEBUG +// Set this to 1 to verify the correctness of the GPU-computed matrix. +#define CPU_DEBUG 0 +#endif + +#ifndef SHARED_MEMORY_LIMIT_64K +// Set this to 0 to use more than 64 Kb of shared memory to cache data, to +// improve the performance of the computations on GPU. +// Note that you need a GPU that can have more than 64 Kb of shared memory +// per multiprocessor. +#define SHARED_MEMORY_LIMIT_64K 1 +#endif + +// GPU configuration. + +#define WARP_SIZE 32 + +// MMA matrix tile dimensions. + +#define M 16 +#define N 16 +#define K 16 + +#define WMMA_M 16 +#define WMMA_N 16 +#define WMMA_K 16 + +// GEMM configuration. + +#define M_TILES 256 +#define N_TILES 256 +#define K_TILES 256 + +#define M_GLOBAL (M * M_TILES) +#define N_GLOBAL (N * N_TILES) +#define K_GLOBAL (K * K_TILES) + +#define C_LAYOUT wmma::mem_row_major + +// Implementation constants. 
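+// For reference, with the defaults above: M_GLOBAL = N_GLOBAL = K_GLOBAL = 16 * 256 = 4096, so a
+// single GEMM performs roughly 2 * 4096^3 ~= 1.4e11 floating-point operations, and each CTA runs
+// WARPS_PER_BLOCK * WARP_SIZE = 8 * 32 = 256 threads.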
+ +#define WARPS_PER_BLOCK 8 +#define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK) + +#if SHARED_MEMORY_LIMIT_64K +// With only 64 Kb shared memory available, we can fit two 8-tile chunks of +// the A and B matrix data, that are 16 * 16 * 8 * 8 * 2 = 32 Kb each +// (i.e. two 8x8 arrays of tiles of 16x16 half-typed elements per CTA). +// But we cannot account the 8 Kb total skew overhead, without which the performance +// would be severely impacted. So we choose to reduce the chunk size in half, +// i.e. the amount of A and B matrix data we cache in shared memory. +// Accordingly, this doubles the number of outer iterations across the global K +// dimension, which only slightly impacts the performance. +#define CHUNK_K 4 +#else +#define CHUNK_K 8 +#endif + +#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(half)) +#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4)) +#define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES) +#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP) + +#define BLOCK_ROW_WARPS 2 +#define BLOCK_COL_WARPS 4 + +#define WARP_ROW_TILES 4 +#define WARP_COL_TILES 2 + +#define BLOCK_ROW_TILES (WARP_ROW_TILES * BLOCK_ROW_WARPS) +#define BLOCK_COL_TILES (WARP_COL_TILES * BLOCK_COL_WARPS) + +#define GLOBAL_MEM_STRIDE N_GLOBAL + +#define SHMEM_STRIDE (N * BLOCK_ROW_TILES) +#define SHMEM_OFFSET (N * WARP_ROW_TILES) + +// The macro below is used to shift rows of the A matrix and columns of the B matrix +// in shared memory to minimize possible bank conflicts. +// Before performing the nvcuda::wmma::mma_sync operation, the warp must load the matrix +// data using the nvcuda::wmma::load_matrix_sync operation. Although the memory access pattern +// is not specified for that function, each lane in the warp can read one or multiple matrix +// elements from different matrix rows or columns. +// For shared memory, such access can result in bank conflicts if different rows / columns +// of the matrix map to the same bank. By shifting each row and column by a few bytes, we +// make sure that they map to different banks, thus reducing the number of possible bank +// conflicts. +// The number of 8 two-byte "half" elements is chosen as the minimum possible shift because +// we must keep each row and column 128-bit aligned, as required by nvcuda::wmma::load_matrix_sync. +#define SKEW_HALF 8 + +#define checkKernelErrors(expr) do { \ + expr; \ + \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + printf("Line %d: '%s' failed: %s\n", __LINE__, # expr, cudaGetErrorString(__err)); \ + abort(); \ + } \ +} while(0) + +using namespace nvcuda; + +__host__ void init_host_matrices(half *a, half *b, float *c) { + for (int i = 0; i < M_GLOBAL; i++) { + for (int j = 0; j < K_GLOBAL; j++) { + a[i * K_GLOBAL + j] = (half) (rand() % 3); + } + } + + for (int i = 0; i < N_GLOBAL; i++) { + for (int j = 0; j < K_GLOBAL; j++) { + b[i * K_GLOBAL + j] = (half) (rand() % 3); + } + } + + for (int t = 0; t < M_GLOBAL * N_GLOBAL; t++) { + c[t] = (float) (rand() % 3); + } +} + +__global__ void compute_gemm(const half *A, const half *B, const float *C, float *D, float alpha, float beta) { + extern __shared__ half shmem[][CHUNK_K * K + SKEW_HALF]; + + // Warp and lane identification. + const unsigned int warpId = threadIdx.x / WARP_SIZE; + const unsigned int laneId = threadIdx.x % WARP_SIZE; + + // Offset in shared memory from which the B matrix is stored. 
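+    // (With the defaults, BLOCK_COL_TILES * M = 8 * 16 = 128: the first 128 rows of the shared
+    // memory array hold the A tiles, and the B tiles start right after at this offset.)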
+ const size_t shmem_idx_b_off = BLOCK_COL_TILES * M; + + // This pointer is used to access the C and D matrix tiles this warp computes. + float *shmem_warp_tile_ptr = + (float *) &shmem[0][0] + (warpId / 2) * SHMEM_STRIDE * K * 2 + (warpId % 2) * SHMEM_OFFSET; + + // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory. + float *shmem_warp_stream_ptr = (float *) &shmem[0][0] + warpId * SHMEM_STRIDE * K; + + // Adjust the beta scaler, as it'll be multiplied by alpha at the end of + // each tile computation. Technically this is not generally correct (may result + // in a loss of precision). Zero still needs to be specially handled though. + beta /= alpha; + + // Each CTA slides along the 128 x 128 tiles from the top left corner of the matrix to the + // right and down, and selects the next tile to compute. Once there's no such tile, + // all warps in this CTA exit. + for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); + const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; + + // Stop when there are no more D matrix tiles to compute in this CTA. + if (block_tile_i >= M_TILES) { + break; + } + + // This warp's pointer to the C matrix data to copy memory from to shared memory. + const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const float *src_gmem_warp_stream_ptr = &C[gmem_idx]; + + // Stream multiple C tiles to shared memory. +#pragma unroll + for (int i = 0; i < K; i++) { + typedef int4 copy_t; + + *((copy_t *) (shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) = + *((copy_t *) (src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId); + } + + __syncthreads(); + + // These fragments will accumulate the result of A and B matrix fragment multiplications + // along the K_GLOBAL dimension. + wmma::fragment c[WARP_COL_TILES][WARP_ROW_TILES]; + + // Load the C matrix tiles into fragments from shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + const float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N; + + wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Scale the C matrix. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + for (int t = 0; t < c[i][j].num_elements; t++) { + c[i][j].x[t] *= beta; + } + } + } + + // Select what warp copies what matrix to shared memory. + // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. + const half *warp_ptr = (warpId < 4) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % 4) * 2) : + (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % 4) * 2); + + // Go through the global K dimension by a fixed step at a time. +#pragma unroll + for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { + // Copy slices of the A and B matrices to shared memory. + // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix. + size_t shmem_idx = warpId < (WARPS_PER_BLOCK / 2) ? (M * (warpId % (WARPS_PER_BLOCK / 2)) * 2) : + (N * (warpId % (WARPS_PER_BLOCK / 2)) * 2 + shmem_idx_b_off); + + // First half of the warp copies the first row / column of the matrix, + // the second half of the warp copies the next. 
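+            // With SHARED_MEMORY_LIMIT_64K (the default), CHUNK_K = 4, so CHUNK_LINE_BYTES =
+            // 4 * 16 * sizeof(half) = 128 bytes and WARP_COPY_BYTES = 32 * sizeof(int4) = 512 bytes,
+            // giving CHUNK_COPY_LINES_PER_WARP = 4 and CHUNK_COPY_LINE_LANES = 8: each warp moves
+            // four 128-byte lines per step, eight lanes (one int4 each) per line.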
+ int4 *lane_ptr = (int4 *) (warp_ptr + tile_k * K + (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL) + + (laneId % CHUNK_COPY_LINE_LANES); + + // Shift the second half of the warp to the next row / column in the shared memory. + shmem_idx += laneId / CHUNK_COPY_LINE_LANES; + +#pragma unroll + for (int i = 0; i < ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2; i++) { + // Copy 16 bytes at once in each lane. + *((int4 *) &shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = *lane_ptr; + + // Advance the global memory pointer and the shared memory index. + lane_ptr = (int4 *) ((half *) lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP); + shmem_idx += CHUNK_COPY_LINES_PER_WARP; + } + + __syncthreads(); + + // Compute a grid of C matrix tiles in each warp. +#pragma unroll + for (int k_step = 0; k_step < CHUNK_K; k_step++) { + wmma::fragment a[WARP_COL_TILES]; + wmma::fragment b[WARP_ROW_TILES]; + +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { + size_t shmem_idx_a = (warpId / 2) * M * 2 + (i * M); + const half *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + + wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_HALF); + +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + if (i == 0) { + // Load the B matrix fragment once, because it is going to be reused + // against the other A matrix fragments. + size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId % 2) + (j * N); + const half *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + + wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_HALF); + } + + wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]); + } + } + } + + __syncthreads(); + } + + // Store the D fragments to shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + // Uniform, point-wise transformations of ALL fragment elements by ALL threads in the + // warp are well-defined even though element indices within fragment storage are not defined. + for (int t = 0; t < c[i][j].num_elements; t++) + c[i][j].x[t] *= alpha; + + float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N; + + wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Now that shared memory contains all the D tiles, stream them to global memory. + float *dst_gmem_warp_stream_ptr = &D[gmem_idx]; + +#pragma unroll + for (int i = 0; i < K; i++) { + *((int4 *) (dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4 *) (shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + } + + __syncthreads(); + } +} + + +// Performs an MxNxK GEMM (C=alpha*A*B + beta*C) assuming: +// 1) Matrices are packed in memory. +// 2) M, N and K are multiples of 16. +// 3) Neither A nor B are transposed. +// Note: This is a less performant version of the compute_gemm kernel. It is designed for +// demonstration purposes only to show the CUDA WMMA API use without relying on +// availability of the shared memory. +__global__ void +simple_wmma_gemm(half *a, half *b, float *c, float *d, int m_ld, int n_ld, int k_ld, float alpha, float beta) { + // Leading dimensions. Packed with no transpositions. 
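+    // Each warp owns one WMMA_M x WMMA_N (16 x 16) tile of the output: warpM and warpN below select
+    // that tile, the loop over k accumulates A and B fragments into acc_frag, and the result is
+    // combined with C and stored at the end.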
+ int lda = m_ld; + int ldb = k_ld; + int ldc = n_ld; + + // Tile using a 2D grid + int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize; + int warpN = (blockIdx.y * blockDim.y + threadIdx.y); + + // Declare the fragments + wmma::fragment a_frag; + wmma::fragment b_frag; + wmma::fragment acc_frag; + wmma::fragment c_frag; + + wmma::fill_fragment(acc_frag, 0.0f); + + // Loop over k + for (int i = 0; i < k_ld; i += WMMA_K) { + int aCol = i; + int aRow = warpM * WMMA_M; + + int bCol = i; + int bRow = warpN * WMMA_N; + + // Bounds checking + if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) { + // Load the inputs + wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda); + wmma::load_matrix_sync(b_frag, b + bCol + bRow * ldb, ldb); + + // Perform the matrix multiplication + wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); + + } + } + + // Load in the current value of c, scale it by beta, and add this our result scaled by alpha + int cCol = warpN * WMMA_N; + int cRow = warpM * WMMA_M; + + if (cRow < m_ld && cCol < n_ld) { + wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, wmma::mem_row_major); + + for (int i = 0; i < c_frag.num_elements; i++) { + c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i]; + } + + // Store the output + wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, wmma::mem_row_major); + } +} + +__host__ void matMultiplyOnHost(half *A, half *B, float *C, + float alpha, float beta, + int numARows, int numAColumns, + int numBRows, int numBColumns, + int numCRows, int numCColumns) { + for (int i = 0; i < numCRows; i++) { + for (int j = 0; j < numCColumns; j++) { + float temp = 0.0; + + for (int k = 0; k < numAColumns; k++) { + temp += (float) A[i * numAColumns + k] * (float) B[j * numBRows + k]; + } + + C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j]; + } + } +} + +int main(int argc, char **argv) { + printf("Initializing...\n"); + + int dev = findCudaDevice(argc, (const char **) argv); + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + + // Tensor cores require a GPU of Volta (SM7X) architecture or higher. + if (deviceProp.major < 7) { + printf("cudaTensorCoreGemm requires requires SM 7.0 or higher to use Tensor Cores. 
Exiting...\n"); + exit(EXIT_WAIVED); + } + + printf("M: %d (%d x %d)\n", M_GLOBAL, M, M_TILES); + printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES); + printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES); + + half *A_h = NULL; + half *B_h = NULL; + float *C_h = NULL; +#if CPU_DEBUG + float *result_hD = NULL; + float *result_host = NULL; +#endif + + A_h = (half *) malloc(sizeof(half) * M_GLOBAL * K_GLOBAL); + B_h = (half *) malloc(sizeof(half) * K_GLOBAL * N_GLOBAL); + C_h = (float *) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); +#if CPU_DEBUG + result_hD = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); + result_host = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); +#endif + + half *A = NULL; + half *B = NULL; + float *C = NULL; + float *D = NULL; + + checkCudaErrors(cudaMalloc((void **) &A, sizeof(half) * M_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc((void **) &B, sizeof(half) * N_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc((void **) &C, sizeof(float) * M_GLOBAL * N_GLOBAL)); + checkCudaErrors(cudaMalloc((void **) &D, sizeof(float) * M_GLOBAL * N_GLOBAL)); + + assert(((unsigned long long) A) % 128 == 0); + assert(((unsigned long long) B) % 128 == 0); + assert(((unsigned long long) C) % 128 == 0); + assert(((unsigned long long) D) % 128 == 0); + + init_host_matrices(A_h, B_h, C_h); + + printf("Preparing data for GPU...\n"); + + checkCudaErrors(cudaMemcpy(A, A_h, sizeof(half) * M_GLOBAL * K_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(B, B_h, sizeof(half) * N_GLOBAL * K_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(C, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemset(D, 0, sizeof(float) * M_GLOBAL * N_GLOBAL)); + + enum { + // Compute the right amount of shared memory to request. + // We need shared memory to hold per-CTA C and D matrix tiles, and to cache per-CTA chunks + // of the A and B matrices. Therefore, the right amount to request is the maximum of those + // two numbers. + SHMEM_SZ = MAX(sizeof(half) * (BLOCK_COL_TILES * M) * (CHUNK_K * K + SKEW_HALF) * 2, + M * (BLOCK_ROW_WARPS * WARP_ROW_TILES) * N * (BLOCK_COL_WARPS * WARP_COL_TILES) * sizeof(float)) + }; + + printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL); + + const float alpha = 1.1f; + const float beta = 1.2f; + + cudaEvent_t start, stop; + + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start)); + + // If enough shared memory available on the GPU use high performant kernel + if (deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) { + printf("Computing... using high performance kernel compute_gemm \n"); + + checkCudaErrors(cudaFuncSetAttribute(compute_gemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ)); + checkKernelErrors( + (compute_gemm<<>>(A, B, C, D, alpha, + beta))); +#if CPU_DEBUG + checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float)*M_GLOBAL*N_GLOBAL, cudaMemcpyDeviceToHost)); +#endif + } else { + dim3 gridDim; + dim3 blockDim; + + // blockDim.x must be a multple of warpSize + // 128x4 means we have 16 warps and a block computes a 64x64 output tile + blockDim.x = 128; + blockDim.y = 4; + + gridDim.x = (M_GLOBAL + (WMMA_M * blockDim.x / 32 - 1)) / (WMMA_M * blockDim.x / 32); + gridDim.y = (N_GLOBAL + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y); + + printf("Computing... 
+    const float alpha = 1.1f;
+    const float beta = 1.2f;
+
+    cudaEvent_t start, stop;
+
+    checkCudaErrors(cudaEventCreate(&start));
+    checkCudaErrors(cudaEventCreate(&stop));
+    checkCudaErrors(cudaEventRecord(start));
+
+    // If enough shared memory is available on the GPU, use the high-performance kernel
+    if (deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) {
+        printf("Computing... using high performance kernel compute_gemm \n");
+
+        checkCudaErrors(cudaFuncSetAttribute(compute_gemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ));
+        checkKernelErrors(
+                (compute_gemm<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK, SHMEM_SZ>>>(A, B, C, D, alpha,
+                                                                                               beta)));
+#if CPU_DEBUG
+        checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float) * M_GLOBAL * N_GLOBAL, cudaMemcpyDeviceToHost));
+#endif
+    } else {
+        dim3 gridDim;
+        dim3 blockDim;
+
+        // blockDim.x must be a multiple of warpSize
+        // 128x4 means we have 16 warps and a block computes a 64x64 output tile
+        blockDim.x = 128;
+        blockDim.y = 4;
+
+        gridDim.x = (M_GLOBAL + (WMMA_M * blockDim.x / 32 - 1)) / (WMMA_M * blockDim.x / 32);
+        gridDim.y = (N_GLOBAL + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y);
+
+        printf("Computing... using simple_wmma_gemm kernel\n");
+        simple_wmma_gemm<<<gridDim, blockDim>>>(A, B, C, D, M_GLOBAL, N_GLOBAL, K_GLOBAL, alpha, beta);
+#if CPU_DEBUG
+        checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float) * M_GLOBAL * N_GLOBAL, cudaMemcpyDeviceToHost));
+#endif
+    }
+
+    checkCudaErrors(cudaEventRecord(stop));
+    checkCudaErrors(cudaEventSynchronize(stop));
+
+#if CPU_DEBUG
+    printf("Verifying correctness of the computations...\n");
+
+    memcpy(result_host, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL);
+
+    matMultiplyOnHost(A_h, B_h, result_host,
+                      alpha, beta,
+                      M_GLOBAL, K_GLOBAL,
+                      K_GLOBAL, N_GLOBAL,
+                      M_GLOBAL, N_GLOBAL);
+
+    for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) {
+        if (fabs(result_hD[i] - result_host[i]) > 0.1f)
+            printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i], result_host[i]);
+    }
+    free(result_hD);
+    free(result_host);
+#endif
+
+    float milliseconds = 0;
+
+    checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop));
+
+    printf("Time: %f ms\n", milliseconds);
+    printf("TFLOPS: %.2f\n", (((double) M_GLOBAL * N_GLOBAL * K_GLOBAL * 2) / (milliseconds / 1000.)) / 1e12);
+
+    free(A_h);
+    free(B_h);
+    free(C_h);
+    checkCudaErrors(cudaFree((void *) A));
+    checkCudaErrors(cudaFree((void *) B));
+    checkCudaErrors(cudaFree((void *) C));
+    checkCudaErrors(cudaFree((void *) D));
+
+    return 0;
+}
\ No newline at end of file
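The TFLOPS line above counts two floating-point operations (one multiply, one add) per element of the K-deep inner product, i.e. 2 * M_GLOBAL * N_GLOBAL * K_GLOBAL operations, and divides by the elapsed time in seconds. A standalone sketch of the same arithmetic (not part of the patch; the 4096-cubed size and 10 ms timing are illustrative numbers, not the sample's actual configuration):

    // gemm_tflops.cpp -- the FLOP accounting used in the TFLOPS printf above
    #include <cstdio>

    double gemmTflops(double m, double n, double k, double milliseconds) {
        double flops = 2.0 * m * n * k;     // one multiply + one add per inner-product term
        double seconds = milliseconds / 1000.0;
        return flops / seconds / 1e12;      // 1 TFLOP/s = 1e12 FLOP/s
    }

    int main() {
        // Example: a 4096 x 4096 x 4096 GEMM finishing in 10 ms works out to ~13.74 TFLOPS.
        printf("%.2f TFLOPS\n", gemmTflops(4096, 4096, 4096, 10.0));
        return 0;
    }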
diff --git a/main.cu b/main.cu
new file mode 100644
index 0000000..bc8f460
--- /dev/null
+++ b/main.cu
@@ -0,0 +1,6 @@
+#include <iostream>
+
+int main() {
+    std::cout << "Hello, World!" << std::endl;
+    return 0;
+}
diff --git a/mpirun.sh b/mpirun.sh
new file mode 100755
index 0000000..50abf06
--- /dev/null
+++ b/mpirun.sh
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+mpirun -np 4 cmake-build-debug/simpleMPI
diff --git a/run_bandwidthTest.sh b/run_bandwidthTest.sh
new file mode 100755
index 0000000..7f541be
--- /dev/null
+++ b/run_bandwidthTest.sh
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+cmake-build-release/bandwidthTest --mode=shmoo --memory=pinned
\ No newline at end of file
diff --git a/simpleMPI.cpp b/simpleMPI.cpp
new file mode 100644
index 0000000..82bff92
--- /dev/null
+++ b/simpleMPI.cpp
@@ -0,0 +1,122 @@
+//
+// Created by kwoodle on 5/9/20.
+//
+
+/*
+ * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+
+/* Simple example demonstrating how to use MPI with CUDA
+*
+* Generate some random numbers on one node.
+* Dispatch them to all nodes.
+* Compute their square root on each node's GPU.
+* Compute the average of the results using MPI.
+*
+* simpleMPI.cpp: main program, compiled with mpicxx on linux/Mac platforms
+* on Windows, please download the Microsoft HPC Pack SDK 2008
+*/
+
+// MPI include
+#include <mpi.h>
+
+// System includes
+#include <iostream>
+
+using std::cout;
+using std::cerr;
+using std::endl;
+
+// User include
+#include "simpleMPI.h"
+
+// Error handling macros
+#define MPI_CHECK(call) \
+    if((call) != MPI_SUCCESS) { \
+        cerr << "MPI error calling \""#call"\"\n"; \
+        my_abort(-1); }
+
+
+// Host code
+// No CUDA here, only MPI
+int main(int argc, char *argv[]) {
+    // Dimensions of the dataset
+    int blockSize = 256;
+    int gridSize = 10000;
+    int dataSizePerNode = gridSize * blockSize;
+
+    // Initialize MPI state
+    MPI_CHECK(MPI_Init(&argc, &argv));
+
+    // Get our MPI node number and node count
+    int commSize, commRank;
+    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &commSize));
+    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &commRank));
+
+    // Generate some random numbers on the root node (node 0)
+    int dataSizeTotal = dataSizePerNode * commSize;
+    float *dataRoot = NULL;
+
+    if (commRank == 0)  // Are we the root node?
+    {
+        cout << "Running on " << commSize << " nodes" << endl;
+        dataRoot = new float[dataSizeTotal];
+        initData(dataRoot, dataSizeTotal);
+    }
+
+    // Allocate a buffer on each node
+    float *dataNode = new float[dataSizePerNode];
+
+    // Dispatch a portion of the input data to each node
+    MPI_CHECK(MPI_Scatter(dataRoot,
+                          dataSizePerNode,
+                          MPI_FLOAT,
+                          dataNode,
+                          dataSizePerNode,
+                          MPI_FLOAT,
+                          0,
+                          MPI_COMM_WORLD));
+
+    if (commRank == 0) {
+        // No need for root data any more
+        delete[] dataRoot;
+    }
+
+    // On each node, run computation on GPU
+    computeGPU(dataNode, blockSize, gridSize);
+
+    // Reduction to the root node, computing the sum of output elements
+    float sumNode = sum(dataNode, dataSizePerNode);
+    float sumRoot;
+
+    MPI_CHECK(MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD));
+
+    if (commRank == 0) {
+        float average = sumRoot / dataSizeTotal;
+        cout << "Average of square roots is: " << average << endl;
+    }
+
+    // Cleanup
+    delete[] dataNode;
+    MPI_CHECK(MPI_Finalize());
+
+    if (commRank == 0) {
+        cout << "PASSED\n";
+    }
+
+    return 0;
+}
+
+// Shut down MPI cleanly if something goes wrong
+void my_abort(int err) {
+    cout << "Test FAILED\n";
+    MPI_Abort(MPI_COMM_WORLD, err);
+}
\ No newline at end of file
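In simpleMPI.cpp, only rank 0 sees the global sum because MPI_Reduce delivers the result to the root. A hypothetical variant, not part of the patch, would use MPI_Allreduce so every rank holds the global sum and can compute the average locally; localSum below is a stand-in for the per-rank partial sum:

    // simple_allreduce.cpp -- every rank computes the global average itself (compile with mpicxx)
    #include <mpi.h>
    #include <iostream>

    int main(int argc, char *argv[]) {
        MPI_Init(&argc, &argv);

        int rank, size;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        float localSum = static_cast<float>(rank + 1);   // stand-in for a per-rank partial sum
        float globalSum = 0.f;
        MPI_Allreduce(&localSum, &globalSum, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);

        // Every rank now holds the same global sum, so every rank can report the average.
        std::cout << "rank " << rank << ": average = " << globalSum / size << std::endl;

        MPI_Finalize();
        return 0;
    }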
diff --git a/simpleMPI.cu b/simpleMPI.cu
new file mode 100644
index 0000000..8d53eb2
--- /dev/null
+++ b/simpleMPI.cu
@@ -0,0 +1,91 @@
+//
+// Created by kwoodle on 5/10/20.
+//
+
+/*
+ * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+
+/* Simple example demonstrating how to use MPI with CUDA
+*
+* Generate some random numbers on one node.
+* Dispatch them to all nodes.
+* Compute their square root on each node's GPU.
+* Compute the average of the results using MPI.
+*
+* simpleMPI.cu: GPU part, compiled with nvcc
+*/
+
+#include <iostream>
+
+using std::cerr;
+using std::endl;
+
+#include "simpleMPI.h"
+
+// Error handling macro
+#define CUDA_CHECK(call) \
+    if((call) != cudaSuccess) { \
+        cudaError_t err = cudaGetLastError(); \
+        cerr << "CUDA error calling \""#call"\", code is " << err << endl; \
+        my_abort(err); }
+
+
+// Device code
+// Very simple GPU Kernel that computes square roots of input numbers
+__global__ void simpleMPIKernel(float *input, float *output) {
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    output[tid] = sqrt(input[tid]);
+}
+
+
+// Initialize an array with random data (between 0 and 1)
+void initData(float *data, int dataSize) {
+    for (int i = 0; i < dataSize; i++) {
+        data[i] = (float) rand() / RAND_MAX;
+    }
+}
+
+// CUDA computation on each node
+// No MPI here, only CUDA
+void computeGPU(float *hostData, int blockSize, int gridSize) {
+    int dataSize = blockSize * gridSize;
+
+    // Allocate data on GPU memory
+    float *deviceInputData = NULL;
+    CUDA_CHECK(cudaMalloc((void **) &deviceInputData, dataSize * sizeof(float)));
+
+    float *deviceOutputData = NULL;
+    CUDA_CHECK(cudaMalloc((void **) &deviceOutputData, dataSize * sizeof(float)));
+
+    // Copy to GPU memory
+    CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice));
+
+    // Run kernel
+    simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);
+
+    // Copy data back to CPU memory
+    CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float), cudaMemcpyDeviceToHost));
+
+    // Free GPU memory
+    CUDA_CHECK(cudaFree(deviceInputData));
+    CUDA_CHECK(cudaFree(deviceOutputData));
+}
+
+float sum(float *data, int size) {
+    float accum = 0.f;
+
+    for (int i = 0; i < size; i++) {
+        accum += data[i];
+    }
+
+    return accum;
+}
\ No newline at end of file
diff --git a/simpleMPI.h b/simpleMPI.h
new file mode 100644
index 0000000..5a9586c
--- /dev/null
+++ b/simpleMPI.h
@@ -0,0 +1,32 @@
+//
+// Created by kwoodle on 5/9/20.
+//
+
+/*
+ * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/* Simple example demonstrating how to use MPI with CUDA
+*
+* Generate some random numbers on one node.
+* Dispatch them to all nodes.
+* Compute their square root on each node's GPU.
+* Compute the average of the results using MPI.
+*
+* simpleMPI.h: common header file
+*/
+
+// Forward declarations
+extern "C" {
+void initData(float *data, int dataSize);
+void computeGPU(float *hostData, int blockSize, int gridSize);
+float sum(float *data, int size);
+void my_abort(int err);
+}
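simpleMPIKernel relies on computeGPU launching exactly gridSize * blockSize threads for exactly that many elements, so it needs no bounds check. If the element count were not an exact multiple of the block size, the usual pattern is a guarded index plus a rounded-up grid; a sketch of that variant (simpleMPIKernelGuarded is hypothetical, not part of the patch):

    // Guarded variant: safe when n is not a multiple of blockDim.x.
    __global__ void simpleMPIKernelGuarded(const float *input, float *output, int n) {
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n) {
            output[tid] = sqrtf(input[tid]);
        }
    }

    // Matching launch: round the grid size up instead of assuming divisibility.
    // int gridSize = (dataSize + blockSize - 1) / blockSize;
    // simpleMPIKernelGuarded<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData, dataSize);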