diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..73f69e0
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
diff --git a/.idea/CudaSamples.iml b/.idea/CudaSamples.iml
new file mode 100644
index 0000000..f08604b
--- /dev/null
+++ b/.idea/CudaSamples.iml
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..8822db8
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..5217dda
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/runConfigurations/CudaSamples.xml b/.idea/runConfigurations/CudaSamples.xml
new file mode 100644
index 0000000..04419df
--- /dev/null
+++ b/.idea/runConfigurations/CudaSamples.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/runConfigurations/asyncAPI.xml b/.idea/runConfigurations/asyncAPI.xml
new file mode 100644
index 0000000..db32816
--- /dev/null
+++ b/.idea/runConfigurations/asyncAPI.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/runConfigurations/cudaTensorCoreGemm.xml b/.idea/runConfigurations/cudaTensorCoreGemm.xml
new file mode 100644
index 0000000..57250fd
--- /dev/null
+++ b/.idea/runConfigurations/cudaTensorCoreGemm.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/runConfigurations/simpleMPI.xml b/.idea/runConfigurations/simpleMPI.xml
new file mode 100644
index 0000000..f1766a5
--- /dev/null
+++ b/.idea/runConfigurations/simpleMPI.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..c97a31a
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Needs CUDAHOSTCXX=/usr/bin/cuda-g++;CUDA_LIB_PATH=/usr/local/cuda/lib64 in
+# CLion Settings | Build, Execution, Deployment | CMake > Environment
+
+cmake_minimum_required(VERSION 3.16)
+project(CudaSamples LANGUAGES CXX CUDA)
+
+set(CMAKE_CUDA_STANDARD 14)
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -arch=sm_75")
+include_directories(/usr/local/cuda/samples/common/inc)
+
+set(CUDA_VERBOSE_BUILD ON)
+set(CUDA_SEPARABLE_COMPILATION ON) # Allow multiple CUDA files compilation
+
+add_executable(CudaSamples main.cu)
+
+set_target_properties(
+ CudaSamples
+ PROPERTIES
+ CUDA_SEPARABLE_COMPILATION ON)
+
+add_executable(asyncAPI asyncAPI.cu)
+
+find_package(MPI REQUIRED)
+include_directories(${MPI_CXX_INCLUDE_DIRS})
+
+add_executable(simpleMPI simpleMPI.cpp simpleMPI.h simpleMPI.cu)
+target_link_libraries(simpleMPI ${MPI_CXX_LIBRARIES})
+
+add_executable(cudaTensorCoreGemm cudaTensorCoreGemm.cu)
+target_link_libraries(cudaTensorCoreGemm)
+
+add_executable(bandwidthTest bandwidthTest.cu)
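The "-arch=sm_75" flag above pins the build to Turing-class GPUs, so the resulting binaries are assumed to run on that hardware or newer. A minimal runtime check of the installed device's compute capability, sketched here as an aside with plain CUDA runtime calls rather than as part of this change:

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        cudaDeviceProp prop{};
        if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
            std::printf("No CUDA device visible\n");
            return 1;
        }
        // sm_75 corresponds to compute capability 7.5 (Turing).
        std::printf("Device 0: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor);
        if (prop.major * 10 + prop.minor < 75)
            std::printf("Warning: this build targets sm_75; kernels may not launch on this GPU.\n");
        return 0;
    }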
diff --git a/asyncAPI.cu b/asyncAPI.cu
new file mode 100644
index 0000000..dec2def
--- /dev/null
+++ b/asyncAPI.cu
@@ -0,0 +1,127 @@
+//
+// Created by kwoodle on 5/10/20.
+//
+
+////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+//
+// Please refer to the NVIDIA end user license agreement (EULA) associated
+// with this source code for terms and conditions that govern your use of
+// this software. Any use, reproduction, disclosure, or distribution of
+// this software and related documentation outside the terms of the EULA
+// is strictly prohibited.
+//
+////////////////////////////////////////////////////////////////////////////
+
+//
+// This sample illustrates the usage of CUDA events for both GPU timing and
+// overlapping CPU and GPU execution. Events are inserted into a stream
+// of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can
+// perform computations while GPU is executing (including DMA memcopies
+// between the host and device). CPU can query CUDA events to determine
+// whether GPU has completed tasks.
+//
+
+// includes, system
+#include <stdio.h>
+
+// includes CUDA Runtime
+#include <cuda_runtime.h>
+
+// includes, project
+#include <helper_cuda.h>
+#include <helper_functions.h> // helper utility functions
+
+__global__ void increment_kernel(int *g_data, int inc_value) {
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ g_data[idx] = g_data[idx] + inc_value;
+}
+
+bool correct_output(int *data, const int n, const int x) {
+ for (int i = 0; i < n; i++)
+ if (data[i] != x) {
+ printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
+ return false;
+ }
+
+ return true;
+}
+
+int main(int argc, char *argv[]) {
+ int devID;
+ cudaDeviceProp deviceProps;
+
+ printf("[%s] - Starting...\n", argv[0]);
+
+ // This will pick the best possible CUDA capable device
+ devID = findCudaDevice(argc, (const char **) argv);
+
+ // get device name
+ checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
+ printf("CUDA device [%s]\n", deviceProps.name);
+
+ int n = 16 * 1024 * 1024;
+ int nbytes = n * sizeof(int);
+ int value = 26;
+
+ // allocate host memory
+ int *a = 0;
+ checkCudaErrors(cudaMallocHost((void **) &a, nbytes));
+ memset(a, 0, nbytes);
+
+ // allocate device memory
+ int *d_a = 0;
+ checkCudaErrors(cudaMalloc((void **) &d_a, nbytes));
+ checkCudaErrors(cudaMemset(d_a, 255, nbytes));
+
+ // set kernel launch configuration
+ dim3 threads = dim3(512, 1);
+ dim3 blocks = dim3(n / threads.x, 1);
+
+ // create cuda event handles
+ cudaEvent_t start, stop;
+ checkCudaErrors(cudaEventCreate(&start));
+ checkCudaErrors(cudaEventCreate(&stop));
+
+ StopWatchInterface *timer = NULL;
+ sdkCreateTimer(&timer);
+ sdkResetTimer(&timer);
+
+ checkCudaErrors(cudaDeviceSynchronize());
+ float gpu_time = 0.0f;
+
+ // asynchronously issue work to the GPU (all to stream 0)
+ sdkStartTimer(&timer);
+ cudaEventRecord(start, 0);
+ cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
+ increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
+ cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
+ cudaEventRecord(stop, 0);
+ sdkStopTimer(&timer);
+
+ // have CPU do some work while waiting for stage 1 to finish
+ unsigned long int counter = 0;
+
+ while (cudaEventQuery(stop) == cudaErrorNotReady) {
+ counter++;
+ }
+
+ checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
+
+ // print the cpu and gpu times
+ printf("time spent executing by the GPU: %.2f\n", gpu_time);
+ printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
+ printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
+
+ // check the output for correctness
+ bool bFinalResults = correct_output(a, n, value);
+
+ // release resources
+ checkCudaErrors(cudaEventDestroy(start));
+ checkCudaErrors(cudaEventDestroy(stop));
+ checkCudaErrors(cudaFreeHost(a));
+ checkCudaErrors(cudaFree(d_a));
+
+ exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
+}
\ No newline at end of file
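asyncAPI.cu above issues every operation to the default stream and overlaps CPU work by polling cudaEventQuery. The same pattern with an explicitly created stream is sketched below; this is an illustrative variant assuming a single device and omitting error checking, not part of the sample itself:

    #include <cstdio>
    #include <cstring>
    #include <cuda_runtime.h>

    __global__ void increment(int *data, int inc) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        data[idx] += inc;
    }

    int main() {
        const int n = 1 << 20;
        const size_t nbytes = n * sizeof(int);

        int *h = nullptr, *d = nullptr;
        cudaMallocHost((void **) &h, nbytes);   // pinned host memory, needed for truly async copies
        cudaMalloc((void **) &d, nbytes);
        memset(h, 0, nbytes);

        cudaStream_t stream;
        cudaStreamCreate(&stream);
        cudaEvent_t done;
        cudaEventCreate(&done);

        // Enqueue copy -> kernel -> copy; the host thread returns immediately.
        cudaMemcpyAsync(d, h, nbytes, cudaMemcpyHostToDevice, stream);
        increment<<<n / 256, 256, 0, stream>>>(d, 26);
        cudaMemcpyAsync(h, d, nbytes, cudaMemcpyDeviceToHost, stream);
        cudaEventRecord(done, stream);

        // Overlap: keep the CPU busy until the GPU signals completion.
        unsigned long spins = 0;
        while (cudaEventQuery(done) == cudaErrorNotReady) ++spins;

        std::printf("h[0] = %d after %lu CPU spins\n", h[0], spins);

        cudaEventDestroy(done);
        cudaStreamDestroy(stream);
        cudaFree(d);
        cudaFreeHost(h);
        return 0;
    }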
diff --git a/bandwidthTest.cu b/bandwidthTest.cu
new file mode 100644
index 0000000..b4d7e08
--- /dev/null
+++ b/bandwidthTest.cu
@@ -0,0 +1,926 @@
+//
+// Created by kwoodle on 5/10/20.
+//
+
+/*
+ * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/*
+ * This is a simple test program to measure the memcopy bandwidth of the GPU.
+ * It can measure device to device copy bandwidth, host to device copy bandwidth
+ * for pageable and pinned memory, and device to host copy bandwidth for pageable
+ * and pinned memory.
+ *
+ * Usage:
+ * ./bandwidthTest [option]...
+ */
+
+// CUDA runtime
+#include <cuda_runtime.h>
+
+// includes
+#include <helper_functions.h> // helper for shared functions common to CUDA Samples
+#include <helper_cuda.h>      // helper functions for CUDA error checking and initialization
+
+#include <cuda.h>
+
+#include <memory>
+#include <iostream>
+#include <cassert>
+
+static const char *sSDKsample = "CUDA Bandwidth Test";
+
+// defines, project
+#define MEMCOPY_ITERATIONS 100
+#define DEFAULT_SIZE ( 32 * (1e6) ) //32 M
+#define DEFAULT_INCREMENT ( 4 * (1e6) ) //4 M
+#define CACHE_CLEAR_SIZE ( 16 * (1e6) ) //16 M
+
+//shmoo mode defines
+#define SHMOO_MEMSIZE_MAX (64 * (1e6)) //64 M
+#define SHMOO_MEMSIZE_START (1e3) //1 KB
+#define SHMOO_INCREMENT_1KB (1e3) //1 KB
+#define SHMOO_INCREMENT_2KB (2 * 1e3) //2 KB
+#define SHMOO_INCREMENT_10KB (10 * (1e3)) //10KB
+#define SHMOO_INCREMENT_100KB (100 * (1e3)) //100 KB
+#define SHMOO_INCREMENT_1MB (1e6) //1 MB
+#define SHMOO_INCREMENT_2MB (2 * 1e6) //2 MB
+#define SHMOO_INCREMENT_4MB (4 * 1e6) //4 MB
+#define SHMOO_LIMIT_20KB (20 * (1e3)) //20 KB
+#define SHMOO_LIMIT_50KB (50 * (1e3)) //50 KB
+#define SHMOO_LIMIT_100KB (100 * (1e3)) //100 KB
+#define SHMOO_LIMIT_1MB (1e6) //1 MB
+#define SHMOO_LIMIT_16MB (16 * 1e6) //16 MB
+#define SHMOO_LIMIT_32MB (32 * 1e6) //32 MB
+
+//enums, project
+enum testMode {
+ QUICK_MODE, RANGE_MODE, SHMOO_MODE
+};
+enum memcpyKind {
+ DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE
+};
+enum printMode {
+ USER_READABLE, CSV
+};
+enum memoryMode {
+ PINNED, PAGEABLE
+};
+
+const char *sMemoryCopyKind[] =
+ {
+ "Device to Host",
+ "Host to Device",
+ "Device to Device",
+ NULL
+ };
+
+const char *sMemoryMode[] =
+ {
+ "PINNED",
+ "PAGEABLE",
+ NULL
+ };
+
+// if true, use CPU based timing for everything
+static bool bDontUseGPUTiming;
+
+int *pArgc = NULL;
+char **pArgv = NULL;
+
+////////////////////////////////////////////////////////////////////////////////
+// declaration, forward
+int runTest(const int argc, const char **argv);
+
+void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
+ testMode mode, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice,
+ int endDevice, bool wc);
+
+void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice,
+ int endDevice, bool wc);
+
+void testBandwidthRange(unsigned int start, unsigned int end, unsigned int increment,
+ memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice,
+ bool wc);
+
+void
+testBandwidthShmoo(memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc);
+
+float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc);
+
+float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc);
+
+float testDeviceToDeviceTransfer(unsigned int memSize);
+
+void printResultsReadable(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind,
+ memoryMode memMode, int iNumDevs, bool wc);
+
+void
+printResultsCSV(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode,
+ int iNumDevs, bool wc);
+
+void printHelp(void);
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv) {
+ pArgc = &argc;
+ pArgv = argv;
+
+ // set logfile name and start logs
+ printf("[%s] - Starting...\n", sSDKsample);
+
+ int iRetVal = runTest(argc, (const char **) argv);
+
+ if (iRetVal < 0) {
+ checkCudaErrors(cudaSetDevice(0));
+ }
+
+ // finish
+ printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL");
+
+ printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
+
+ exit((iRetVal == 0) ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//Parse args, run the appropriate tests
+///////////////////////////////////////////////////////////////////////////////
+int runTest(const int argc, const char **argv) {
+ int start = DEFAULT_SIZE;
+ int end = DEFAULT_SIZE;
+ int startDevice = 0;
+ int endDevice = 0;
+ int increment = DEFAULT_INCREMENT;
+ testMode mode = QUICK_MODE;
+ bool htod = false;
+ bool dtoh = false;
+ bool dtod = false;
+ bool wc = false;
+ char *modeStr;
+ char *device = NULL;
+ printMode printmode = USER_READABLE;
+ char *memModeStr = NULL;
+ memoryMode memMode = PINNED;
+
+ //process command line args
+ if (checkCmdLineFlag(argc, argv, "help")) {
+ printHelp();
+ return 0;
+ }
+
+ if (checkCmdLineFlag(argc, argv, "csv")) {
+ printmode = CSV;
+ }
+
+ if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) {
+ if (strcmp(memModeStr, "pageable") == 0) {
+ memMode = PAGEABLE;
+ } else if (strcmp(memModeStr, "pinned") == 0) {
+ memMode = PINNED;
+ } else {
+ printf("Invalid memory mode - valid modes are pageable or pinned\n");
+ printf("See --help for more information\n");
+ return -1000;
+ }
+ } else {
+ //default - pinned memory
+ memMode = PINNED;
+ }
+
+ if (getCmdLineArgumentString(argc, argv, "device", &device)) {
+ int deviceCount;
+ cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
+
+ if (error_id != cudaSuccess) {
+ printf("cudaGetDeviceCount returned %d\n-> %s\n", (int) error_id, cudaGetErrorString(error_id));
+ exit(EXIT_FAILURE);
+ }
+
+ if (deviceCount == 0) {
+ printf("!!!!!No devices found!!!!!\n");
+ return -2000;
+ }
+
+ if (strcmp(device, "all") == 0) {
+ printf("\n!!!!!Cumulative Bandwidth to be computed from all the devices !!!!!!\n\n");
+ startDevice = 0;
+ endDevice = deviceCount - 1;
+ } else {
+ startDevice = endDevice = atoi(device);
+
+ if (startDevice >= deviceCount || startDevice < 0) {
+ printf("\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", startDevice, 0);
+ startDevice = endDevice = 0;
+ }
+ }
+ }
+
+ printf("Running on...\n\n");
+
+ for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
+ cudaDeviceProp deviceProp;
+ cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);
+
+ if (error_id == cudaSuccess) {
+ printf(" Device %d: %s\n", currentDevice, deviceProp.name);
+
+ if (deviceProp.computeMode == cudaComputeModeProhibited) {
+ fprintf(stderr,
+ "Error: device is running in , no threads can use ::cudaSetDevice().\n");
+ checkCudaErrors(cudaSetDevice(currentDevice));
+
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int) error_id, cudaGetErrorString(error_id));
+ checkCudaErrors(cudaSetDevice(currentDevice));
+
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ if (getCmdLineArgumentString(argc, argv, "mode", &modeStr)) {
+ //figure out the mode
+ if (strcmp(modeStr, "quick") == 0) {
+ printf(" Quick Mode\n\n");
+ mode = QUICK_MODE;
+ } else if (strcmp(modeStr, "shmoo") == 0) {
+ printf(" Shmoo Mode\n\n");
+ mode = SHMOO_MODE;
+ } else if (strcmp(modeStr, "range") == 0) {
+ printf(" Range Mode\n\n");
+ mode = RANGE_MODE;
+ } else {
+ printf("Invalid mode - valid modes are quick, range, or shmoo\n");
+ printf("See --help for more information\n");
+ return -3000;
+ }
+ } else {
+ //default mode - quick
+ printf(" Quick Mode\n\n");
+ mode = QUICK_MODE;
+ }
+
+ if (checkCmdLineFlag(argc, argv, "htod")) {
+ htod = true;
+ }
+
+ if (checkCmdLineFlag(argc, argv, "dtoh")) {
+ dtoh = true;
+ }
+
+ if (checkCmdLineFlag(argc, argv, "dtod")) {
+ dtod = true;
+ }
+
+#if CUDART_VERSION >= 2020
+
+ if (checkCmdLineFlag(argc, argv, "wc")) {
+ wc = true;
+ }
+
+#endif
+
+ if (checkCmdLineFlag(argc, argv, "cputiming")) {
+ bDontUseGPUTiming = true;
+ }
+
+ if (!htod && !dtoh && !dtod) {
+ //default: All
+ htod = true;
+ dtoh = true;
+ dtod = true;
+ }
+
+ if (RANGE_MODE == mode) {
+ if (checkCmdLineFlag(argc, (const char **) argv, "start")) {
+ start = getCmdLineArgumentInt(argc, argv, "start");
+
+ if (start <= 0) {
+ printf("Illegal argument - start must be greater than zero\n");
+ return -4000;
+ }
+ } else {
+ printf("Must specify a starting size in range mode\n");
+ printf("See --help for more information\n");
+ return -5000;
+ }
+
+ if (checkCmdLineFlag(argc, (const char **) argv, "end")) {
+ end = getCmdLineArgumentInt(argc, argv, "end");
+
+ if (end <= 0) {
+ printf("Illegal argument - end must be greater than zero\n");
+ return -6000;
+ }
+
+ if (start > end) {
+ printf("Illegal argument - start is greater than end\n");
+ return -7000;
+ }
+ } else {
+ printf("Must specify an end size in range mode.\n");
+ printf("See --help for more information\n");
+ return -8000;
+ }
+
+
+ if (checkCmdLineFlag(argc, argv, "increment")) {
+ increment = getCmdLineArgumentInt(argc, argv, "increment");
+
+ if (increment <= 0) {
+ printf("Illegal argument - increment must be greater than zero\n");
+ return -9000;
+ }
+ } else {
+ printf("Must specify an increment in user mode\n");
+ printf("See --help for more information\n");
+ return -10000;
+ }
+ }
+
+ if (htod) {
+ testBandwidth((unsigned int) start, (unsigned int) end, (unsigned int) increment,
+ mode, HOST_TO_DEVICE, printmode, memMode, startDevice, endDevice, wc);
+ }
+
+ if (dtoh) {
+ testBandwidth((unsigned int) start, (unsigned int) end, (unsigned int) increment,
+ mode, DEVICE_TO_HOST, printmode, memMode, startDevice, endDevice, wc);
+ }
+
+ if (dtod) {
+ testBandwidth((unsigned int) start, (unsigned int) end, (unsigned int) increment,
+ mode, DEVICE_TO_DEVICE, printmode, memMode, startDevice, endDevice, wc);
+ }
+
+ // Ensure that we reset all CUDA Devices in question
+ for (int nDevice = startDevice; nDevice <= endDevice; nDevice++) {
+ cudaSetDevice(nDevice);
+ }
+
+ return 0;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Run a bandwidth test
+///////////////////////////////////////////////////////////////////////////////
+void
+testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
+ testMode mode, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice,
+ bool wc) {
+ switch (mode) {
+ case QUICK_MODE:
+ testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, endDevice, wc);
+ break;
+
+ case RANGE_MODE:
+ testBandwidthRange(start, end, increment, kind, printmode, memMode, startDevice, endDevice, wc);
+ break;
+
+ case SHMOO_MODE:
+ testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc);
+ break;
+
+ default:
+ break;
+ }
+}
+
+//////////////////////////////////////////////////////////////////////
+// Run a quick mode bandwidth test
+//////////////////////////////////////////////////////////////////////
+void
+testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice,
+ int endDevice, bool wc) {
+ testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, startDevice, endDevice, wc);
+}
+
+///////////////////////////////////////////////////////////////////////
+// Run a range mode bandwidth test
+//////////////////////////////////////////////////////////////////////
+void
+testBandwidthRange(unsigned int start, unsigned int end, unsigned int increment,
+ memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc) {
+ //count the number of copies we're going to run
+ unsigned int count = 1 + ((end - start) / increment);
+
+ unsigned int *memSizes = (unsigned int *) malloc(count * sizeof(unsigned int));
+ double *bandwidths = (double *) malloc(count * sizeof(double));
+
+ // Before calculating the cumulative bandwidth, initialize bandwidths array to NULL
+ for (unsigned int i = 0; i < count; i++) {
+ bandwidths[i] = 0.0;
+ }
+
+ // Use the device asked by the user
+ for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
+ cudaSetDevice(currentDevice);
+
+ //run each of the copies
+ for (unsigned int i = 0; i < count; i++) {
+
+ memSizes[i] = start + i * increment;
+
+ switch (kind) {
+ case DEVICE_TO_HOST:
+ bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc);
+ break;
+
+ case HOST_TO_DEVICE:
+ bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc);
+ break;
+
+ case DEVICE_TO_DEVICE:
+ bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]);
+ break;
+ }
+ }
+ } // Complete the bandwidth computation on all the devices
+
+ //print results
+ if (printmode == CSV) {
+ printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
+ } else {
+ printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
+ }
+
+ //clean up
+ free(memSizes);
+ free(bandwidths);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Intense shmoo mode - covers a large range of values with varying increments
+//////////////////////////////////////////////////////////////////////////////
+void
+testBandwidthShmoo(memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc) {
+ //count the number of copies to make
+ unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB)
+ + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB)
+ + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB)
+ + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB)
+ + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB)
+ + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB)
+ + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
+
+ unsigned int *memSizes = (unsigned int *) malloc(count * sizeof(unsigned int));
+ double *bandwidths = (double *) malloc(count * sizeof(double));
+
+ // Before calculating the cumulative bandwidth, initialize bandwidths array to NULL
+ for (unsigned int i = 0; i < count; i++) {
+ bandwidths[i] = 0.0;
+ }
+
+ // Use the device asked by the user
+ for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
+ cudaSetDevice(currentDevice);
+ //Run the shmoo
+ int iteration = 0;
+ unsigned int memSize = 0;
+
+ while (memSize <= SHMOO_MEMSIZE_MAX) {
+ if (memSize < SHMOO_LIMIT_20KB) {
+ memSize += SHMOO_INCREMENT_1KB;
+ } else if (memSize < SHMOO_LIMIT_50KB) {
+ memSize += SHMOO_INCREMENT_2KB;
+ } else if (memSize < SHMOO_LIMIT_100KB) {
+ memSize += SHMOO_INCREMENT_10KB;
+ } else if (memSize < SHMOO_LIMIT_1MB) {
+ memSize += SHMOO_INCREMENT_100KB;
+ } else if (memSize < SHMOO_LIMIT_16MB) {
+ memSize += SHMOO_INCREMENT_1MB;
+ } else if (memSize < SHMOO_LIMIT_32MB) {
+ memSize += SHMOO_INCREMENT_2MB;
+ } else {
+ memSize += SHMOO_INCREMENT_4MB;
+ }
+
+ memSizes[iteration] = memSize;
+
+ switch (kind) {
+ case DEVICE_TO_HOST:
+ bandwidths[iteration] += testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
+ break;
+
+ case HOST_TO_DEVICE:
+ bandwidths[iteration] += testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
+ break;
+
+ case DEVICE_TO_DEVICE:
+ bandwidths[iteration] += testDeviceToDeviceTransfer(memSizes[iteration]);
+ break;
+ }
+
+ iteration++;
+ printf(".");
+ }
+ } // Complete the bandwidth computation on all the devices
+
+ //print results
+ printf("\n");
+
+ if (CSV == printmode) {
+ printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
+ } else {
+ printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
+ }
+
+ //clean up
+ free(memSizes);
+ free(bandwidths);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// test the bandwidth of a device to host memcopy of a specific size
+///////////////////////////////////////////////////////////////////////////////
+float
+testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc) {
+ StopWatchInterface *timer = NULL;
+ float elapsedTimeInMs = 0.0f;
+ float bandwidthInGBs = 0.0f;
+ unsigned char *h_idata = NULL;
+ unsigned char *h_odata = NULL;
+ cudaEvent_t start, stop;
+
+ sdkCreateTimer(&timer);
+ checkCudaErrors(cudaEventCreate(&start));
+ checkCudaErrors(cudaEventCreate(&stop));
+
+ //allocate host memory
+ if (PINNED == memMode) {
+ //pinned memory mode - use special function to get OS-pinned memory
+#if CUDART_VERSION >= 2020
+ checkCudaErrors(cudaHostAlloc((void **) &h_idata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
+ checkCudaErrors(cudaHostAlloc((void **) &h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
+#else
+ checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize));
+ checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
+#endif
+ } else {
+ //pageable memory mode - use malloc
+ h_idata = (unsigned char *) malloc(memSize);
+ h_odata = (unsigned char *) malloc(memSize);
+
+ if (h_idata == 0 || h_odata == 0) {
+ fprintf(stderr, "Not enough memory avaialable on host to run test!\n");
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ //initialize the memory
+ for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
+ h_idata[i] = (unsigned char) (i & 0xff);
+ }
+
+ // allocate device memory
+ unsigned char *d_idata;
+ checkCudaErrors(cudaMalloc((void **) &d_idata, memSize));
+
+ //initialize the device memory
+ checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize,
+ cudaMemcpyHostToDevice));
+
+ //copy data from GPU to Host
+ sdkStartTimer(&timer);
+ checkCudaErrors(cudaEventRecord(start, 0));
+
+ if (PINNED == memMode) {
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
+ checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize,
+ cudaMemcpyDeviceToHost, 0));
+ }
+ } else {
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
+ checkCudaErrors(cudaMemcpy(h_odata, d_idata, memSize,
+ cudaMemcpyDeviceToHost));
+ }
+ }
+
+ checkCudaErrors(cudaEventRecord(stop, 0));
+
+ // make sure GPU has finished copying
+ checkCudaErrors(cudaDeviceSynchronize());
+ //get the total elapsed time in ms
+ sdkStopTimer(&timer);
+ checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
+
+ if (PINNED != memMode || bDontUseGPUTiming) {
+ elapsedTimeInMs = sdkGetTimerValue(&timer);
+ }
+
+ //calculate bandwidth in GB/s
+ double time_s = elapsedTimeInMs / 1e3;
+ bandwidthInGBs = (memSize * (float) MEMCOPY_ITERATIONS) / (double) 1e9;
+ bandwidthInGBs = bandwidthInGBs / time_s;
+ //clean up memory
+ checkCudaErrors(cudaEventDestroy(stop));
+ checkCudaErrors(cudaEventDestroy(start));
+ sdkDeleteTimer(&timer);
+
+ if (PINNED == memMode) {
+ checkCudaErrors(cudaFreeHost(h_idata));
+ checkCudaErrors(cudaFreeHost(h_odata));
+ } else {
+ free(h_idata);
+ free(h_odata);
+ }
+
+ checkCudaErrors(cudaFree(d_idata));
+
+ return bandwidthInGBs;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//! test the bandwidth of a host to device memcopy of a specific size
+///////////////////////////////////////////////////////////////////////////////
+float
+testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc) {
+ StopWatchInterface *timer = NULL;
+ float elapsedTimeInMs = 0.0f;
+ float bandwidthInGBs = 0.0f;
+ cudaEvent_t start, stop;
+ sdkCreateTimer(&timer);
+ checkCudaErrors(cudaEventCreate(&start));
+ checkCudaErrors(cudaEventCreate(&stop));
+
+ //allocate host memory
+ unsigned char *h_odata = NULL;
+
+ if (PINNED == memMode) {
+#if CUDART_VERSION >= 2020
+ //pinned memory mode - use special function to get OS-pinned memory
+ checkCudaErrors(cudaHostAlloc((void **) &h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
+#else
+ //pinned memory mode - use special function to get OS-pinned memory
+ checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
+#endif
+ } else {
+ //pageable memory mode - use malloc
+ h_odata = (unsigned char *) malloc(memSize);
+
+ if (h_odata == 0) {
+ fprintf(stderr, "Not enough memory available on host to run test!\n");
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ unsigned char *h_cacheClear1 = (unsigned char *) malloc(CACHE_CLEAR_SIZE);
+ unsigned char *h_cacheClear2 = (unsigned char *) malloc(CACHE_CLEAR_SIZE);
+
+ if (h_cacheClear1 == 0 || h_cacheClear2 == 0) {
+ fprintf(stderr, "Not enough memory available on host to run test!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ //initialize the memory
+ for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
+ h_odata[i] = (unsigned char) (i & 0xff);
+ }
+
+ for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++) {
+ h_cacheClear1[i] = (unsigned char) (i & 0xff);
+ h_cacheClear2[i] = (unsigned char) (0xff - (i & 0xff));
+ }
+
+ //allocate device memory
+ unsigned char *d_idata;
+ checkCudaErrors(cudaMalloc((void **) &d_idata, memSize));
+
+ sdkStartTimer(&timer);
+ checkCudaErrors(cudaEventRecord(start, 0));
+
+ //copy host memory to device memory
+ if (PINNED == memMode) {
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
+ checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize,
+ cudaMemcpyHostToDevice, 0));
+ }
+ } else {
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
+ checkCudaErrors(cudaMemcpy(d_idata, h_odata, memSize,
+ cudaMemcpyHostToDevice));
+ }
+ }
+
+ checkCudaErrors(cudaEventRecord(stop, 0));
+ checkCudaErrors(cudaDeviceSynchronize());
+ //total elapsed time in ms
+ sdkStopTimer(&timer);
+ checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
+
+ if (PINNED != memMode || bDontUseGPUTiming) {
+ elapsedTimeInMs = sdkGetTimerValue(&timer);
+ }
+
+ sdkResetTimer(&timer);
+
+ //calculate bandwidth in GB/s
+ double time_s = elapsedTimeInMs / 1e3;
+ bandwidthInGBs = (memSize * (float) MEMCOPY_ITERATIONS) / (double) 1e9;
+ bandwidthInGBs = bandwidthInGBs / time_s;
+ //clean up memory
+ checkCudaErrors(cudaEventDestroy(stop));
+ checkCudaErrors(cudaEventDestroy(start));
+ sdkDeleteTimer(&timer);
+
+ if (PINNED == memMode) {
+ checkCudaErrors(cudaFreeHost(h_odata));
+ } else {
+ free(h_odata);
+ }
+
+ free(h_cacheClear1);
+ free(h_cacheClear2);
+ checkCudaErrors(cudaFree(d_idata));
+
+ return bandwidthInGBs;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//! test the bandwidth of a device to device memcopy of a specific size
+///////////////////////////////////////////////////////////////////////////////
+float
+testDeviceToDeviceTransfer(unsigned int memSize) {
+ StopWatchInterface *timer = NULL;
+ float elapsedTimeInMs = 0.0f;
+ float bandwidthInGBs = 0.0f;
+ cudaEvent_t start, stop;
+
+ sdkCreateTimer(&timer);
+ checkCudaErrors(cudaEventCreate(&start));
+ checkCudaErrors(cudaEventCreate(&stop));
+
+ //allocate host memory
+ unsigned char *h_idata = (unsigned char *) malloc(memSize);
+
+ if (h_idata == 0) {
+ fprintf(stderr, "Not enough memory avaialable on host to run test!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ //initialize the host memory
+ for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
+ h_idata[i] = (unsigned char) (i & 0xff);
+ }
+
+ //allocate device memory
+ unsigned char *d_idata;
+ checkCudaErrors(cudaMalloc((void **) &d_idata, memSize));
+ unsigned char *d_odata;
+ checkCudaErrors(cudaMalloc((void **) &d_odata, memSize));
+
+ //initialize memory
+ checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize,
+ cudaMemcpyHostToDevice));
+
+ //run the memcopy
+ sdkStartTimer(&timer);
+ checkCudaErrors(cudaEventRecord(start, 0));
+
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
+ checkCudaErrors(cudaMemcpy(d_odata, d_idata, memSize,
+ cudaMemcpyDeviceToDevice));
+ }
+
+ checkCudaErrors(cudaEventRecord(stop, 0));
+
+ //Since device to device memory copies are non-blocking,
+ //cudaDeviceSynchronize() is required in order to get
+ //proper timing.
+ checkCudaErrors(cudaDeviceSynchronize());
+
+ //get the total elapsed time in ms
+ sdkStopTimer(&timer);
+ checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
+
+ if (bDontUseGPUTiming) {
+ elapsedTimeInMs = sdkGetTimerValue(&timer);
+ }
+
+ //calculate bandwidth in GB/s
+ double time_s = elapsedTimeInMs / 1e3;
+ bandwidthInGBs = (2.0f * memSize * (float) MEMCOPY_ITERATIONS) / (double) 1e9;
+ bandwidthInGBs = bandwidthInGBs / time_s;
+
+ //clean up memory
+ sdkDeleteTimer(&timer);
+ free(h_idata);
+ checkCudaErrors(cudaEventDestroy(stop));
+ checkCudaErrors(cudaEventDestroy(start));
+ checkCudaErrors(cudaFree(d_idata));
+ checkCudaErrors(cudaFree(d_odata));
+
+ return bandwidthInGBs;
+}
+
+/////////////////////////////////////////////////////////
+//print results in an easily read format
+////////////////////////////////////////////////////////
+void printResultsReadable(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind,
+ memoryMode memMode, int iNumDevs, bool wc) {
+ printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs);
+ printf(" %s Memory Transfers\n", sMemoryMode[memMode]);
+
+ if (wc) {
+ printf(" Write-Combined Memory Writes are Enabled");
+ }
+
+ printf(" Transfer Size (Bytes)\tBandwidth(GB/s)\n");
+ unsigned int i;
+
+ for (i = 0; i < (count - 1); i++) {
+ printf(" %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
+ }
+
+ printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//print results in a database format
+///////////////////////////////////////////////////////////////////////////
+void
+printResultsCSV(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode,
+ int iNumDevs, bool wc) {
+ std::string sConfig;
+
+ // log config information
+ if (kind == DEVICE_TO_DEVICE) {
+ sConfig += "D2D";
+ } else {
+ if (kind == DEVICE_TO_HOST) {
+ sConfig += "D2H";
+ } else if (kind == HOST_TO_DEVICE) {
+ sConfig += "H2D";
+ }
+
+ if (memMode == PAGEABLE) {
+ sConfig += "-Paged";
+ } else if (memMode == PINNED) {
+ sConfig += "-Pinned";
+
+ if (wc) {
+ sConfig += "-WriteCombined";
+ }
+ }
+ }
+
+ unsigned int i;
+ double dSeconds = 0.0;
+
+ for (i = 0; i < count; i++) {
+ dSeconds = (double) memSizes[i] / (bandwidths[i] * (double) (1 << 20));
+ printf("bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u bytes, NumDevsUsed = %d\n",
+ sConfig.c_str(), bandwidths[i], dSeconds, memSizes[i], iNumDevs);
+ }
+}
+
+///////////////////////////////////////////////////////////////////////////
+//Print help screen
+///////////////////////////////////////////////////////////////////////////
+void printHelp(void) {
+ printf("Usage: bandwidthTest [OPTION]...\n");
+ printf("Test the bandwidth for device to host, host to device, and device to device transfers\n");
+ printf("\n");
+ printf("Example: measure the bandwidth of device to host pinned memory copies in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n");
+ printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 --increment=1024 --dtoh\n");
+
+ printf("\n");
+ printf("Options:\n");
+ printf("--help\tDisplay this help menu\n");
+ printf("--csv\tPrint results as a CSV\n");
+ printf("--device=[deviceno]\tSpecify the device device to be used\n");
+ printf(" all - compute cumulative bandwidth on all the devices\n");
+ printf(" 0,1,2,...,n - Specify any particular device to be used\n");
+ printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n");
+ printf(" pageable - pageable memory\n");
+ printf(" pinned - non-pageable system memory\n");
+ printf("--mode=[MODE]\tSpecify the mode to use\n");
+ printf(" quick - performs a quick measurement\n");
+ printf(" range - measures a user-specified range of values\n");
+ printf(" shmoo - performs an intense shmoo of a large range of values\n");
+
+ printf("--htod\tMeasure host to device transfers\n");
+ printf("--dtoh\tMeasure device to host transfers\n");
+ printf("--dtod\tMeasure device to device transfers\n");
+#if CUDART_VERSION >= 2020
+ printf("--wc\tAllocate pinned memory as write-combined\n");
+#endif
+ printf("--cputiming\tForce CPU-based timing always\n");
+
+ printf("Range mode options\n");
+ printf("--start=[SIZE]\tStarting transfer size in bytes\n");
+ printf("--end=[SIZE]\tEnding transfer size in bytes\n");
+ printf("--increment=[SIZE]\tIncrement size in bytes\n");
+}
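The bandwidth figure reported by each transfer function above is simply total bytes moved divided by elapsed time. A stripped-down sketch of the pinned device-to-host path, with an arbitrarily chosen buffer size and iteration count and error checking omitted for brevity:

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        const size_t bytes = 32 << 20;   // 32 MiB per copy, arbitrary for this sketch
        const int iters = 100;

        unsigned char *h = nullptr, *d = nullptr;
        cudaHostAlloc((void **) &h, bytes, cudaHostAllocDefault);  // pinned host buffer
        cudaMalloc((void **) &d, bytes);

        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        cudaEventRecord(start, 0);
        for (int i = 0; i < iters; ++i)
            cudaMemcpyAsync(h, d, bytes, cudaMemcpyDeviceToHost, 0);
        cudaEventRecord(stop, 0);
        cudaDeviceSynchronize();          // wait for all queued copies to finish

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);

        // GB/s = (bytes per copy * copies) / 1e9 / elapsed seconds
        double gbps = (double) bytes * iters / 1e9 / (ms / 1e3);
        std::printf("Device-to-host (pinned): %.1f GB/s\n", gbps);

        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        cudaFree(d);
        cudaFreeHost(h);
        return 0;
    }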
diff --git a/cudaTensorCoreGemm.cu b/cudaTensorCoreGemm.cu
new file mode 100644
index 0000000..c583e1b
--- /dev/null
+++ b/cudaTensorCoreGemm.cu
@@ -0,0 +1,586 @@
+//
+// Created by kwoodle on 5/10/20.
+//
+
+//
+// Created by kwoodle on 5/9/20.
+//
+
+/*
+ * Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// CUDA sample demonstrating a GEMM computation using the Warp Matrix Multiply
+// and Accumulate API introduced in CUDA 9.
+
+// In this program, the compute_gemm kernel computes the result of a matrix multiplication
+// and addition: D = alpha * A * B + beta * C. The dimensions of both C and D matrices
+// are M_GLOBAL x N_GLOBAL. The A matrix is M_GLOBAL x K_GLOBAL (row-major), the B matrix
+// is K_GLOBAL x N_GLOBAL (column-major).
+// In that kernel, each CTA computes one 128 x 128 tile of the resulting matrix
+// per iteration. When the tile is computed, the CTA stores it to the global memory
+// and begins a new iteration, selecting a new 128 x 128 tile to compute.
+// Each CTA consists of eight warps. For the 128 x 128 tile, each warp computes eight
+// 16 x 16 subtiles, organized in a 2 x 4 two-dimensional array.
+// Warps compute the 16 x 16 subtiles using nvcuda::wmma::mma_sync operations by
+// moving through the K_GLOBAL dimension of the A and B matrices and accumulating
+// the intermediate result in the local thread state.
+
+// There are a number of simple optimizations used in the algorithm:
+// - The CTA copies the 128 x 128 tile of the C matrix from the global memory to
+// shared memory. After that is done, each warp loads the C matrix fragments from
+// shared memory, thus avoiding a random global memory access.
+// - On each internal iteration, the CTA copies a portion of the A and B matrices from
+// global memory to shared memory. After that, all warps in the CTA reuse the A and B
+// data from shared memory, thus reducing the number of data copies from global memory.
+// - The portions of the A and B matrices are stored in shared memory with an additional
+// padding (skew) to reduce the number of shared memory access bank conflicts.
+// (See a detailed explanation near the SKEW_HALF macro definition.)
+// - When the CTA finishes computing the tiles of the resulting matrix, each warp stores
+// its subtiles to shared memory. The CTA then copies the shared memory contents to
+// global memory, again avoiding redundant random global memory accesses.
+// - Note that the CTA tile size is chosen to maximize the GPU register utilization,
+// but carefully enough to avoid local memory use.
+
+#include <assert.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <mma.h>
+
+// helper functions and utilities to work with CUDA
+#include <helper_cuda.h>
+#include <helper_functions.h>
+
+// Externally configurable parameters.
+
+#ifndef CPU_DEBUG
+// Set this to 1 to verify the correctness of the GPU-computed matrix.
+#define CPU_DEBUG 0
+#endif
+
+#ifndef SHARED_MEMORY_LIMIT_64K
+// Set this to 0 to use more than 64 Kb of shared memory to cache data, to
+// improve the performance of the computations on GPU.
+// Note that you need a GPU that can have more than 64 Kb of shared memory
+// per multiprocessor.
+#define SHARED_MEMORY_LIMIT_64K 1
+#endif
+
+// GPU configuration.
+
+#define WARP_SIZE 32
+
+// MMA matrix tile dimensions.
+
+#define M 16
+#define N 16
+#define K 16
+
+#define WMMA_M 16
+#define WMMA_N 16
+#define WMMA_K 16
+
+// GEMM configuration.
+
+#define M_TILES 256
+#define N_TILES 256
+#define K_TILES 256
+
+#define M_GLOBAL (M * M_TILES)
+#define N_GLOBAL (N * N_TILES)
+#define K_GLOBAL (K * K_TILES)
+
+#define C_LAYOUT wmma::mem_row_major
+
+// Implementation constants.
+
+#define WARPS_PER_BLOCK 8
+#define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK)
+
+#if SHARED_MEMORY_LIMIT_64K
+// With only 64 Kb shared memory available, we can fit two 8-tile chunks of
+// the A and B matrix data, that are 16 * 16 * 8 * 8 * 2 = 32 Kb each
+// (i.e. two 8x8 arrays of tiles of 16x16 half-typed elements per CTA).
+// But we cannot account the 8 Kb total skew overhead, without which the performance
+// would be severely impacted. So we choose to reduce the chunk size in half,
+// i.e. the amount of A and B matrix data we cache in shared memory.
+// Accordingly, this doubles the number of outer iterations across the global K
+// dimension, which only slightly impacts the performance.
+#define CHUNK_K 4
+#else
+#define CHUNK_K 8
+#endif
+
+#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(half))
+#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4))
+#define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES)
+#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP)
+
+#define BLOCK_ROW_WARPS 2
+#define BLOCK_COL_WARPS 4
+
+#define WARP_ROW_TILES 4
+#define WARP_COL_TILES 2
+
+#define BLOCK_ROW_TILES (WARP_ROW_TILES * BLOCK_ROW_WARPS)
+#define BLOCK_COL_TILES (WARP_COL_TILES * BLOCK_COL_WARPS)
+
+#define GLOBAL_MEM_STRIDE N_GLOBAL
+
+#define SHMEM_STRIDE (N * BLOCK_ROW_TILES)
+#define SHMEM_OFFSET (N * WARP_ROW_TILES)
+
+// The macro below is used to shift rows of the A matrix and columns of the B matrix
+// in shared memory to minimize possible bank conflicts.
+// Before performing the nvcuda::wmma::mma_sync operation, the warp must load the matrix
+// data using the nvcuda::wmma::load_matrix_sync operation. Although the memory access pattern
+// is not specified for that function, each lane in the warp can read one or multiple matrix
+// elements from different matrix rows or columns.
+// For shared memory, such access can result in bank conflicts if different rows / columns
+// of the matrix map to the same bank. By shifting each row and column by a few bytes, we
+// make sure that they map to different banks, thus reducing the number of possible bank
+// conflicts.
+// The number of 8 two-byte "half" elements is chosen as the minimum possible shift because
+// we must keep each row and column 128-bit aligned, as required by nvcuda::wmma::load_matrix_sync.
+#define SKEW_HALF 8
+
+#define checkKernelErrors(expr) do { \
+ expr; \
+ \
+ cudaError_t __err = cudaGetLastError(); \
+ if (__err != cudaSuccess) { \
+ printf("Line %d: '%s' failed: %s\n", __LINE__, # expr, cudaGetErrorString(__err)); \
+ abort(); \
+ } \
+} while(0)
+
+using namespace nvcuda;
+
+__host__ void init_host_matrices(half *a, half *b, float *c) {
+ for (int i = 0; i < M_GLOBAL; i++) {
+ for (int j = 0; j < K_GLOBAL; j++) {
+ a[i * K_GLOBAL + j] = (half) (rand() % 3);
+ }
+ }
+
+ for (int i = 0; i < N_GLOBAL; i++) {
+ for (int j = 0; j < K_GLOBAL; j++) {
+ b[i * K_GLOBAL + j] = (half) (rand() % 3);
+ }
+ }
+
+ for (int t = 0; t < M_GLOBAL * N_GLOBAL; t++) {
+ c[t] = (float) (rand() % 3);
+ }
+}
+
+__global__ void compute_gemm(const half *A, const half *B, const float *C, float *D, float alpha, float beta) {
+ extern __shared__ half shmem[][CHUNK_K * K + SKEW_HALF];
+
+ // Warp and lane identification.
+ const unsigned int warpId = threadIdx.x / WARP_SIZE;
+ const unsigned int laneId = threadIdx.x % WARP_SIZE;
+
+ // Offset in shared memory from which the B matrix is stored.
+ const size_t shmem_idx_b_off = BLOCK_COL_TILES * M;
+
+ // This pointer is used to access the C and D matrix tiles this warp computes.
+ float *shmem_warp_tile_ptr =
+ (float *) &shmem[0][0] + (warpId / 2) * SHMEM_STRIDE * K * 2 + (warpId % 2) * SHMEM_OFFSET;
+
+ // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory.
+ float *shmem_warp_stream_ptr = (float *) &shmem[0][0] + warpId * SHMEM_STRIDE * K;
+
+ // Adjust the beta scaler, as it'll be multiplied by alpha at the end of
+ // each tile computation. Technically this is not generally correct (may result
+ // in a loss of precision). Zero still needs to be specially handled though.
+ beta /= alpha;
+
+ // Each CTA slides along the 128 x 128 tiles from the top left corner of the matrix to the
+ // right and down, and selects the next tile to compute. Once there's no such tile,
+ // all warps in this CTA exit.
+ for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) {
+ const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES);
+ const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES;
+
+ // Stop when there are no more D matrix tiles to compute in this CTA.
+ if (block_tile_i >= M_TILES) {
+ break;
+ }
+
+ // This warp's pointer to the C matrix data to copy memory from to shared memory.
+ const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N;
+ const float *src_gmem_warp_stream_ptr = &C[gmem_idx];
+
+ // Stream multiple C tiles to shared memory.
+#pragma unroll
+ for (int i = 0; i < K; i++) {
+ typedef int4 copy_t;
+
+ *((copy_t *) (shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) =
+ *((copy_t *) (src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId);
+ }
+
+ __syncthreads();
+
+ // These fragments will accumulate the result of A and B matrix fragment multiplications
+ // along the K_GLOBAL dimension.
+ wmma::fragment<wmma::accumulator, M, N, K, float> c[WARP_COL_TILES][WARP_ROW_TILES];
+
+ // Load the C matrix tiles into fragments from shared memory.
+#pragma unroll
+ for (int i = 0; i < WARP_COL_TILES; i++) {
+#pragma unroll
+ for (int j = 0; j < WARP_ROW_TILES; j++) {
+ const float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N;
+
+ wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT);
+ }
+ }
+
+ __syncthreads();
+
+ // Scale the C matrix.
+#pragma unroll
+ for (int i = 0; i < WARP_COL_TILES; i++) {
+#pragma unroll
+ for (int j = 0; j < WARP_ROW_TILES; j++) {
+#pragma unroll
+ for (int t = 0; t < c[i][j].num_elements; t++) {
+ c[i][j].x[t] *= beta;
+ }
+ }
+ }
+
+ // Select what warp copies what matrix to shared memory.
+ // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix.
+ const half *warp_ptr = (warpId < 4) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % 4) * 2) :
+ (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % 4) * 2);
+
+ // Go through the global K dimension by a fixed step at a time.
+#pragma unroll
+ for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) {
+ // Copy slices of the A and B matrices to shared memory.
+ // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix.
+ size_t shmem_idx = warpId < (WARPS_PER_BLOCK / 2) ? (M * (warpId % (WARPS_PER_BLOCK / 2)) * 2) :
+ (N * (warpId % (WARPS_PER_BLOCK / 2)) * 2 + shmem_idx_b_off);
+
+ // First half of the warp copies the first row / column of the matrix,
+ // the second half of the warp copies the next.
+ int4 *lane_ptr = (int4 *) (warp_ptr + tile_k * K + (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL) +
+ (laneId % CHUNK_COPY_LINE_LANES);
+
+ // Shift the second half of the warp to the next row / column in the shared memory.
+ shmem_idx += laneId / CHUNK_COPY_LINE_LANES;
+
+#pragma unroll
+ for (int i = 0; i < ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2; i++) {
+ // Copy 16 bytes at once in each lane.
+ *((int4 *) &shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = *lane_ptr;
+
+ // Advance the global memory pointer and the shared memory index.
+ lane_ptr = (int4 *) ((half *) lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP);
+ shmem_idx += CHUNK_COPY_LINES_PER_WARP;
+ }
+
+ __syncthreads();
+
+ // Compute a grid of C matrix tiles in each warp.
+#pragma unroll
+ for (int k_step = 0; k_step < CHUNK_K; k_step++) {
+ wmma::fragment<wmma::matrix_a, M, N, K, half, wmma::row_major> a[WARP_COL_TILES];
+ wmma::fragment<wmma::matrix_b, M, N, K, half, wmma::col_major> b[WARP_ROW_TILES];
+
+#pragma unroll
+ for (int i = 0; i < WARP_COL_TILES; i++) {
+ size_t shmem_idx_a = (warpId / 2) * M * 2 + (i * M);
+ const half *tile_ptr = &shmem[shmem_idx_a][k_step * K];
+
+ wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_HALF);
+
+#pragma unroll
+ for (int j = 0; j < WARP_ROW_TILES; j++) {
+ if (i == 0) {
+ // Load the B matrix fragment once, because it is going to be reused
+ // against the other A matrix fragments.
+ size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId % 2) + (j * N);
+ const half *tile_ptr = &shmem[shmem_idx_b][k_step * K];
+
+ wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_HALF);
+ }
+
+ wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]);
+ }
+ }
+ }
+
+ __syncthreads();
+ }
+
+ // Store the D fragments to shared memory.
+#pragma unroll
+ for (int i = 0; i < WARP_COL_TILES; i++) {
+#pragma unroll
+ for (int j = 0; j < WARP_ROW_TILES; j++) {
+#pragma unroll
+ // Uniform, point-wise transformations of ALL fragment elements by ALL threads in the
+ // warp are well-defined even though element indices within fragment storage are not defined.
+ for (int t = 0; t < c[i][j].num_elements; t++)
+ c[i][j].x[t] *= alpha;
+
+ float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N;
+
+ wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT);
+ }
+ }
+
+ __syncthreads();
+
+ // Now that shared memory contains all the D tiles, stream them to global memory.
+ float *dst_gmem_warp_stream_ptr = &D[gmem_idx];
+
+#pragma unroll
+ for (int i = 0; i < K; i++) {
+ *((int4 *) (dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) =
+ *((int4 *) (shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId);
+ }
+
+ __syncthreads();
+ }
+}
+
+
+// Performs an MxNxK GEMM (C=alpha*A*B + beta*C) assuming:
+// 1) Matrices are packed in memory.
+// 2) M, N and K are multiples of 16.
+// 3) Neither A nor B are transposed.
+// Note: This is a less performant version of the compute_gemm kernel. It is designed for
+// demonstration purposes only to show the CUDA WMMA API use without relying on
+// availability of the shared memory.
+__global__ void
+simple_wmma_gemm(half *a, half *b, float *c, float *d, int m_ld, int n_ld, int k_ld, float alpha, float beta) {
+ // Leading dimensions. Packed with no transpositions.
+ int lda = m_ld;
+ int ldb = k_ld;
+ int ldc = n_ld;
+
+ // Tile using a 2D grid
+ int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
+ int warpN = (blockIdx.y * blockDim.y + threadIdx.y);
+
+ // Declare the fragments
+ wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
+ wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> b_frag;
+ wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> acc_frag;
+ wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag;
+
+ wmma::fill_fragment(acc_frag, 0.0f);
+
+ // Loop over k
+ for (int i = 0; i < k_ld; i += WMMA_K) {
+ int aCol = i;
+ int aRow = warpM * WMMA_M;
+
+ int bCol = i;
+ int bRow = warpN * WMMA_N;
+
+ // Bounds checking
+ if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) {
+ // Load the inputs
+ wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda);
+ wmma::load_matrix_sync(b_frag, b + bCol + bRow * ldb, ldb);
+
+ // Perform the matrix multiplication
+ wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
+
+ }
+ }
+
+ // Load in the current value of c, scale it by beta, and add this our result scaled by alpha
+ int cCol = warpN * WMMA_N;
+ int cRow = warpM * WMMA_M;
+
+ if (cRow < m_ld && cCol < n_ld) {
+ wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, wmma::mem_row_major);
+
+ for (int i = 0; i < c_frag.num_elements; i++) {
+ c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
+ }
+
+ // Store the output
+ wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, wmma::mem_row_major);
+ }
+}
+
+__host__ void matMultiplyOnHost(half *A, half *B, float *C,
+ float alpha, float beta,
+ int numARows, int numAColumns,
+ int numBRows, int numBColumns,
+ int numCRows, int numCColumns) {
+ for (int i = 0; i < numCRows; i++) {
+ for (int j = 0; j < numCColumns; j++) {
+ float temp = 0.0;
+
+ for (int k = 0; k < numAColumns; k++) {
+ temp += (float) A[i * numAColumns + k] * (float) B[j * numBRows + k];
+ }
+
+ C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j];
+ }
+ }
+}
+
+int main(int argc, char **argv) {
+ printf("Initializing...\n");
+
+ int dev = findCudaDevice(argc, (const char **) argv);
+
+ cudaDeviceProp deviceProp;
+ checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
+
+ // Tensor cores require a GPU of Volta (SM7X) architecture or higher.
+ if (deviceProp.major < 7) {
+ printf("cudaTensorCoreGemm requires requires SM 7.0 or higher to use Tensor Cores. Exiting...\n");
+ exit(EXIT_WAIVED);
+ }
+
+ printf("M: %d (%d x %d)\n", M_GLOBAL, M, M_TILES);
+ printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES);
+ printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES);
+
+ half *A_h = NULL;
+ half *B_h = NULL;
+ float *C_h = NULL;
+#if CPU_DEBUG
+ float *result_hD = NULL;
+ float *result_host = NULL;
+#endif
+
+ A_h = (half *) malloc(sizeof(half) * M_GLOBAL * K_GLOBAL);
+ B_h = (half *) malloc(sizeof(half) * K_GLOBAL * N_GLOBAL);
+ C_h = (float *) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
+#if CPU_DEBUG
+ result_hD = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
+ result_host = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
+#endif
+
+ half *A = NULL;
+ half *B = NULL;
+ float *C = NULL;
+ float *D = NULL;
+
+ checkCudaErrors(cudaMalloc((void **) &A, sizeof(half) * M_GLOBAL * K_GLOBAL));
+ checkCudaErrors(cudaMalloc((void **) &B, sizeof(half) * N_GLOBAL * K_GLOBAL));
+ checkCudaErrors(cudaMalloc((void **) &C, sizeof(float) * M_GLOBAL * N_GLOBAL));
+ checkCudaErrors(cudaMalloc((void **) &D, sizeof(float) * M_GLOBAL * N_GLOBAL));
+
+ assert(((unsigned long long) A) % 128 == 0);
+ assert(((unsigned long long) B) % 128 == 0);
+ assert(((unsigned long long) C) % 128 == 0);
+ assert(((unsigned long long) D) % 128 == 0);
+
+ init_host_matrices(A_h, B_h, C_h);
+
+ printf("Preparing data for GPU...\n");
+
+ checkCudaErrors(cudaMemcpy(A, A_h, sizeof(half) * M_GLOBAL * K_GLOBAL, cudaMemcpyHostToDevice));
+ checkCudaErrors(cudaMemcpy(B, B_h, sizeof(half) * N_GLOBAL * K_GLOBAL, cudaMemcpyHostToDevice));
+ checkCudaErrors(cudaMemcpy(C, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL, cudaMemcpyHostToDevice));
+ checkCudaErrors(cudaMemset(D, 0, sizeof(float) * M_GLOBAL * N_GLOBAL));
+
+ enum {
+ // Compute the right amount of shared memory to request.
+ // We need shared memory to hold per-CTA C and D matrix tiles, and to cache per-CTA chunks
+ // of the A and B matrices. Therefore, the right amount to request is the maximum of those
+ // two numbers.
+ SHMEM_SZ = MAX(sizeof(half) * (BLOCK_COL_TILES * M) * (CHUNK_K * K + SKEW_HALF) * 2,
+ M * (BLOCK_ROW_WARPS * WARP_ROW_TILES) * N * (BLOCK_COL_WARPS * WARP_COL_TILES) * sizeof(float))
+ };
+
+ printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL);
+
+ const float alpha = 1.1f;
+ const float beta = 1.2f;
+
+ cudaEvent_t start, stop;
+
+ checkCudaErrors(cudaEventCreate(&start));
+ checkCudaErrors(cudaEventCreate(&stop));
+ checkCudaErrors(cudaEventRecord(start));
+
+ // If enough shared memory available on the GPU use high performant kernel
+ if (deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) {
+ printf("Computing... using high performance kernel compute_gemm \n");
+
+ checkCudaErrors(cudaFuncSetAttribute(compute_gemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ));
+ checkKernelErrors(
+ (compute_gemm<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK, SHMEM_SZ>>>(A, B, C, D, alpha,
+ beta)));
+#if CPU_DEBUG
+ checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float)*M_GLOBAL*N_GLOBAL, cudaMemcpyDeviceToHost));
+#endif
+ } else {
+ dim3 gridDim;
+ dim3 blockDim;
+
+ // blockDim.x must be a multiple of warpSize
+ // 128x4 means we have 16 warps and a block computes a 64x64 output tile
+ blockDim.x = 128;
+ blockDim.y = 4;
+
+ gridDim.x = (M_GLOBAL + (WMMA_M * blockDim.x / 32 - 1)) / (WMMA_M * blockDim.x / 32);
+ gridDim.y = (N_GLOBAL + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y);
+
+ printf("Computing... using simple_wmma_gemm kernel\n");
+ simple_wmma_gemm<<<gridDim, blockDim>>>(A, B, C, D, M_GLOBAL, N_GLOBAL, K_GLOBAL, alpha, beta);
+#if CPU_DEBUG
+ checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float) * M_GLOBAL * N_GLOBAL, cudaMemcpyDeviceToHost));
+#endif
+ }
+
+ checkCudaErrors(cudaEventRecord(stop));
+ checkCudaErrors(cudaEventSynchronize(stop));
+
+#if CPU_DEBUG
+ printf("Verifying correctness of the computations...\n");
+
+ memcpy(result_host, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL);
+
+ matMultiplyOnHost(A_h, B_h, result_host,
+ alpha, beta,
+ M_GLOBAL, K_GLOBAL,
+ K_GLOBAL, N_GLOBAL,
+ M_GLOBAL, N_GLOBAL);
+
+ for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) {
+ if (fabs(result_hD[i] - result_host[i]) > 0.1f)
+ printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i], result_host[i]);
+ }
+ free(result_hD);
+ free(result_host);
+#endif
+
+ float milliseconds = 0;
+
+ checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop));
+
+ printf("Time: %f ms\n", milliseconds);
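+    // A GEMM of this shape performs 2 * M_GLOBAL * N_GLOBAL * K_GLOBAL floating-point
+    // operations (one multiply and one add per term of each inner product).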
+ printf("TFLOPS: %.2f\n", (((double) M_GLOBAL * N_GLOBAL * K_GLOBAL * 2) / (milliseconds / 1000.)) / 1e12);
+
+ free(A_h);
+ free(B_h);
+ free(C_h);
+ checkCudaErrors(cudaFree((void *) A));
+ checkCudaErrors(cudaFree((void *) B));
+ checkCudaErrors(cudaFree((void *) C));
+ checkCudaErrors(cudaFree((void *) D));
+
+ return 0;
+}
\ No newline at end of file
diff --git a/main.cu b/main.cu
new file mode 100644
index 0000000..bc8f460
--- /dev/null
+++ b/main.cu
@@ -0,0 +1,6 @@
+#include <iostream>
+
+int main() {
+ std::cout << "Hello, World!" << std::endl;
+ return 0;
+}
diff --git a/mpirun.sh b/mpirun.sh
new file mode 100755
index 0000000..50abf06
--- /dev/null
+++ b/mpirun.sh
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
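+# Launch the simpleMPI sample with 4 ranks from the CLion Debug build directory.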
+mpirun -np 4 cmake-build-debug/simpleMPI
diff --git a/run_bandwidthTest.sh b/run_bandwidthTest.sh
new file mode 100755
index 0000000..7f541be
--- /dev/null
+++ b/run_bandwidthTest.sh
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
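+# Shmoo mode sweeps a range of transfer sizes; pinned host memory typically gives higher bandwidth than pageable.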
+cmake-build-release/bandwidthTest --mode=shmoo --memory=pinned
\ No newline at end of file
diff --git a/simpleMPI.cpp b/simpleMPI.cpp
new file mode 100644
index 0000000..82bff92
--- /dev/null
+++ b/simpleMPI.cpp
@@ -0,0 +1,122 @@
+//
+// Created by kwoodle on 5/9/20.
+//
+
+/*
+ * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+
+/* Simple example demonstrating how to use MPI with CUDA
+*
+* Generate some random numbers on one node.
+* Dispatch them to all nodes.
+* Compute their square root on each node's GPU.
+* Compute the average of the results using MPI.
+*
+* simpleMPI.cpp: main program, compiled with mpicxx on linux/Mac platforms
+* on Windows, please download the Microsoft HPC Pack SDK 2008
+*/
+
+// MPI include
+#include <mpi.h>
+
+// System includes
+#include <iostream>
+
+using std::cout;
+using std::cerr;
+using std::endl;
+
+// User include
+#include "simpleMPI.h"
+
+// Error handling macros
+#define MPI_CHECK(call) \
+ if((call) != MPI_SUCCESS) { \
+ cerr << "MPI error calling \""#call"\"\n"; \
+ my_abort(-1); }
+
+
+// Host code
+// No CUDA here, only MPI
+int main(int argc, char *argv[]) {
+ // Dimensions of the dataset
+ int blockSize = 256;
+ int gridSize = 10000;
+ int dataSizePerNode = gridSize * blockSize;
+
+ // Initialize MPI state
+ MPI_CHECK(MPI_Init(&argc, &argv));
+
+ // Get our MPI node number and node count
+ int commSize, commRank;
+ MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &commSize));
+ MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &commRank));
+
+ // Generate some random numbers on the root node (node 0)
+ int dataSizeTotal = dataSizePerNode * commSize;
+ float *dataRoot = NULL;
+
+ if (commRank == 0) // Are we the root node?
+ {
+ cout << "Running on " << commSize << " nodes" << endl;
+ dataRoot = new float[dataSizeTotal];
+ initData(dataRoot, dataSizeTotal);
+ }
+
+ // Allocate a buffer on each node
+ float *dataNode = new float[dataSizePerNode];
+
+ // Dispatch a portion of the input data to each node
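+    // (dataRoot is NULL on non-root ranks; that is valid because MPI_Scatter only reads
+    // the send buffer on the root rank.)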
+ MPI_CHECK(MPI_Scatter(dataRoot,
+ dataSizePerNode,
+ MPI_FLOAT,
+ dataNode,
+ dataSizePerNode,
+ MPI_FLOAT,
+ 0,
+ MPI_COMM_WORLD));
+
+ if (commRank == 0) {
+ // No need for root data any more
+ delete[] dataRoot;
+ }
+
+ // On each node, run computation on GPU
+ computeGPU(dataNode, blockSize, gridSize);
+
+ // Reduction to the root node, computing the sum of output elements
+ float sumNode = sum(dataNode, dataSizePerNode);
+ float sumRoot;
+
+ MPI_CHECK(MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD));
+
+ if (commRank == 0) {
+ float average = sumRoot / dataSizeTotal;
+ cout << "Average of square roots is: " << average << endl;
+ }
+
+ // Cleanup
+ delete[] dataNode;
+ MPI_CHECK(MPI_Finalize());
+
+ if (commRank == 0) {
+ cout << "PASSED\n";
+ }
+
+ return 0;
+}
+
+// Shut down MPI cleanly if something goes wrong
+void my_abort(int err) {
+ cout << "Test FAILED\n";
+ MPI_Abort(MPI_COMM_WORLD, err);
+}
\ No newline at end of file
diff --git a/simpleMPI.cu b/simpleMPI.cu
new file mode 100644
index 0000000..8d53eb2
--- /dev/null
+++ b/simpleMPI.cu
@@ -0,0 +1,91 @@
+//
+// Created by kwoodle on 5/10/20.
+//
+
+/*
+ * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+
+/* Simple example demonstrating how to use MPI with CUDA
+*
+* Generate some random numbers on one node.
+* Dispatch them to all nodes.
+* Compute their square root on each node's GPU.
+* Compute the average of the results using MPI.
+*
+* simpleMPI.cu: GPU part, compiled with nvcc
+*/
+
+#include <iostream>
+
+using std::cerr;
+using std::endl;
+
+#include "simpleMPI.h"
+
+// Error handling macro
+#define CUDA_CHECK(call) \
+ if((call) != cudaSuccess) { \
+ cudaError_t err = cudaGetLastError(); \
+ cerr << "CUDA error calling \""#call"\", code is " << err << endl; \
+ my_abort(err); }
+
+
+// Device code
+// Very simple GPU Kernel that computes square roots of input numbers
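+// No bounds check is needed: computeGPU below launches exactly gridSize * blockSize threads
+// for gridSize * blockSize elements.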
+__global__ void simpleMPIKernel(float *input, float *output) {
+ int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ output[tid] = sqrt(input[tid]);
+}
+
+
+// Initialize an array with random data (between 0 and 1)
+void initData(float *data, int dataSize) {
+ for (int i = 0; i < dataSize; i++) {
+ data[i] = (float) rand() / RAND_MAX;
+ }
+}
+
+// CUDA computation on each node
+// No MPI here, only CUDA
+void computeGPU(float *hostData, int blockSize, int gridSize) {
+ int dataSize = blockSize * gridSize;
+
+ // Allocate data on GPU memory
+ float *deviceInputData = NULL;
+ CUDA_CHECK(cudaMalloc((void **) &deviceInputData, dataSize * sizeof(float)));
+
+ float *deviceOutputData = NULL;
+ CUDA_CHECK(cudaMalloc((void **) &deviceOutputData, dataSize * sizeof(float)));
+
+ // Copy to GPU memory
+ CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice));
+
+ // Run kernel
+    simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);
+
+ // Copy data back to CPU memory
+ CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float), cudaMemcpyDeviceToHost));
+
+ // Free GPU memory
+ CUDA_CHECK(cudaFree(deviceInputData));
+ CUDA_CHECK(cudaFree(deviceOutputData));
+}
+
+float sum(float *data, int size) {
+ float accum = 0.f;
+
+ for (int i = 0; i < size; i++) {
+ accum += data[i];
+ }
+
+ return accum;
+}
\ No newline at end of file
diff --git a/simpleMPI.h b/simpleMPI.h
new file mode 100644
index 0000000..5a9586c
--- /dev/null
+++ b/simpleMPI.h
@@ -0,0 +1,32 @@
+//
+// Created by kwoodle on 5/9/20.
+//
+
+/*
+ * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/* Simple example demonstrating how to use MPI with CUDA
+*
+* Generate some random numbers on one node.
+* Dispatch them to all nodes.
+* Compute their square root on each node's GPU.
+* Compute the average of the results using MPI.
+*
+* simpleMPI.h: common header file
+*/
+
+// Forward declarations
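+// extern "C" keeps these symbols unmangled so the mpicxx-compiled host code and the
+// nvcc-compiled CUDA code agree on the linked names.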
+extern "C" {
+void initData(float *data, int dataSize);
+void computeGPU(float *hostData, int blockSize, int gridSize);
+float sum(float *data, int size);
+void my_abort(int err);
+}