diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..73f69e0
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
diff --git a/.idea/CudaSamples.iml b/.idea/CudaSamples.iml
new file mode 100644
index 0000000..f08604b
--- /dev/null
+++ b/.idea/CudaSamples.iml
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..8822db8
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..5217dda
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/runConfigurations/CudaSamples.xml b/.idea/runConfigurations/CudaSamples.xml
new file mode 100644
index 0000000..04419df
--- /dev/null
+++ b/.idea/runConfigurations/CudaSamples.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/runConfigurations/asyncAPI.xml b/.idea/runConfigurations/asyncAPI.xml
new file mode 100644
index 0000000..db32816
--- /dev/null
+++ b/.idea/runConfigurations/asyncAPI.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/runConfigurations/cudaTensorCoreGemm.xml b/.idea/runConfigurations/cudaTensorCoreGemm.xml
new file mode 100644
index 0000000..57250fd
--- /dev/null
+++ b/.idea/runConfigurations/cudaTensorCoreGemm.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/runConfigurations/simpleMPI.xml b/.idea/runConfigurations/simpleMPI.xml
new file mode 100644
index 0000000..f1766a5
--- /dev/null
+++ b/.idea/runConfigurations/simpleMPI.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..c97a31a
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Needs CUDAHOSTCXX=/usr/bin/cuda-g++;CUDA_LIB_PATH=/usr/local/cuda/lib64 in
+# CLion Settings | Build, Execution, Deployment | CMake > Environment
+
+cmake_minimum_required(VERSION 3.16)
+project(CudaSamples LANGUAGES CXX CUDA)
+
+set(CMAKE_CUDA_STANDARD 14)
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -arch=sm_75")
+include_directories(/usr/local/cuda/samples/common/inc)
+
+set(CUDA_VERBOSE_BUILD ON)
+set(CUDA_SEPARABLE_COMPILATION ON) # Allow multiple CUDA files compilation
+
+add_executable(CudaSamples main.cu)
+
+set_target_properties(
+ CudaSamples
+ PROPERTIES
+ CUDA_SEPARABLE_COMPILATION ON)
+
+add_executable(asyncAPI asyncAPI.cu)
+
+find_package(MPI REQUIRED)
+include_directories(${MPI_CXX_INCLUDE_DIRS})
+
+add_executable(simpleMPI simpleMPI.cpp simpleMPI.h simpleMPI.cu)
+target_link_libraries(simpleMPI ${MPI_CXX_LIBRARIES})
+
+add_executable(cudaTensorCoreGemm cudaTensorCoreGemm.cu)
+target_link_libraries(cudaTensorCoreGemm)
+
+add_executable(bandwidthTest bandwidthTest.cu)
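The "-arch=sm_75" flag above pins the build to Turing-class GPUs, so the resulting binaries are assumed to run on that hardware or newer. A minimal runtime check of the installed device's compute capability, sketched here as an aside with plain CUDA runtime calls rather than as part of this change:

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        cudaDeviceProp prop{};
        if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
            std::printf("No CUDA device visible\n");
            return 1;
        }
        // sm_75 corresponds to compute capability 7.5 (Turing).
        std::printf("Device 0: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor);
        if (prop.major * 10 + prop.minor < 75)
            std::printf("Warning: this build targets sm_75; kernels may not launch on this GPU.\n");
        return 0;
    }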
diff --git a/asyncAPI.cu b/asyncAPI.cu
new file mode 100644
index 0000000..dec2def
--- /dev/null
+++ b/asyncAPI.cu
@@ -0,0 +1,127 @@
+//
+// Created by kwoodle on 5/10/20.
+//
+
+////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+//
+// Please refer to the NVIDIA end user license agreement (EULA) associated
+// with this source code for terms and conditions that govern your use of
+// this software. Any use, reproduction, disclosure, or distribution of
+// this software and related documentation outside the terms of the EULA
+// is strictly prohibited.
+//
+////////////////////////////////////////////////////////////////////////////
+
+//
+// This sample illustrates the usage of CUDA events for both GPU timing and
+// overlapping CPU and GPU execution. Events are inserted into a stream
+// of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can
+// perform computations while GPU is executing (including DMA memcopies
+// between the host and device). CPU can query CUDA events to determine
+// whether GPU has completed tasks.
+//
+
+// includes, system
+#include <stdio.h>
+
+// includes CUDA Runtime
+#include <cuda_runtime.h>
+
+// includes, project
+#include <helper_cuda.h>
+#include <helper_functions.h> // helper utility functions
+
+__global__ void increment_kernel(int *g_data, int inc_value) {
+ int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ g_data[idx] = g_data[idx] + inc_value;
+}
+
+bool correct_output(int *data, const int n, const int x) {
+ for (int i = 0; i < n; i++)
+ if (data[i] != x) {
+ printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
+ return false;
+ }
+
+ return true;
+}
+
+int main(int argc, char *argv[]) {
+ int devID;
+ cudaDeviceProp deviceProps;
+
+ printf("[%s] - Starting...\n", argv[0]);
+
+ // This will pick the best possible CUDA capable device
+ devID = findCudaDevice(argc, (const char **) argv);
+
+ // get device name
+ checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
+ printf("CUDA device [%s]\n", deviceProps.name);
+
+ int n = 16 * 1024 * 1024;
+ int nbytes = n * sizeof(int);
+ int value = 26;
+
+ // allocate host memory
+ int *a = 0;
+ checkCudaErrors(cudaMallocHost((void **) &a, nbytes));
+ memset(a, 0, nbytes);
+
+ // allocate device memory
+ int *d_a = 0;
+ checkCudaErrors(cudaMalloc((void **) &d_a, nbytes));
+ checkCudaErrors(cudaMemset(d_a, 255, nbytes));
+
+ // set kernel launch configuration
+ dim3 threads = dim3(512, 1);
+ dim3 blocks = dim3(n / threads.x, 1);
+
+ // create cuda event handles
+ cudaEvent_t start, stop;
+ checkCudaErrors(cudaEventCreate(&start));
+ checkCudaErrors(cudaEventCreate(&stop));
+
+ StopWatchInterface *timer = NULL;
+ sdkCreateTimer(&timer);
+ sdkResetTimer(&timer);
+
+ checkCudaErrors(cudaDeviceSynchronize());
+ float gpu_time = 0.0f;
+
+ // asynchronously issue work to the GPU (all to stream 0)
+ sdkStartTimer(&timer);
+ cudaEventRecord(start, 0);
+ cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
+ increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
+ cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
+ cudaEventRecord(stop, 0);
+ sdkStopTimer(&timer);
+
+ // have CPU do some work while waiting for stage 1 to finish
+ unsigned long int counter = 0;
+
+ while (cudaEventQuery(stop) == cudaErrorNotReady) {
+ counter++;
+ }
+
+ checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
+
+ // print the cpu and gpu times
+ printf("time spent executing by the GPU: %.2f\n", gpu_time);
+ printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
+ printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
+
+ // check the output for correctness
+ bool bFinalResults = correct_output(a, n, value);
+
+ // release resources
+ checkCudaErrors(cudaEventDestroy(start));
+ checkCudaErrors(cudaEventDestroy(stop));
+ checkCudaErrors(cudaFreeHost(a));
+ checkCudaErrors(cudaFree(d_a));
+
+ exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
+}
\ No newline at end of file
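asyncAPI.cu above issues every operation to the default stream and overlaps CPU work by polling cudaEventQuery. The same pattern with an explicitly created stream is sketched below; this is an illustrative variant assuming a single device and omitting error checking, not part of the sample itself:

    #include <cstdio>
    #include <cstring>
    #include <cuda_runtime.h>

    __global__ void increment(int *data, int inc) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        data[idx] += inc;
    }

    int main() {
        const int n = 1 << 20;
        const size_t nbytes = n * sizeof(int);

        int *h = nullptr, *d = nullptr;
        cudaMallocHost((void **) &h, nbytes);   // pinned host memory, needed for truly async copies
        cudaMalloc((void **) &d, nbytes);
        memset(h, 0, nbytes);

        cudaStream_t stream;
        cudaStreamCreate(&stream);
        cudaEvent_t done;
        cudaEventCreate(&done);

        // Enqueue copy -> kernel -> copy; the host thread returns immediately.
        cudaMemcpyAsync(d, h, nbytes, cudaMemcpyHostToDevice, stream);
        increment<<<n / 256, 256, 0, stream>>>(d, 26);
        cudaMemcpyAsync(h, d, nbytes, cudaMemcpyDeviceToHost, stream);
        cudaEventRecord(done, stream);

        // Overlap: keep the CPU busy until the GPU signals completion.
        unsigned long spins = 0;
        while (cudaEventQuery(done) == cudaErrorNotReady) ++spins;

        std::printf("h[0] = %d after %lu CPU spins\n", h[0], spins);

        cudaEventDestroy(done);
        cudaStreamDestroy(stream);
        cudaFree(d);
        cudaFreeHost(h);
        return 0;
    }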
diff --git a/bandwidthTest.cu b/bandwidthTest.cu
new file mode 100644
index 0000000..b4d7e08
--- /dev/null
+++ b/bandwidthTest.cu
@@ -0,0 +1,926 @@
+//
+// Created by kwoodle on 5/10/20.
+//
+
+/*
+ * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/*
+ * This is a simple test program to measure the memcopy bandwidth of the GPU.
+ * It can measure device to device copy bandwidth, host to device copy bandwidth
+ * for pageable and pinned memory, and device to host copy bandwidth for pageable
+ * and pinned memory.
+ *
+ * Usage:
+ * ./bandwidthTest [option]...
+ */
+
+// CUDA runtime
+#include <cuda_runtime.h>
+
+// includes
+#include <helper_functions.h> // helper for shared functions common to CUDA Samples
+#include <helper_cuda.h>      // helper functions for CUDA error checking and initialization
+
+#include <cuda.h>
+
+#include <memory>
+#include <iostream>
+#include <cassert>
+
+static const char *sSDKsample = "CUDA Bandwidth Test";
+
+// defines, project
+#define MEMCOPY_ITERATIONS 100
+#define DEFAULT_SIZE ( 32 * (1e6) ) //32 M
+#define DEFAULT_INCREMENT ( 4 * (1e6) ) //4 M
+#define CACHE_CLEAR_SIZE ( 16 * (1e6) ) //16 M
+
+//shmoo mode defines
+#define SHMOO_MEMSIZE_MAX (64 * (1e6)) //64 M
+#define SHMOO_MEMSIZE_START (1e3) //1 KB
+#define SHMOO_INCREMENT_1KB (1e3) //1 KB
+#define SHMOO_INCREMENT_2KB (2 * 1e3) //2 KB
+#define SHMOO_INCREMENT_10KB (10 * (1e3)) //10KB
+#define SHMOO_INCREMENT_100KB (100 * (1e3)) //100 KB
+#define SHMOO_INCREMENT_1MB (1e6) //1 MB
+#define SHMOO_INCREMENT_2MB (2 * 1e6) //2 MB
+#define SHMOO_INCREMENT_4MB (4 * 1e6) //4 MB
+#define SHMOO_LIMIT_20KB (20 * (1e3)) //20 KB
+#define SHMOO_LIMIT_50KB (50 * (1e3)) //50 KB
+#define SHMOO_LIMIT_100KB (100 * (1e3)) //100 KB
+#define SHMOO_LIMIT_1MB (1e6) //1 MB
+#define SHMOO_LIMIT_16MB (16 * 1e6) //16 MB
+#define SHMOO_LIMIT_32MB (32 * 1e6) //32 MB
+
+//enums, project
+enum testMode {
+ QUICK_MODE, RANGE_MODE, SHMOO_MODE
+};
+enum memcpyKind {
+ DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE
+};
+enum printMode {
+ USER_READABLE, CSV
+};
+enum memoryMode {
+ PINNED, PAGEABLE
+};
+
+const char *sMemoryCopyKind[] =
+ {
+ "Device to Host",
+ "Host to Device",
+ "Device to Device",
+ NULL
+ };
+
+const char *sMemoryMode[] =
+ {
+ "PINNED",
+ "PAGEABLE",
+ NULL
+ };
+
+// if true, use CPU based timing for everything
+static bool bDontUseGPUTiming;
+
+int *pArgc = NULL;
+char **pArgv = NULL;
+
+////////////////////////////////////////////////////////////////////////////////
+// declaration, forward
+int runTest(const int argc, const char **argv);
+
+void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
+ testMode mode, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice,
+ int endDevice, bool wc);
+
+void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice,
+ int endDevice, bool wc);
+
+void testBandwidthRange(unsigned int start, unsigned int end, unsigned int increment,
+ memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice,
+ bool wc);
+
+void
+testBandwidthShmoo(memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc);
+
+float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc);
+
+float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc);
+
+float testDeviceToDeviceTransfer(unsigned int memSize);
+
+void printResultsReadable(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind,
+ memoryMode memMode, int iNumDevs, bool wc);
+
+void
+printResultsCSV(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode,
+ int iNumDevs, bool wc);
+
+void printHelp(void);
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv) {
+ pArgc = &argc;
+ pArgv = argv;
+
+ // set logfile name and start logs
+ printf("[%s] - Starting...\n", sSDKsample);
+
+ int iRetVal = runTest(argc, (const char **) argv);
+
+ if (iRetVal < 0) {
+ checkCudaErrors(cudaSetDevice(0));
+ }
+
+ // finish
+ printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL");
+
+ printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");
+
+ exit((iRetVal == 0) ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//Parse args, run the appropriate tests
+///////////////////////////////////////////////////////////////////////////////
+int runTest(const int argc, const char **argv) {
+ int start = DEFAULT_SIZE;
+ int end = DEFAULT_SIZE;
+ int startDevice = 0;
+ int endDevice = 0;
+ int increment = DEFAULT_INCREMENT;
+ testMode mode = QUICK_MODE;
+ bool htod = false;
+ bool dtoh = false;
+ bool dtod = false;
+ bool wc = false;
+ char *modeStr;
+ char *device = NULL;
+ printMode printmode = USER_READABLE;
+ char *memModeStr = NULL;
+ memoryMode memMode = PINNED;
+
+ //process command line args
+ if (checkCmdLineFlag(argc, argv, "help")) {
+ printHelp();
+ return 0;
+ }
+
+ if (checkCmdLineFlag(argc, argv, "csv")) {
+ printmode = CSV;
+ }
+
+ if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) {
+ if (strcmp(memModeStr, "pageable") == 0) {
+ memMode = PAGEABLE;
+ } else if (strcmp(memModeStr, "pinned") == 0) {
+ memMode = PINNED;
+ } else {
+ printf("Invalid memory mode - valid modes are pageable or pinned\n");
+ printf("See --help for more information\n");
+ return -1000;
+ }
+ } else {
+ //default - pinned memory
+ memMode = PINNED;
+ }
+
+ if (getCmdLineArgumentString(argc, argv, "device", &device)) {
+ int deviceCount;
+ cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
+
+ if (error_id != cudaSuccess) {
+ printf("cudaGetDeviceCount returned %d\n-> %s\n", (int) error_id, cudaGetErrorString(error_id));
+ exit(EXIT_FAILURE);
+ }
+
+ if (deviceCount == 0) {
+ printf("!!!!!No devices found!!!!!\n");
+ return -2000;
+ }
+
+ if (strcmp(device, "all") == 0) {
+ printf("\n!!!!!Cumulative Bandwidth to be computed from all the devices !!!!!!\n\n");
+ startDevice = 0;
+ endDevice = deviceCount - 1;
+ } else {
+ startDevice = endDevice = atoi(device);
+
+ if (startDevice >= deviceCount || startDevice < 0) {
+ printf("\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", startDevice, 0);
+ startDevice = endDevice = 0;
+ }
+ }
+ }
+
+ printf("Running on...\n\n");
+
+ for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
+ cudaDeviceProp deviceProp;
+ cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);
+
+ if (error_id == cudaSuccess) {
+ printf(" Device %d: %s\n", currentDevice, deviceProp.name);
+
+ if (deviceProp.computeMode == cudaComputeModeProhibited) {
+ fprintf(stderr,
+ "Error: device is running in , no threads can use ::cudaSetDevice().\n");
+ checkCudaErrors(cudaSetDevice(currentDevice));
+
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int) error_id, cudaGetErrorString(error_id));
+ checkCudaErrors(cudaSetDevice(currentDevice));
+
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ if (getCmdLineArgumentString(argc, argv, "mode", &modeStr)) {
+ //figure out the mode
+ if (strcmp(modeStr, "quick") == 0) {
+ printf(" Quick Mode\n\n");
+ mode = QUICK_MODE;
+ } else if (strcmp(modeStr, "shmoo") == 0) {
+ printf(" Shmoo Mode\n\n");
+ mode = SHMOO_MODE;
+ } else if (strcmp(modeStr, "range") == 0) {
+ printf(" Range Mode\n\n");
+ mode = RANGE_MODE;
+ } else {
+ printf("Invalid mode - valid modes are quick, range, or shmoo\n");
+ printf("See --help for more information\n");
+ return -3000;
+ }
+ } else {
+ //default mode - quick
+ printf(" Quick Mode\n\n");
+ mode = QUICK_MODE;
+ }
+
+ if (checkCmdLineFlag(argc, argv, "htod")) {
+ htod = true;
+ }
+
+ if (checkCmdLineFlag(argc, argv, "dtoh")) {
+ dtoh = true;
+ }
+
+ if (checkCmdLineFlag(argc, argv, "dtod")) {
+ dtod = true;
+ }
+
+#if CUDART_VERSION >= 2020
+
+ if (checkCmdLineFlag(argc, argv, "wc")) {
+ wc = true;
+ }
+
+#endif
+
+ if (checkCmdLineFlag(argc, argv, "cputiming")) {
+ bDontUseGPUTiming = true;
+ }
+
+ if (!htod && !dtoh && !dtod) {
+ //default: All
+ htod = true;
+ dtoh = true;
+ dtod = true;
+ }
+
+ if (RANGE_MODE == mode) {
+ if (checkCmdLineFlag(argc, (const char **) argv, "start")) {
+ start = getCmdLineArgumentInt(argc, argv, "start");
+
+ if (start <= 0) {
+ printf("Illegal argument - start must be greater than zero\n");
+ return -4000;
+ }
+ } else {
+ printf("Must specify a starting size in range mode\n");
+ printf("See --help for more information\n");
+ return -5000;
+ }
+
+ if (checkCmdLineFlag(argc, (const char **) argv, "end")) {
+ end = getCmdLineArgumentInt(argc, argv, "end");
+
+ if (end <= 0) {
+ printf("Illegal argument - end must be greater than zero\n");
+ return -6000;
+ }
+
+ if (start > end) {
+ printf("Illegal argument - start is greater than end\n");
+ return -7000;
+ }
+ } else {
+ printf("Must specify an end size in range mode.\n");
+ printf("See --help for more information\n");
+ return -8000;
+ }
+
+
+ if (checkCmdLineFlag(argc, argv, "increment")) {
+ increment = getCmdLineArgumentInt(argc, argv, "increment");
+
+ if (increment <= 0) {
+ printf("Illegal argument - increment must be greater than zero\n");
+ return -9000;
+ }
+ } else {
+ printf("Must specify an increment in user mode\n");
+ printf("See --help for more information\n");
+ return -10000;
+ }
+ }
+
+ if (htod) {
+ testBandwidth((unsigned int) start, (unsigned int) end, (unsigned int) increment,
+ mode, HOST_TO_DEVICE, printmode, memMode, startDevice, endDevice, wc);
+ }
+
+ if (dtoh) {
+ testBandwidth((unsigned int) start, (unsigned int) end, (unsigned int) increment,
+ mode, DEVICE_TO_HOST, printmode, memMode, startDevice, endDevice, wc);
+ }
+
+ if (dtod) {
+ testBandwidth((unsigned int) start, (unsigned int) end, (unsigned int) increment,
+ mode, DEVICE_TO_DEVICE, printmode, memMode, startDevice, endDevice, wc);
+ }
+
+ // Ensure that we reset all CUDA Devices in question
+ for (int nDevice = startDevice; nDevice <= endDevice; nDevice++) {
+ cudaSetDevice(nDevice);
+ }
+
+ return 0;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Run a bandwidth test
+///////////////////////////////////////////////////////////////////////////////
+void
+testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
+ testMode mode, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice,
+ bool wc) {
+ switch (mode) {
+ case QUICK_MODE:
+ testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, endDevice, wc);
+ break;
+
+ case RANGE_MODE:
+ testBandwidthRange(start, end, increment, kind, printmode, memMode, startDevice, endDevice, wc);
+ break;
+
+ case SHMOO_MODE:
+ testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc);
+ break;
+
+ default:
+ break;
+ }
+}
+
+//////////////////////////////////////////////////////////////////////
+// Run a quick mode bandwidth test
+//////////////////////////////////////////////////////////////////////
+void
+testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice,
+ int endDevice, bool wc) {
+ testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, startDevice, endDevice, wc);
+}
+
+///////////////////////////////////////////////////////////////////////
+// Run a range mode bandwidth test
+//////////////////////////////////////////////////////////////////////
+void
+testBandwidthRange(unsigned int start, unsigned int end, unsigned int increment,
+ memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc) {
+ //count the number of copies we're going to run
+ unsigned int count = 1 + ((end - start) / increment);
+
+ unsigned int *memSizes = (unsigned int *) malloc(count * sizeof(unsigned int));
+ double *bandwidths = (double *) malloc(count * sizeof(double));
+
+ // Before calculating the cumulative bandwidth, initialize bandwidths array to NULL
+ for (unsigned int i = 0; i < count; i++) {
+ bandwidths[i] = 0.0;
+ }
+
+ // Use the device asked by the user
+ for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
+ cudaSetDevice(currentDevice);
+
+ //run each of the copies
+ for (unsigned int i = 0; i < count; i++) {
+
+ memSizes[i] = start + i * increment;
+
+ switch (kind) {
+ case DEVICE_TO_HOST:
+ bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc);
+ break;
+
+ case HOST_TO_DEVICE:
+ bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc);
+ break;
+
+ case DEVICE_TO_DEVICE:
+ bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]);
+ break;
+ }
+ }
+ } // Complete the bandwidth computation on all the devices
+
+ //print results
+ if (printmode == CSV) {
+ printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
+ } else {
+ printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
+ }
+
+ //clean up
+ free(memSizes);
+ free(bandwidths);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Intense shmoo mode - covers a large range of values with varying increments
+//////////////////////////////////////////////////////////////////////////////
+void
+testBandwidthShmoo(memcpyKind kind, printMode printmode, memoryMode memMode, int startDevice, int endDevice, bool wc) {
+ //count the number of copies to make
+ unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB)
+ + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB)
+ + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB)
+ + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB)
+ + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB)
+ + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB)
+ + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);
+
+ unsigned int *memSizes = (unsigned int *) malloc(count * sizeof(unsigned int));
+ double *bandwidths = (double *) malloc(count * sizeof(double));
+
+ // Before calculating the cumulative bandwidth, initialize bandwidths array to NULL
+ for (unsigned int i = 0; i < count; i++) {
+ bandwidths[i] = 0.0;
+ }
+
+ // Use the device asked by the user
+ for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
+ cudaSetDevice(currentDevice);
+ //Run the shmoo
+ int iteration = 0;
+ unsigned int memSize = 0;
+
+ while (memSize <= SHMOO_MEMSIZE_MAX) {
+ if (memSize < SHMOO_LIMIT_20KB) {
+ memSize += SHMOO_INCREMENT_1KB;
+ } else if (memSize < SHMOO_LIMIT_50KB) {
+ memSize += SHMOO_INCREMENT_2KB;
+ } else if (memSize < SHMOO_LIMIT_100KB) {
+ memSize += SHMOO_INCREMENT_10KB;
+ } else if (memSize < SHMOO_LIMIT_1MB) {
+ memSize += SHMOO_INCREMENT_100KB;
+ } else if (memSize < SHMOO_LIMIT_16MB) {
+ memSize += SHMOO_INCREMENT_1MB;
+ } else if (memSize < SHMOO_LIMIT_32MB) {
+ memSize += SHMOO_INCREMENT_2MB;
+ } else {
+ memSize += SHMOO_INCREMENT_4MB;
+ }
+
+ memSizes[iteration] = memSize;
+
+ switch (kind) {
+ case DEVICE_TO_HOST:
+ bandwidths[iteration] += testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
+ break;
+
+ case HOST_TO_DEVICE:
+ bandwidths[iteration] += testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
+ break;
+
+ case DEVICE_TO_DEVICE:
+ bandwidths[iteration] += testDeviceToDeviceTransfer(memSizes[iteration]);
+ break;
+ }
+
+ iteration++;
+ printf(".");
+ }
+ } // Complete the bandwidth computation on all the devices
+
+ //print results
+ printf("\n");
+
+ if (CSV == printmode) {
+ printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
+ } else {
+ printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
+ }
+
+ //clean up
+ free(memSizes);
+ free(bandwidths);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// test the bandwidth of a device to host memcopy of a specific size
+///////////////////////////////////////////////////////////////////////////////
+float
+testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc) {
+ StopWatchInterface *timer = NULL;
+ float elapsedTimeInMs = 0.0f;
+ float bandwidthInGBs = 0.0f;
+ unsigned char *h_idata = NULL;
+ unsigned char *h_odata = NULL;
+ cudaEvent_t start, stop;
+
+ sdkCreateTimer(&timer);
+ checkCudaErrors(cudaEventCreate(&start));
+ checkCudaErrors(cudaEventCreate(&stop));
+
+ //allocate host memory
+ if (PINNED == memMode) {
+ //pinned memory mode - use special function to get OS-pinned memory
+#if CUDART_VERSION >= 2020
+ checkCudaErrors(cudaHostAlloc((void **) &h_idata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
+ checkCudaErrors(cudaHostAlloc((void **) &h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
+#else
+ checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize));
+ checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
+#endif
+ } else {
+ //pageable memory mode - use malloc
+ h_idata = (unsigned char *) malloc(memSize);
+ h_odata = (unsigned char *) malloc(memSize);
+
+ if (h_idata == 0 || h_odata == 0) {
+ fprintf(stderr, "Not enough memory avaialable on host to run test!\n");
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ //initialize the memory
+ for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
+ h_idata[i] = (unsigned char) (i & 0xff);
+ }
+
+ // allocate device memory
+ unsigned char *d_idata;
+ checkCudaErrors(cudaMalloc((void **) &d_idata, memSize));
+
+ //initialize the device memory
+ checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize,
+ cudaMemcpyHostToDevice));
+
+ //copy data from GPU to Host
+ sdkStartTimer(&timer);
+ checkCudaErrors(cudaEventRecord(start, 0));
+
+ if (PINNED == memMode) {
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
+ checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize,
+ cudaMemcpyDeviceToHost, 0));
+ }
+ } else {
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
+ checkCudaErrors(cudaMemcpy(h_odata, d_idata, memSize,
+ cudaMemcpyDeviceToHost));
+ }
+ }
+
+ checkCudaErrors(cudaEventRecord(stop, 0));
+
+ // make sure GPU has finished copying
+ checkCudaErrors(cudaDeviceSynchronize());
+ //get the total elapsed time in ms
+ sdkStopTimer(&timer);
+ checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
+
+ if (PINNED != memMode || bDontUseGPUTiming) {
+ elapsedTimeInMs = sdkGetTimerValue(&timer);
+ }
+
+ //calculate bandwidth in GB/s
+ double time_s = elapsedTimeInMs / 1e3;
+ bandwidthInGBs = (memSize * (float) MEMCOPY_ITERATIONS) / (double) 1e9;
+ bandwidthInGBs = bandwidthInGBs / time_s;
+ //clean up memory
+ checkCudaErrors(cudaEventDestroy(stop));
+ checkCudaErrors(cudaEventDestroy(start));
+ sdkDeleteTimer(&timer);
+
+ if (PINNED == memMode) {
+ checkCudaErrors(cudaFreeHost(h_idata));
+ checkCudaErrors(cudaFreeHost(h_odata));
+ } else {
+ free(h_idata);
+ free(h_odata);
+ }
+
+ checkCudaErrors(cudaFree(d_idata));
+
+ return bandwidthInGBs;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//! test the bandwidth of a host to device memcopy of a specific size
+///////////////////////////////////////////////////////////////////////////////
+float
+testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc) {
+ StopWatchInterface *timer = NULL;
+ float elapsedTimeInMs = 0.0f;
+ float bandwidthInGBs = 0.0f;
+ cudaEvent_t start, stop;
+ sdkCreateTimer(&timer);
+ checkCudaErrors(cudaEventCreate(&start));
+ checkCudaErrors(cudaEventCreate(&stop));
+
+ //allocate host memory
+ unsigned char *h_odata = NULL;
+
+ if (PINNED == memMode) {
+#if CUDART_VERSION >= 2020
+ //pinned memory mode - use special function to get OS-pinned memory
+ checkCudaErrors(cudaHostAlloc((void **) &h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
+#else
+ //pinned memory mode - use special function to get OS-pinned memory
+ checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
+#endif
+ } else {
+ //pageable memory mode - use malloc
+ h_odata = (unsigned char *) malloc(memSize);
+
+ if (h_odata == 0) {
+ fprintf(stderr, "Not enough memory available on host to run test!\n");
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ unsigned char *h_cacheClear1 = (unsigned char *) malloc(CACHE_CLEAR_SIZE);
+ unsigned char *h_cacheClear2 = (unsigned char *) malloc(CACHE_CLEAR_SIZE);
+
+ if (h_cacheClear1 == 0 || h_cacheClear2 == 0) {
+ fprintf(stderr, "Not enough memory available on host to run test!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ //initialize the memory
+ for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
+ h_odata[i] = (unsigned char) (i & 0xff);
+ }
+
+ for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++) {
+ h_cacheClear1[i] = (unsigned char) (i & 0xff);
+ h_cacheClear2[i] = (unsigned char) (0xff - (i & 0xff));
+ }
+
+ //allocate device memory
+ unsigned char *d_idata;
+ checkCudaErrors(cudaMalloc((void **) &d_idata, memSize));
+
+ sdkStartTimer(&timer);
+ checkCudaErrors(cudaEventRecord(start, 0));
+
+ //copy host memory to device memory
+ if (PINNED == memMode) {
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
+ checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize,
+ cudaMemcpyHostToDevice, 0));
+ }
+ } else {
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
+ checkCudaErrors(cudaMemcpy(d_idata, h_odata, memSize,
+ cudaMemcpyHostToDevice));
+ }
+ }
+
+ checkCudaErrors(cudaEventRecord(stop, 0));
+ checkCudaErrors(cudaDeviceSynchronize());
+ //total elapsed time in ms
+ sdkStopTimer(&timer);
+ checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
+
+ if (PINNED != memMode || bDontUseGPUTiming) {
+ elapsedTimeInMs = sdkGetTimerValue(&timer);
+ }
+
+ sdkResetTimer(&timer);
+
+ //calculate bandwidth in GB/s
+ double time_s = elapsedTimeInMs / 1e3;
+ bandwidthInGBs = (memSize * (float) MEMCOPY_ITERATIONS) / (double) 1e9;
+ bandwidthInGBs = bandwidthInGBs / time_s;
+ //clean up memory
+ checkCudaErrors(cudaEventDestroy(stop));
+ checkCudaErrors(cudaEventDestroy(start));
+ sdkDeleteTimer(&timer);
+
+ if (PINNED == memMode) {
+ checkCudaErrors(cudaFreeHost(h_odata));
+ } else {
+ free(h_odata);
+ }
+
+ free(h_cacheClear1);
+ free(h_cacheClear2);
+ checkCudaErrors(cudaFree(d_idata));
+
+ return bandwidthInGBs;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//! test the bandwidth of a device to device memcopy of a specific size
+///////////////////////////////////////////////////////////////////////////////
+float
+testDeviceToDeviceTransfer(unsigned int memSize) {
+ StopWatchInterface *timer = NULL;
+ float elapsedTimeInMs = 0.0f;
+ float bandwidthInGBs = 0.0f;
+ cudaEvent_t start, stop;
+
+ sdkCreateTimer(&timer);
+ checkCudaErrors(cudaEventCreate(&start));
+ checkCudaErrors(cudaEventCreate(&stop));
+
+ //allocate host memory
+ unsigned char *h_idata = (unsigned char *) malloc(memSize);
+
+ if (h_idata == 0) {
+ fprintf(stderr, "Not enough memory avaialable on host to run test!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ //initialize the host memory
+ for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
+ h_idata[i] = (unsigned char) (i & 0xff);
+ }
+
+ //allocate device memory
+ unsigned char *d_idata;
+ checkCudaErrors(cudaMalloc((void **) &d_idata, memSize));
+ unsigned char *d_odata;
+ checkCudaErrors(cudaMalloc((void **) &d_odata, memSize));
+
+ //initialize memory
+ checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize,
+ cudaMemcpyHostToDevice));
+
+ //run the memcopy
+ sdkStartTimer(&timer);
+ checkCudaErrors(cudaEventRecord(start, 0));
+
+ for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
+ checkCudaErrors(cudaMemcpy(d_odata, d_idata, memSize,
+ cudaMemcpyDeviceToDevice));
+ }
+
+ checkCudaErrors(cudaEventRecord(stop, 0));
+
+ //Since device to device memory copies are non-blocking,
+ //cudaDeviceSynchronize() is required in order to get
+ //proper timing.
+ checkCudaErrors(cudaDeviceSynchronize());
+
+ //get the total elapsed time in ms
+ sdkStopTimer(&timer);
+ checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));
+
+ if (bDontUseGPUTiming) {
+ elapsedTimeInMs = sdkGetTimerValue(&timer);
+ }
+
+ //calculate bandwidth in GB/s
+ double time_s = elapsedTimeInMs / 1e3;
+ bandwidthInGBs = (2.0f * memSize * (float) MEMCOPY_ITERATIONS) / (double) 1e9;
+ bandwidthInGBs = bandwidthInGBs / time_s;
+
+ //clean up memory
+ sdkDeleteTimer(&timer);
+ free(h_idata);
+ checkCudaErrors(cudaEventDestroy(stop));
+ checkCudaErrors(cudaEventDestroy(start));
+ checkCudaErrors(cudaFree(d_idata));
+ checkCudaErrors(cudaFree(d_odata));
+
+ return bandwidthInGBs;
+}
+
+/////////////////////////////////////////////////////////
+//print results in an easily read format
+////////////////////////////////////////////////////////
+void printResultsReadable(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind,
+ memoryMode memMode, int iNumDevs, bool wc) {
+ printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs);
+ printf(" %s Memory Transfers\n", sMemoryMode[memMode]);
+
+ if (wc) {
+ printf(" Write-Combined Memory Writes are Enabled");
+ }
+
+ printf(" Transfer Size (Bytes)\tBandwidth(GB/s)\n");
+ unsigned int i;
+
+ for (i = 0; i < (count - 1); i++) {
+ printf(" %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
+ }
+
+ printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
+}
+
+///////////////////////////////////////////////////////////////////////////
+//print results in a database format
+///////////////////////////////////////////////////////////////////////////
+void
+printResultsCSV(unsigned int *memSizes, double *bandwidths, unsigned int count, memcpyKind kind, memoryMode memMode,
+ int iNumDevs, bool wc) {
+ std::string sConfig;
+
+ // log config information
+ if (kind == DEVICE_TO_DEVICE) {
+ sConfig += "D2D";
+ } else {
+ if (kind == DEVICE_TO_HOST) {
+ sConfig += "D2H";
+ } else if (kind == HOST_TO_DEVICE) {
+ sConfig += "H2D";
+ }
+
+ if (memMode == PAGEABLE) {
+ sConfig += "-Paged";
+ } else if (memMode == PINNED) {
+ sConfig += "-Pinned";
+
+ if (wc) {
+ sConfig += "-WriteCombined";
+ }
+ }
+ }
+
+ unsigned int i;
+ double dSeconds = 0.0;
+
+ for (i = 0; i < count; i++) {
+ dSeconds = (double) memSizes[i] / (bandwidths[i] * (double) (1 << 20));
+ printf("bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u bytes, NumDevsUsed = %d\n",
+ sConfig.c_str(), bandwidths[i], dSeconds, memSizes[i], iNumDevs);
+ }
+}
+
+///////////////////////////////////////////////////////////////////////////
+//Print help screen
+///////////////////////////////////////////////////////////////////////////
+void printHelp(void) {
+ printf("Usage: bandwidthTest [OPTION]...\n");
+ printf("Test the bandwidth for device to host, host to device, and device to device transfers\n");
+ printf("\n");
+ printf("Example: measure the bandwidth of device to host pinned memory copies in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n");
+ printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 --increment=1024 --dtoh\n");
+
+ printf("\n");
+ printf("Options:\n");
+ printf("--help\tDisplay this help menu\n");
+ printf("--csv\tPrint results as a CSV\n");
+ printf("--device=[deviceno]\tSpecify the device device to be used\n");
+ printf(" all - compute cumulative bandwidth on all the devices\n");
+ printf(" 0,1,2,...,n - Specify any particular device to be used\n");
+ printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n");
+ printf(" pageable - pageable memory\n");
+ printf(" pinned - non-pageable system memory\n");
+ printf("--mode=[MODE]\tSpecify the mode to use\n");
+ printf(" quick - performs a quick measurement\n");
+ printf(" range - measures a user-specified range of values\n");
+ printf(" shmoo - performs an intense shmoo of a large range of values\n");
+
+ printf("--htod\tMeasure host to device transfers\n");
+ printf("--dtoh\tMeasure device to host transfers\n");
+ printf("--dtod\tMeasure device to device transfers\n");
+#if CUDART_VERSION >= 2020
+ printf("--wc\tAllocate pinned memory as write-combined\n");
+#endif
+ printf("--cputiming\tForce CPU-based timing always\n");
+
+ printf("Range mode options\n");
+ printf("--start=[SIZE]\tStarting transfer size in bytes\n");
+ printf("--end=[SIZE]\tEnding transfer size in bytes\n");
+ printf("--increment=[SIZE]\tIncrement size in bytes\n");
+}
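The bandwidth figure reported by each transfer function above is simply total bytes moved divided by elapsed time. A stripped-down sketch of the pinned device-to-host path, with an arbitrarily chosen buffer size and iteration count and error checking omitted for brevity:

    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        const size_t bytes = 32 << 20;   // 32 MiB per copy, arbitrary for this sketch
        const int iters = 100;

        unsigned char *h = nullptr, *d = nullptr;
        cudaHostAlloc((void **) &h, bytes, cudaHostAllocDefault);  // pinned host buffer
        cudaMalloc((void **) &d, bytes);

        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        cudaEventRecord(start, 0);
        for (int i = 0; i < iters; ++i)
            cudaMemcpyAsync(h, d, bytes, cudaMemcpyDeviceToHost, 0);
        cudaEventRecord(stop, 0);
        cudaDeviceSynchronize();          // wait for all queued copies to finish

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);

        // GB/s = (bytes per copy * copies) / 1e9 / elapsed seconds
        double gbps = (double) bytes * iters / 1e9 / (ms / 1e3);
        std::printf("Device-to-host (pinned): %.1f GB/s\n", gbps);

        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        cudaFree(d);
        cudaFreeHost(h);
        return 0;
    }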
diff --git a/cudaTensorCoreGemm.cu b/cudaTensorCoreGemm.cu
new file mode 100644
index 0000000..c583e1b
--- /dev/null
+++ b/cudaTensorCoreGemm.cu
@@ -0,0 +1,586 @@
+//
+// Created by kwoodle on 5/10/20.
+//
+
+//
+// Created by kwoodle on 5/9/20.
+//
+
+/*
+ * Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// CUDA sample demonstrating a GEMM computation using the Warp Matrix Multiply
+// and Accumulate API introduced in CUDA 9.
+
+// In this program, the compute_gemm kernel computes the result of a matrix multiplication
+// and addition: D = alpha * A * B + beta * C. The dimensions of both C and D matrices
+// are M_GLOBAL x N_GLOBAL. The A matrix is M_GLOBAL x K_GLOBAL (row-major), the B matrix
+// is K_GLOBAL x N_GLOBAL (column-major).
+// In that kernel, each CTA computes one 128 x 128 tile of the resulting matrix
+// per iteration. When the tile is computed, the CTA stores it to the global memory
+// and begins a new iteration, selecting a new 128 x 128 tile to compute.
+// Each CTA consists of eight warps. For the 128 x 128 tile, each warp computes eight
+// 16 x 16 subtiles, organized in a 2 x 4 two-dimensional array.
+// Warps compute the 16 x 16 subtiles using nvcuda::wmma::mma_sync operations by
+// moving through the K_GLOBAL dimension of the A and B matrices and accumulating
+// the intermediate result in the local thread state.
+
+// There are a number of simple optimizations used in the algorithm:
+// - The CTA copies the 128 x 128 tile of the C matrix from the global memory to
+// shared memory. After that is done, each warp loads the C matrix fragments from
+// shared memory, thus avoiding a random global memory access.
+// - On each internal iteration, the CTA copies a portion of the A and B matrices from
+// global memory to shared memory. After that, all warps in the CTA reuse the A and B
+// data from shared memory, thus reducing the number of data copies from global memory.
+// - The portions of the A and B matrices are stored in shared memory with an additional
+// padding (skew) to reduce the number of shared memory access bank conflicts.
+// (See a detailed explanation near the SKEW_HALF macro definition.)
+// - When the CTA finishes computing the tiles of the resulting matrix, each warp stores
+// its subtiles to shared memory. The CTA then copies the shared memory contents to
+// global memory, again avoiding redundant random global memory accesses.
+// - Note that the CTA tile size is chosen to maximize the GPU register utilization,
+// but carefully enough to avoid local memory use.
+
+#include <assert.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <mma.h>
+
+// helper functions and utilities to work with CUDA
+#include <helper_cuda.h>
+#include <helper_functions.h>
+
+// Externally configurable parameters.
+
+#ifndef CPU_DEBUG
+// Set this to 1 to verify the correctness of the GPU-computed matrix.
+#define CPU_DEBUG 0
+#endif
+
+#ifndef SHARED_MEMORY_LIMIT_64K
+// Set this to 0 to use more than 64 Kb of shared memory to cache data, to
+// improve the performance of the computations on GPU.
+// Note that you need a GPU that can have more than 64 Kb of shared memory
+// per multiprocessor.
+#define SHARED_MEMORY_LIMIT_64K 1
+#endif
+
+// GPU configuration.
+
+#define WARP_SIZE 32
+
+// MMA matrix tile dimensions.
+
+#define M 16
+#define N 16
+#define K 16
+
+#define WMMA_M 16
+#define WMMA_N 16
+#define WMMA_K 16
+
+// GEMM configuration.
+
+#define M_TILES 256
+#define N_TILES 256
+#define K_TILES 256
+
+#define M_GLOBAL (M * M_TILES)
+#define N_GLOBAL (N * N_TILES)
+#define K_GLOBAL (K * K_TILES)
+
+#define C_LAYOUT wmma::mem_row_major
+
+// Implementation constants.
+
+#define WARPS_PER_BLOCK 8
+#define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK)
+
+#if SHARED_MEMORY_LIMIT_64K
+// With only 64 Kb shared memory available, we can fit two 8-tile chunks of
+// the A and B matrix data, that are 16 * 16 * 8 * 8 * 2 = 32 Kb each
+// (i.e. two 8x8 arrays of tiles of 16x16 half-typed elements per CTA).
+// But we cannot account the 8 Kb total skew overhead, without which the performance
+// would be severely impacted. So we choose to reduce the chunk size in half,
+// i.e. the amount of A and B matrix data we cache in shared memory.
+// Accordingly, this doubles the number of outer iterations across the global K
+// dimension, which only slightly impacts the performance.
+#define CHUNK_K 4
+#else
+#define CHUNK_K 8
+#endif
+
+#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(half))
+#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4))
+#define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES)
+#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP)
+
+#define BLOCK_ROW_WARPS 2
+#define BLOCK_COL_WARPS 4
+
+#define WARP_ROW_TILES 4
+#define WARP_COL_TILES 2
+
+#define BLOCK_ROW_TILES (WARP_ROW_TILES * BLOCK_ROW_WARPS)
+#define BLOCK_COL_TILES (WARP_COL_TILES * BLOCK_COL_WARPS)
+
+#define GLOBAL_MEM_STRIDE N_GLOBAL
+
+#define SHMEM_STRIDE (N * BLOCK_ROW_TILES)
+#define SHMEM_OFFSET (N * WARP_ROW_TILES)
+
+// The macro below is used to shift rows of the A matrix and columns of the B matrix
+// in shared memory to minimize possible bank conflicts.
+// Before performing the nvcuda::wmma::mma_sync operation, the warp must load the matrix
+// data using the nvcuda::wmma::load_matrix_sync operation. Although the memory access pattern
+// is not specified for that function, each lane in the warp can read one or multiple matrix
+// elements from different matrix rows or columns.
+// For shared memory, such access can result in bank conflicts if different rows / columns
+// of the matrix map to the same bank. By shifting each row and column by a few bytes, we
+// make sure that they map to different banks, thus reducing the number of possible bank
+// conflicts.
+// The number of 8 two-byte "half" elements is chosen as the minimum possible shift because
+// we must keep each row and column 128-bit aligned, as required by nvcuda::wmma::load_matrix_sync.
+#define SKEW_HALF 8
+
+#define checkKernelErrors(expr) do { \
+ expr; \
+ \
+ cudaError_t __err = cudaGetLastError(); \
+ if (__err != cudaSuccess) { \
+ printf("Line %d: '%s' failed: %s\n", __LINE__, # expr, cudaGetErrorString(__err)); \
+ abort(); \
+ } \
+} while(0)
+
+using namespace nvcuda;
+
+__host__ void init_host_matrices(half *a, half *b, float *c) {
+ for (int i = 0; i < M_GLOBAL; i++) {
+ for (int j = 0; j < K_GLOBAL; j++) {
+ a[i * K_GLOBAL + j] = (half) (rand() % 3);
+ }
+ }
+
+ for (int i = 0; i < N_GLOBAL; i++) {
+ for (int j = 0; j < K_GLOBAL; j++) {
+ b[i * K_GLOBAL + j] = (half) (rand() % 3);
+ }
+ }
+
+ for (int t = 0; t < M_GLOBAL * N_GLOBAL; t++) {
+ c[t] = (float) (rand() % 3);
+ }
+}
+
+__global__ void compute_gemm(const half *A, const half *B, const float *C, float *D, float alpha, float beta) {
+ extern __shared__ half shmem[][CHUNK_K * K + SKEW_HALF];
+
+ // Warp and lane identification.
+ const unsigned int warpId = threadIdx.x / WARP_SIZE;
+ const unsigned int laneId = threadIdx.x % WARP_SIZE;
+
+ // Offset in shared memory from which the B matrix is stored.
+ const size_t shmem_idx_b_off = BLOCK_COL_TILES * M;
+
+ // This pointer is used to access the C and D matrix tiles this warp computes.
+ float *shmem_warp_tile_ptr =
+ (float *) &shmem[0][0] + (warpId / 2) * SHMEM_STRIDE * K * 2 + (warpId % 2) * SHMEM_OFFSET;
+
+ // This pointer is used to stream the C and D matrices block-wide tile to and from shared memory.
+ float *shmem_warp_stream_ptr = (float *) &shmem[0][0] + warpId * SHMEM_STRIDE * K;
+
+ // Adjust the beta scaler, as it'll be multiplied by alpha at the end of
+ // each tile computation. Technically this is not generally correct (may result
+ // in a loss of precision). Zero still needs to be specially handled though.
+ beta /= alpha;
+
+ // Each CTA slides along the 128 x 128 tiles from the top left corner of the matrix to the
+ // right and down, and selects the next tile to compute. Once there's no such tile,
+ // all warps in this CTA exit.
+ for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) {
+ const unsigned int block_tile_i = ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES);
+ const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES;
+
+ // Stop when there are no more D matrix tiles to compute in this CTA.
+ if (block_tile_i >= M_TILES) {
+ break;
+ }
+
+ // This warp's pointer to the C matrix data to copy memory from to shared memory.
+ const size_t gmem_idx = (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N;
+ const float *src_gmem_warp_stream_ptr = &C[gmem_idx];
+
+ // Stream multiple C tiles to shared memory.
+#pragma unroll
+ for (int i = 0; i < K; i++) {
+ typedef int4 copy_t;
+
+ *((copy_t *) (shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) =
+ *((copy_t *) (src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId);
+ }
+
+ __syncthreads();
+
+ // These fragments will accumulate the result of A and B matrix fragment multiplications
+ // along the K_GLOBAL dimension.
+ wmma::fragment<wmma::accumulator, M, N, K, float> c[WARP_COL_TILES][WARP_ROW_TILES];
+
+ // Load the C matrix tiles into fragments from shared memory.
+#pragma unroll
+ for (int i = 0; i < WARP_COL_TILES; i++) {
+#pragma unroll
+ for (int j = 0; j < WARP_ROW_TILES; j++) {
+ const float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N;
+
+ wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT);
+ }
+ }
+
+ __syncthreads();
+
+ // Scale the C matrix.
+#pragma unroll
+ for (int i = 0; i < WARP_COL_TILES; i++) {
+#pragma unroll
+ for (int j = 0; j < WARP_ROW_TILES; j++) {
+#pragma unroll
+ for (int t = 0; t < c[i][j].num_elements; t++) {
+ c[i][j].x[t] *= beta;
+ }
+ }
+ }
+
+ // Select what warp copies what matrix to shared memory.
+ // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix.
+ const half *warp_ptr = (warpId < 4) ? (&A[block_tile_i * M * K_GLOBAL] + M * K_GLOBAL * (warpId % 4) * 2) :
+ (&B[block_tile_j * N * K_GLOBAL] + N * K_GLOBAL * (warpId % 4) * 2);
+
+ // Go through the global K dimension by a fixed step at a time.
+#pragma unroll
+ for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) {
+ // Copy slices of the A and B matrices to shared memory.
+ // The first half of the warps in the CTA copy the A matrix, the rest copy the B matrix.
+ size_t shmem_idx = warpId < (WARPS_PER_BLOCK / 2) ? (M * (warpId % (WARPS_PER_BLOCK / 2)) * 2) :
+ (N * (warpId % (WARPS_PER_BLOCK / 2)) * 2 + shmem_idx_b_off);
+
+ // First half of the warp copies the first row / column of the matrix,
+ // the second half of the warp copies the next.
+ int4 *lane_ptr = (int4 *) (warp_ptr + tile_k * K + (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL) +
+ (laneId % CHUNK_COPY_LINE_LANES);
+
+ // Shift the second half of the warp to the next row / column in the shared memory.
+ shmem_idx += laneId / CHUNK_COPY_LINE_LANES;
+
+#pragma unroll
+ for (int i = 0; i < ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2; i++) {
+ // Copy 16 bytes at once in each lane.
+ *((int4 *) &shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = *lane_ptr;
+
+ // Advance the global memory pointer and the shared memory index.
+ lane_ptr = (int4 *) ((half *) lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP);
+ shmem_idx += CHUNK_COPY_LINES_PER_WARP;
+ }
+
+ __syncthreads();
+
+ // Compute a grid of C matrix tiles in each warp.
+#pragma unroll
+ for (int k_step = 0; k_step < CHUNK_K; k_step++) {
+ wmma::fragment<wmma::matrix_a, M, N, K, half, wmma::row_major> a[WARP_COL_TILES];
+ wmma::fragment<wmma::matrix_b, M, N, K, half, wmma::col_major> b[WARP_ROW_TILES];
+
+#pragma unroll
+ for (int i = 0; i < WARP_COL_TILES; i++) {
+ size_t shmem_idx_a = (warpId / 2) * M * 2 + (i * M);
+ const half *tile_ptr = &shmem[shmem_idx_a][k_step * K];
+
+ wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_HALF);
+
+#pragma unroll
+ for (int j = 0; j < WARP_ROW_TILES; j++) {
+ if (i == 0) {
+ // Load the B matrix fragment once, because it is going to be reused
+ // against the other A matrix fragments.
+ size_t shmem_idx_b = shmem_idx_b_off + (WARP_ROW_TILES * N) * (warpId % 2) + (j * N);
+ const half *tile_ptr = &shmem[shmem_idx_b][k_step * K];
+
+ wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_HALF);
+ }
+
+ wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]);
+ }
+ }
+ }
+
+ __syncthreads();
+ }
+
+ // Store the D fragments to shared memory.
+#pragma unroll
+ for (int i = 0; i < WARP_COL_TILES; i++) {
+#pragma unroll
+ for (int j = 0; j < WARP_ROW_TILES; j++) {
+#pragma unroll
+ // Uniform, point-wise transformations of ALL fragment elements by ALL threads in the
+ // warp are well-defined even though element indices within fragment storage are not defined.
+ for (int t = 0; t < c[i][j].num_elements; t++)
+ c[i][j].x[t] *= alpha;
+
+ float *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N;
+
+ wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT);
+ }
+ }
+
+ __syncthreads();
+
+ // Now that shared memory contains all the D tiles, stream them to global memory.
+ float *dst_gmem_warp_stream_ptr = &D[gmem_idx];
+
+#pragma unroll
+ for (int i = 0; i < K; i++) {
+ *((int4 *) (dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) =
+ *((int4 *) (shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId);
+ }
+
+ __syncthreads();
+ }
+}
+
+
+// Performs an MxNxK GEMM (C=alpha*A*B + beta*C) assuming:
+// 1) Matrices are packed in memory.
+// 2) M, N and K are multiples of 16.
+// 3) Neither A nor B are transposed.
+// Note: This is a less performant version of the compute_gemm kernel. It is designed for
+// demonstration purposes only to show the CUDA WMMA API use without relying on
+// availability of the shared memory.
+__global__ void
+simple_wmma_gemm(half *a, half *b, float *c, float *d, int m_ld, int n_ld, int k_ld, float alpha, float beta) {
+ // Leading dimensions. Packed with no transpositions.
+ int lda = m_ld;
+ int ldb = k_ld;
+ int ldc = n_ld;
+
+ // Tile using a 2D grid
+ int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
+ int warpN = (blockIdx.y * blockDim.y + threadIdx.y);
+
+ // Declare the fragments
+ wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> a_frag;
+ wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> b_frag;
+ wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> acc_frag;
+ wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag;
+
+ wmma::fill_fragment(acc_frag, 0.0f);
+
+ // Loop over k
+ for (int i = 0; i < k_ld; i += WMMA_K) {
+ int aCol = i;
+ int aRow = warpM * WMMA_M;
+
+ int bCol = i;
+ int bRow = warpN * WMMA_N;
+
+ // Bounds checking
+ if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) {
+ // Load the inputs
+ wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda);
+ wmma::load_matrix_sync(b_frag, b + bCol + bRow * ldb, ldb);
+
+ // Perform the matrix multiplication
+ wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
+
+ }
+ }
+
+ // Load in the current value of c, scale it by beta, and add this our result scaled by alpha
+ int cCol = warpN * WMMA_N;
+ int cRow = warpM * WMMA_M;
+
+ if (cRow < m_ld && cCol < n_ld) {
+ wmma::load_matrix_sync(c_frag, c + cCol + cRow * ldc, ldc, wmma::mem_row_major);
+
+ for (int i = 0; i < c_frag.num_elements; i++) {
+ c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
+ }
+
+ // Store the output
+ wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, wmma::mem_row_major);
+ }
+}
+
+__host__ void matMultiplyOnHost(half *A, half *B, float *C,
+ float alpha, float beta,
+ int numARows, int numAColumns,
+ int numBRows, int numBColumns,
+ int numCRows, int numCColumns) {
+ for (int i = 0; i < numCRows; i++) {
+ for (int j = 0; j < numCColumns; j++) {
+ float temp = 0.0;
+
+ for (int k = 0; k < numAColumns; k++) {
+ temp += (float) A[i * numAColumns + k] * (float) B[j * numBRows + k];
+ }
+
+ C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j];
+ }
+ }
+}
+
+int main(int argc, char **argv) {
+ printf("Initializing...\n");
+
+ int dev = findCudaDevice(argc, (const char **) argv);
+
+ cudaDeviceProp deviceProp;
+ checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
+
+ // Tensor cores require a GPU of Volta (SM7X) architecture or higher.
+ if (deviceProp.major < 7) {
+ printf("cudaTensorCoreGemm requires requires SM 7.0 or higher to use Tensor Cores. Exiting...\n");
+ exit(EXIT_WAIVED);
+ }
+
+ printf("M: %d (%d x %d)\n", M_GLOBAL, M, M_TILES);
+ printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES);
+ printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES);
+
+ half *A_h = NULL;
+ half *B_h = NULL;
+ float *C_h = NULL;
+#if CPU_DEBUG
+ float *result_hD = NULL;
+ float *result_host = NULL;
+#endif
+
+ A_h = (half *) malloc(sizeof(half) * M_GLOBAL * K_GLOBAL);
+ B_h = (half *) malloc(sizeof(half) * K_GLOBAL * N_GLOBAL);
+ C_h = (float *) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
+#if CPU_DEBUG
+ result_hD = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
+ result_host = (float*) malloc(sizeof(float) * M_GLOBAL * N_GLOBAL);
+#endif
+
+ half *A = NULL;
+ half *B = NULL;
+ float *C = NULL;
+ float *D = NULL;
+
+ checkCudaErrors(cudaMalloc((void **) &A, sizeof(half) * M_GLOBAL * K_GLOBAL));
+ checkCudaErrors(cudaMalloc((void **) &B, sizeof(half) * N_GLOBAL * K_GLOBAL));
+ checkCudaErrors(cudaMalloc((void **) &C, sizeof(float) * M_GLOBAL * N_GLOBAL));
+ checkCudaErrors(cudaMalloc((void **) &D, sizeof(float) * M_GLOBAL * N_GLOBAL));
+
+ assert(((unsigned long long) A) % 128 == 0);
+ assert(((unsigned long long) B) % 128 == 0);
+ assert(((unsigned long long) C) % 128 == 0);
+ assert(((unsigned long long) D) % 128 == 0);
+
+ init_host_matrices(A_h, B_h, C_h);
+
+ printf("Preparing data for GPU...\n");
+
+ checkCudaErrors(cudaMemcpy(A, A_h, sizeof(half) * M_GLOBAL * K_GLOBAL, cudaMemcpyHostToDevice));
+ checkCudaErrors(cudaMemcpy(B, B_h, sizeof(half) * N_GLOBAL * K_GLOBAL, cudaMemcpyHostToDevice));
+ checkCudaErrors(cudaMemcpy(C, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL, cudaMemcpyHostToDevice));
+ checkCudaErrors(cudaMemset(D, 0, sizeof(float) * M_GLOBAL * N_GLOBAL));
+
+ enum {
+ // Compute the right amount of shared memory to request.
+ // We need shared memory to hold per-CTA C and D matrix tiles, and to cache per-CTA chunks
+ // of the A and B matrices. Therefore, the right amount to request is the maximum of those
+ // two numbers.
+ SHMEM_SZ = MAX(sizeof(half) * (BLOCK_COL_TILES * M) * (CHUNK_K * K + SKEW_HALF) * 2,
+ M * (BLOCK_ROW_WARPS * WARP_ROW_TILES) * N * (BLOCK_COL_WARPS * WARP_COL_TILES) * sizeof(float))
+ };
+
+ printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL);
+
+ const float alpha = 1.1f;
+ const float beta = 1.2f;
+
+ cudaEvent_t start, stop;
+
+ checkCudaErrors(cudaEventCreate(&start));
+ checkCudaErrors(cudaEventCreate(&stop));
+ checkCudaErrors(cudaEventRecord(start));
+
+ // If enough shared memory available on the GPU use high performant kernel
+ if (deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) {
+ printf("Computing... using high performance kernel compute_gemm \n");
+
+ checkCudaErrors(cudaFuncSetAttribute(compute_gemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ));
+ checkKernelErrors(
+ (compute_gemm<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK, SHMEM_SZ>>>(A, B, C, D, alpha,
+ beta)));
+#if CPU_DEBUG
+ checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float)*M_GLOBAL*N_GLOBAL, cudaMemcpyDeviceToHost));
+#endif
+ } else {
+ dim3 gridDim;
+ dim3 blockDim;
+
+ // blockDim.x must be a multiple of warpSize
+ // 128x4 means we have 16 warps and a block computes a 64x64 output tile
+ blockDim.x = 128;
+ blockDim.y = 4;
+
+ gridDim.x = (M_GLOBAL + (WMMA_M * blockDim.x / 32 - 1)) / (WMMA_M * blockDim.x / 32);
+ gridDim.y = (N_GLOBAL + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y);
+
+ printf("Computing... using simple_wmma_gemm kernel\n");
+ simple_wmma_gemm<<<gridDim, blockDim>>>(A, B, C, D, M_GLOBAL, N_GLOBAL, K_GLOBAL, alpha, beta);
+#if CPU_DEBUG
+ checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(float) * M_GLOBAL * N_GLOBAL, cudaMemcpyDeviceToHost));
+#endif
+ }
+
+ checkCudaErrors(cudaEventRecord(stop));
+ checkCudaErrors(cudaEventSynchronize(stop));
+
+#if CPU_DEBUG
+ printf("Verifying correctness of the computations...\n");
+
+ memcpy(result_host, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL);
+
+ matMultiplyOnHost(A_h, B_h, result_host,
+ alpha, beta,
+ M_GLOBAL, K_GLOBAL,
+ K_GLOBAL, N_GLOBAL,
+ M_GLOBAL, N_GLOBAL);
+
+ for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) {
+ if (fabs(result_hD[i] - result_host[i]) > 0.1f)
+ printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i], result_host[i]);
+ }
+ free(result_hD);
+ free(result_host);
+#endif
+
+ float milliseconds = 0;
+
+ checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop));
+
+ printf("Time: %f ms\n", milliseconds);
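+    // A GEMM of this shape performs 2 * M_GLOBAL * N_GLOBAL * K_GLOBAL floating-point
+    // operations (one multiply and one add per term of each inner product).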
+ printf("TFLOPS: %.2f\n", (((double) M_GLOBAL * N_GLOBAL * K_GLOBAL * 2) / (milliseconds / 1000.)) / 1e12);
+
+ free(A_h);
+ free(B_h);
+ free(C_h);
+ checkCudaErrors(cudaFree((void *) A));
+ checkCudaErrors(cudaFree((void *) B));
+ checkCudaErrors(cudaFree((void *) C));
+ checkCudaErrors(cudaFree((void *) D));
+
+ return 0;
+}
\ No newline at end of file
diff --git a/main.cu b/main.cu
new file mode 100644
index 0000000..bc8f460
--- /dev/null
+++ b/main.cu
@@ -0,0 +1,6 @@
+#include <iostream>
+
+int main() {
+ std::cout << "Hello, World!" << std::endl;
+ return 0;
+}
diff --git a/mpirun.sh b/mpirun.sh
new file mode 100755
index 0000000..50abf06
--- /dev/null
+++ b/mpirun.sh
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
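+# Launch the simpleMPI sample with 4 ranks from the CLion Debug build directory.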
+mpirun -np 4 cmake-build-debug/simpleMPI
diff --git a/run_bandwidthTest.sh b/run_bandwidthTest.sh
new file mode 100755
index 0000000..7f541be
--- /dev/null
+++ b/run_bandwidthTest.sh
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
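+# Shmoo mode sweeps a range of transfer sizes; pinned host memory typically gives higher bandwidth than pageable.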
+cmake-build-release/bandwidthTest --mode=shmoo --memory=pinned
\ No newline at end of file
diff --git a/simpleMPI.cpp b/simpleMPI.cpp
new file mode 100644
index 0000000..82bff92
--- /dev/null
+++ b/simpleMPI.cpp
@@ -0,0 +1,122 @@
+//
+// Created by kwoodle on 5/9/20.
+//
+
+/*
+ * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+
+/* Simple example demonstrating how to use MPI with CUDA
+*
+* Generate some random numbers on one node.
+* Dispatch them to all nodes.
+* Compute their square root on each node's GPU.
+* Compute the average of the results using MPI.
+*
+* simpleMPI.cpp: main program, compiled with mpicxx on linux/Mac platforms
+* on Windows, please download the Microsoft HPC Pack SDK 2008
+*/
+
+// MPI include
+#include <mpi.h>
+
+// System includes
+#include <iostream>
+
+using std::cout;
+using std::cerr;
+using std::endl;
+
+// User include
+#include "simpleMPI.h"
+
+// Error handling macros
+#define MPI_CHECK(call) \
+ if((call) != MPI_SUCCESS) { \
+ cerr << "MPI error calling \""#call"\"\n"; \
+ my_abort(-1); }
+
+
+// Host code
+// No CUDA here, only MPI
+int main(int argc, char *argv[]) {
+ // Dimensions of the dataset
+ int blockSize = 256;
+ int gridSize = 10000;
+ int dataSizePerNode = gridSize * blockSize;
+
+ // Initialize MPI state
+ MPI_CHECK(MPI_Init(&argc, &argv));
+
+ // Get our MPI node number and node count
+ int commSize, commRank;
+ MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &commSize));
+ MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &commRank));
+
+ // Generate some random numbers on the root node (node 0)
+ int dataSizeTotal = dataSizePerNode * commSize;
+ float *dataRoot = NULL;
+
+ if (commRank == 0) // Are we the root node?
+ {
+ cout << "Running on " << commSize << " nodes" << endl;
+ dataRoot = new float[dataSizeTotal];
+ initData(dataRoot, dataSizeTotal);
+ }
+
+ // Allocate a buffer on each node
+ float *dataNode = new float[dataSizePerNode];
+
+ // Dispatch a portion of the input data to each node
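+    // (dataRoot is NULL on non-root ranks; that is valid because MPI_Scatter only reads
+    // the send buffer on the root rank.)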
+ MPI_CHECK(MPI_Scatter(dataRoot,
+ dataSizePerNode,
+ MPI_FLOAT,
+ dataNode,
+ dataSizePerNode,
+ MPI_FLOAT,
+ 0,
+ MPI_COMM_WORLD));
+
+ if (commRank == 0) {
+ // No need for root data any more
+ delete[] dataRoot;
+ }
+
+ // On each node, run computation on GPU
+ computeGPU(dataNode, blockSize, gridSize);
+
+ // Reduction to the root node, computing the sum of output elements
+ float sumNode = sum(dataNode, dataSizePerNode);
+ float sumRoot;
+
+ MPI_CHECK(MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD));
+
+ if (commRank == 0) {
+ float average = sumRoot / dataSizeTotal;
+ cout << "Average of square roots is: " << average << endl;
+ }
+
+ // Cleanup
+ delete[] dataNode;
+ MPI_CHECK(MPI_Finalize());
+
+ if (commRank == 0) {
+ cout << "PASSED\n";
+ }
+
+ return 0;
+}
+
+// Shut down MPI cleanly if something goes wrong
+void my_abort(int err) {
+ cout << "Test FAILED\n";
+ MPI_Abort(MPI_COMM_WORLD, err);
+}
\ No newline at end of file
diff --git a/simpleMPI.cu b/simpleMPI.cu
new file mode 100644
index 0000000..8d53eb2
--- /dev/null
+++ b/simpleMPI.cu
@@ -0,0 +1,91 @@
+//
+// Created by kwoodle on 5/10/20.
+//
+
+/*
+ * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+
+/* Simple example demonstrating how to use MPI with CUDA
+*
+* Generate some random numbers on one node.
+* Dispatch them to all nodes.
+* Compute their square root on each node's GPU.
+* Compute the average of the results using MPI.
+*
+* simpleMPI.cu: GPU part, compiled with nvcc
+*/
+
+#include <iostream>
+
+using std::cerr;
+using std::endl;
+
+#include "simpleMPI.h"
+
+// Error handling macro
+#define CUDA_CHECK(call) \
+ if((call) != cudaSuccess) { \
+ cudaError_t err = cudaGetLastError(); \
+ cerr << "CUDA error calling \""#call"\", code is " << err << endl; \
+ my_abort(err); }
+
+
+// Device code
+// Very simple GPU Kernel that computes square roots of input numbers
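+// No bounds check is needed: computeGPU below launches exactly gridSize * blockSize threads
+// for gridSize * blockSize elements.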
+__global__ void simpleMPIKernel(float *input, float *output) {
+ int tid = blockIdx.x * blockDim.x + threadIdx.x;
+ output[tid] = sqrt(input[tid]);
+}
+
+
+// Initialize an array with random data (between 0 and 1)
+void initData(float *data, int dataSize) {
+ for (int i = 0; i < dataSize; i++) {
+ data[i] = (float) rand() / RAND_MAX;
+ }
+}
+
+// CUDA computation on each node
+// No MPI here, only CUDA
+void computeGPU(float *hostData, int blockSize, int gridSize) {
+ int dataSize = blockSize * gridSize;
+
+ // Allocate data on GPU memory
+ float *deviceInputData = NULL;
+ CUDA_CHECK(cudaMalloc((void **) &deviceInputData, dataSize * sizeof(float)));
+
+ float *deviceOutputData = NULL;
+ CUDA_CHECK(cudaMalloc((void **) &deviceOutputData, dataSize * sizeof(float)));
+
+ // Copy to GPU memory
+ CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice));
+
+ // Run kernel
+    simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);
+
+ // Copy data back to CPU memory
+ CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float), cudaMemcpyDeviceToHost));
+
+ // Free GPU memory
+ CUDA_CHECK(cudaFree(deviceInputData));
+ CUDA_CHECK(cudaFree(deviceOutputData));
+}
+
+float sum(float *data, int size) {
+ float accum = 0.f;
+
+ for (int i = 0; i < size; i++) {
+ accum += data[i];
+ }
+
+ return accum;
+}
\ No newline at end of file
diff --git a/simpleMPI.h b/simpleMPI.h
new file mode 100644
index 0000000..5a9586c
--- /dev/null
+++ b/simpleMPI.h
@@ -0,0 +1,32 @@
+//
+// Created by kwoodle on 5/9/20.
+//
+
+/*
+ * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/* Simple example demonstrating how to use MPI with CUDA
+*
+* Generate some random numbers on one node.
+* Dispatch them to all nodes.
+* Compute their square root on each node's GPU.
+* Compute the average of the results using MPI.
+*
+* simpleMPI.h: common header file
+*/
+
+// Forward declarations
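+// extern "C" keeps these symbols unmangled so the mpicxx-compiled host code and the
+// nvcc-compiled CUDA code agree on the linked names.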
+extern "C" {
+void initData(float *data, int dataSize);
+void computeGPU(float *hostData, int blockSize, int gridSize);
+float sum(float *data, int size);
+void my_abort(int err);
+}