diff --git a/Common/helper_cuda.h b/Common/helper_cuda.h index e5b8e9f39..0ef40206b 100644 --- a/Common/helper_cuda.h +++ b/Common/helper_cuda.h @@ -282,6 +282,42 @@ static const char *_cudaGetErrorEnum(curandStatus_t error) { } #endif +#ifdef NVJPEGAPI +// nvJPEG API errors +static const char *_cudaGetErrorEnum(nvjpegStatus_t error) { + switch (error) { + case NVJPEG_STATUS_SUCCESS: + return "NVJPEG_STATUS_SUCCESS"; + + case NVJPEG_STATUS_NOT_INITIALIZED: + return "NVJPEG_STATUS_NOT_INITIALIZED"; + + case NVJPEG_STATUS_INVALID_PARAMETER: + return "NVJPEG_STATUS_INVALID_PARAMETER"; + + case NVJPEG_STATUS_BAD_JPEG: + return "NVJPEG_STATUS_BAD_JPEG"; + + case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: + return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; + + case NVJPEG_STATUS_ALLOCATOR_FAILURE: + return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; + + case NVJPEG_STATUS_EXECUTION_FAILED: + return "NVJPEG_STATUS_EXECUTION_FAILED"; + + case NVJPEG_STATUS_ARCH_MISMATCH: + return "NVJPEG_STATUS_ARCH_MISMATCH"; + + case NVJPEG_STATUS_INTERNAL_ERROR: + return "NVJPEG_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + #ifdef NV_NPPIDEFS_H // NPP API errors static const char *_cudaGetErrorEnum(NppStatus error) { diff --git a/Common/helper_multiprocess.cpp b/Common/helper_multiprocess.cpp new file mode 100644 index 000000000..9fb955606 --- /dev/null +++ b/Common/helper_multiprocess.cpp @@ -0,0 +1,178 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "helper_multiprocess.h" +#include +#include + +int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info) +{ +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + info->size = sz; + info->shmHandle = CreateFileMapping(INVALID_HANDLE_VALUE, + NULL, + PAGE_READWRITE, + 0, + (DWORD)sz, + name); + if (info->shmHandle == 0) { + return GetLastError(); + } + + info->addr = MapViewOfFile(info->shmHandle, FILE_MAP_ALL_ACCESS, 0, 0, sz); + if (info->addr == NULL) { + return GetLastError(); + } + + return 0; +#else + int status = 0; + + info->size = sz; + + info->shmFd = shm_open(name, O_RDWR | O_CREAT, 0777); + if (info->shmFd < 0) { + return errno; + } + + status = ftruncate(info->shmFd, sz); + if (status != 0) { + return status; + } + + info->addr = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_SHARED, info->shmFd, 0); + if (info->addr == NULL) { + return errno; + } + + return 0; +#endif +} + +int sharedMemoryOpen(const char *name, size_t sz, 
sharedMemoryInfo *info) +{ +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + info->size = sz; + + info->shmHandle = OpenFileMapping(FILE_MAP_ALL_ACCESS, FALSE, name); + if (info->shmHandle == 0) { + return GetLastError(); + } + + info->addr = MapViewOfFile(info->shmHandle, FILE_MAP_ALL_ACCESS, 0, 0, sz); + if (info->addr == NULL) { + return GetLastError(); + } + + return 0; +#else + info->size = sz; + + info->shmFd = shm_open(name, O_RDWR, 0777); + if (info->shmFd < 0) { + return errno; + } + + info->addr = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_SHARED, info->shmFd, 0); + if (info->addr == NULL) { + return errno; + } + + return 0; +#endif +} + +void sharedMemoryClose(sharedMemoryInfo *info) +{ +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + if (info->addr) { + UnmapViewOfFile(info->addr); + } + if (info->shmHandle) { + CloseHandle(info->shmHandle); + } +#else + if (info->addr) { + munmap(info->addr, info->size); + } + if (info->shmFd) { + close(info->shmFd); + } +#endif +} + +int spawnProcess(Process *process, const char *app, char * const *args) +{ +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + STARTUPINFO si = {0}; + BOOL status; + size_t arglen = 0; + size_t argIdx = 0; + std::string arg_string; + memset(process, 0, sizeof(*process)); + + while (*args) { + arg_string.append(*args).append(1, ' '); + args++; + } + + status = CreateProcess(app, LPSTR(arg_string.c_str()), NULL, NULL, FALSE, 0, NULL, NULL, &si, process); + + return status ? 
0 : GetLastError(); +#else + *process = fork(); + if (*process == 0) { + if (0 > execvp(app, args)) { + return errno; + } + } + else if (*process < 0) { + return errno; + } + return 0; +#endif +} + +int waitProcess(Process *process) +{ +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + DWORD exitCode; + WaitForSingleObject(process->hProcess, INFINITE); + GetExitCodeProcess(process->hProcess, &exitCode); + CloseHandle(process->hProcess); + CloseHandle(process->hThread); + return (int)exitCode; +#else + int status = 0; + do { + if (0 > waitpid(*process, &status, 0)) { + return errno; + } + } while (!WIFEXITED(status)); + return WEXITSTATUS(status); +#endif +} diff --git a/Common/helper_multiprocess.h b/Common/helper_multiprocess.h new file mode 100644 index 000000000..9f1d3dbfc --- /dev/null +++ b/Common/helper_multiprocess.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HELPER_MULTIPROCESS_H +#define HELPER_MULTIPROCESS_H + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +#else +#include +#include +#include +#include +#include +#endif + +typedef struct sharedMemoryInfo_st { + void *addr; + size_t size; +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + HANDLE shmHandle; +#else + int shmFd; +#endif +} sharedMemoryInfo; + +int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info); + +int sharedMemoryOpen(const char *name, size_t sz, sharedMemoryInfo *info); + +void sharedMemoryClose(sharedMemoryInfo *info); + + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +typedef PROCESS_INFORMATION Process; +#else +typedef pid_t Process; +#endif + +int spawnProcess(Process *process, const char *app, char * const *args); + +int waitProcess(Process *process); + +#endif // HELPER_MULTIPROCESS_H diff --git a/Samples/UnifiedMemoryPerf/Makefile b/Samples/UnifiedMemoryPerf/Makefile index 628ebd400..ca4f20dad 100644 --- a/Samples/UnifiedMemoryPerf/Makefile +++ b/Samples/UnifiedMemoryPerf/Makefile @@ -234,6 +234,12 @@ ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) SAMPLE_ENABLED := 1 +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - UnifiedMemoryPerf is not supported on QNX - waiving 
sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) @@ -246,7 +252,11 @@ LIBRARIES := ################################################################################ # Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 30 35 37 50 52 60 61 70 72 75 +else SMS ?= 30 35 37 50 52 60 61 70 75 +endif ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/UnifiedMemoryPerf/NsightEclipse.xml b/Samples/UnifiedMemoryPerf/NsightEclipse.xml index 32ab4ef20..b2567d4bc 100644 --- a/Samples/UnifiedMemoryPerf/NsightEclipse.xml +++ b/Samples/UnifiedMemoryPerf/NsightEclipse.xml @@ -52,6 +52,7 @@ sm60 sm61 sm70 + sm72 sm75 diff --git a/Samples/UnifiedMemoryPerf/README.md b/Samples/UnifiedMemoryPerf/README.md index da827faf4..3e4d0343b 100644 --- a/Samples/UnifiedMemoryPerf/README.md +++ b/Samples/UnifiedMemoryPerf/README.md @@ -10,7 +10,7 @@ CUDA Systems Integration, Unified Memory, CUDA Streams and Events, Pinned System ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 
](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cudaMallocManaged, cudaStreamAttachMemAsync, cudaMemcpyAsync, cudaMallocHost, cu ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.vcxproj index 68259eda9..d8cddae02 100644 --- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.vcxproj +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.vcxproj index 1bb100bfc..f129b608d 100644 --- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.vcxproj +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.vcxproj index cfe0e652a..aa83dcb27 100644 --- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.vcxproj +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -105,6 +105,6 @@ - + diff --git a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj index 8aa9ef460..09daab5d9 100644 --- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj +++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -106,6 +106,6 @@ - + diff --git a/Samples/bandwidthTest/Makefile 
b/Samples/bandwidthTest/Makefile new file mode 100644 index 000000000..5bc74dbf3 --- /dev/null +++ b/Samples/bandwidthTest/Makefile @@ -0,0 +1,304 @@ +################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 
+endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= 
aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq 
($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 30 35 37 50 52 60 61 70 72 75 +else +SMS ?= 30 35 37 50 52 60 61 70 75 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM 
:= $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: bandwidthTest + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +bandwidthTest.o:bandwidthTest.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +bandwidthTest: bandwidthTest.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./bandwidthTest + +clean: + rm -f bandwidthTest bandwidthTest.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/bandwidthTest + +clobber: clean diff --git a/Samples/bandwidthTest/NsightEclipse.xml b/Samples/bandwidthTest/NsightEclipse.xml new file mode 100644 index 000000000..5aeaead53 --- /dev/null +++ b/Samples/bandwidthTest/NsightEclipse.xml @@ -0,0 +1,79 @@ + + + + bandwidthTest + + cudaSetDevice + cudaHostAlloc + cudaFree + cudaMallocHost + cudaFreeHost + cudaMemcpy + cudaMemcpyAsync + cudaEventCreate + cudaEventRecord + cudaEventDestroy + cudaDeviceSynchronize + cudaEventElapsedTime + + + whole + + ./ + ../ + ../../common/inc + + + CUDA Streams and Events + Performance Strategies + + + GPGPU + bandwidth + + + + + + true + bandwidthTest.cu + + 1:CUDA Basic Topics + 1:Performance Strategies + + sm30 + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm72 + sm75 + + + x86_64 + linux + + + windows7 + + + x86_64 + macosx + + + arm + + + ppc64le + linux + + + + all + + Bandwidth Test + exe + diff --git a/Samples/bandwidthTest/README.md 
b/Samples/bandwidthTest/README.md new file mode 100644 index 000000000..8348179c4 --- /dev/null +++ b/Samples/bandwidthTest/README.md @@ -0,0 +1,94 @@ +# bandwidthTest - Bandwidth Test + +## Description + +This is a simple test program to measure the memcopy bandwidth of the GPU and memcpy bandwidth across PCI-e. This test application is capable of measuring device to device copy bandwidth, host to device copy bandwidth for pageable and page-locked memory, and device to host copy bandwidth for pageable and page-locked memory. + +## Key Concepts + +CUDA Streams and Events, Performance Strategies + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows, MacOSX + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaSetDevice, cudaHostAlloc, cudaFree, cudaMallocHost, cudaFreeHost, cudaMemcpy, cudaMemcpyAsync, cudaEventCreate, cudaEventRecord, cudaEventDestroy, cudaDeviceSynchronize, cudaEventElapsedTime + +## Prerequisites + +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. 
Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details." + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +### Mac +The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` + +The samples makefiles can take advantage of certain options: + +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` + +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". + ``` + $ make SMS="A B ..." + ``` + +* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. + ``` + $ make HOST_COMPILER=clang + ``` + +## References (for more details) + diff --git a/Samples/bandwidthTest/bandwidthTest.cu b/Samples/bandwidthTest/bandwidthTest.cu new file mode 100644 index 000000000..dbb8582ee --- /dev/null +++ b/Samples/bandwidthTest/bandwidthTest.cu @@ -0,0 +1,969 @@ +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This is a simple test program to measure the memcopy bandwidth of the GPU. + * It can measure device to device copy bandwidth, host to device copy bandwidth + * for pageable and pinned memory, and device to host copy bandwidth for + * pageable and pinned memory. + * + * Usage: + * ./bandwidthTest [option]... 
+ */ + +// CUDA runtime +#include + +// includes +#include // helper functions for CUDA error checking and initialization +#include // helper for shared functions common to CUDA Samples + +#include + +#include +#include +#include + +static const char *sSDKsample = "CUDA Bandwidth Test"; + +// defines, project +#define MEMCOPY_ITERATIONS 100 +#define DEFAULT_SIZE (32 * (1e6)) // 32 M +#define DEFAULT_INCREMENT (4 * (1e6)) // 4 M +#define CACHE_CLEAR_SIZE (16 * (1e6)) // 16 M + +// shmoo mode defines +#define SHMOO_MEMSIZE_MAX (64 * (1e6)) // 64 M +#define SHMOO_MEMSIZE_START (1e3) // 1 KB +#define SHMOO_INCREMENT_1KB (1e3) // 1 KB +#define SHMOO_INCREMENT_2KB (2 * 1e3) // 2 KB +#define SHMOO_INCREMENT_10KB (10 * (1e3)) // 10KB +#define SHMOO_INCREMENT_100KB (100 * (1e3)) // 100 KB +#define SHMOO_INCREMENT_1MB (1e6) // 1 MB +#define SHMOO_INCREMENT_2MB (2 * 1e6) // 2 MB +#define SHMOO_INCREMENT_4MB (4 * 1e6) // 4 MB +#define SHMOO_LIMIT_20KB (20 * (1e3)) // 20 KB +#define SHMOO_LIMIT_50KB (50 * (1e3)) // 50 KB +#define SHMOO_LIMIT_100KB (100 * (1e3)) // 100 KB +#define SHMOO_LIMIT_1MB (1e6) // 1 MB +#define SHMOO_LIMIT_16MB (16 * 1e6) // 16 MB +#define SHMOO_LIMIT_32MB (32 * 1e6) // 32 MB + +// CPU cache flush +#define FLUSH_SIZE (256 * 1024 * 1024) +char *flush_buf; + +// enums, project +enum testMode { QUICK_MODE, RANGE_MODE, SHMOO_MODE }; +enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE }; +enum printMode { USER_READABLE, CSV }; +enum memoryMode { PINNED, PAGEABLE }; + +const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device", + "Device to Device", NULL}; + +const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL}; + +// if true, use CPU based timing for everything +static bool bDontUseGPUTiming; + +int *pArgc = NULL; +char **pArgv = NULL; + +//////////////////////////////////////////////////////////////////////////////// +// declaration, forward +int runTest(const int argc, const char **argv); +void testBandwidth(unsigned int start, 
unsigned int end, unsigned int increment, + testMode mode, memcpyKind kind, printMode printmode, + memoryMode memMode, int startDevice, int endDevice, bool wc); +void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, + memoryMode memMode, int startDevice, int endDevice, + bool wc); +void testBandwidthRange(unsigned int start, unsigned int end, + unsigned int increment, memcpyKind kind, + printMode printmode, memoryMode memMode, + int startDevice, int endDevice, bool wc); +void testBandwidthShmoo(memcpyKind kind, printMode printmode, + memoryMode memMode, int startDevice, int endDevice, + bool wc); +float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, + bool wc); +float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, + bool wc); +float testDeviceToDeviceTransfer(unsigned int memSize); +void printResultsReadable(unsigned int *memSizes, double *bandwidths, + unsigned int count, memcpyKind kind, + memoryMode memMode, int iNumDevs, bool wc); +void printResultsCSV(unsigned int *memSizes, double *bandwidths, + unsigned int count, memcpyKind kind, memoryMode memMode, + int iNumDevs, bool wc); +void printHelp(void); + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) { + pArgc = &argc; + pArgv = argv; + + flush_buf = (char *)malloc(FLUSH_SIZE); + + // set logfile name and start logs + printf("[%s] - Starting...\n", sSDKsample); + + int iRetVal = runTest(argc, (const char **)argv); + + if (iRetVal < 0) { + checkCudaErrors(cudaSetDevice(0)); + } + + // finish + printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL"); + + printf( + "\nNOTE: The CUDA Samples are not meant for performance measurements. " + "Results may vary when GPU Boost is enabled.\n"); + + free(flush_buf); + + exit((iRetVal == 0) ? 
EXIT_SUCCESS : EXIT_FAILURE); +} + +/////////////////////////////////////////////////////////////////////////////// +// Parse args, run the appropriate tests +/////////////////////////////////////////////////////////////////////////////// +int runTest(const int argc, const char **argv) { + int start = DEFAULT_SIZE; + int end = DEFAULT_SIZE; + int startDevice = 0; + int endDevice = 0; + int increment = DEFAULT_INCREMENT; + testMode mode = QUICK_MODE; + bool htod = false; + bool dtoh = false; + bool dtod = false; + bool wc = false; + char *modeStr; + char *device = NULL; + printMode printmode = USER_READABLE; + char *memModeStr = NULL; + memoryMode memMode = PINNED; + + // process command line args + if (checkCmdLineFlag(argc, argv, "help")) { + printHelp(); + return 0; + } + + if (checkCmdLineFlag(argc, argv, "csv")) { + printmode = CSV; + } + + if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) { + if (strcmp(memModeStr, "pageable") == 0) { + memMode = PAGEABLE; + } else if (strcmp(memModeStr, "pinned") == 0) { + memMode = PINNED; + } else { + printf("Invalid memory mode - valid modes are pageable or pinned\n"); + printf("See --help for more information\n"); + return -1000; + } + } else { + // default - pinned memory + memMode = PINNED; + } + + if (getCmdLineArgumentString(argc, argv, "device", &device)) { + int deviceCount; + cudaError_t error_id = cudaGetDeviceCount(&deviceCount); + + if (error_id != cudaSuccess) { + printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, + cudaGetErrorString(error_id)); + exit(EXIT_FAILURE); + } + + if (deviceCount == 0) { + printf("!!!!!No devices found!!!!!\n"); + return -2000; + } + + if (strcmp(device, "all") == 0) { + printf( + "\n!!!!!Cumulative Bandwidth to be computed from all the devices " + "!!!!!!\n\n"); + startDevice = 0; + endDevice = deviceCount - 1; + } else { + startDevice = endDevice = atoi(device); + + if (startDevice >= deviceCount || startDevice < 0) { + printf( + "\n!!!!!Invalid GPU 
number %d given hence default gpu %d will be " + "used !!!!!\n", + startDevice, 0); + startDevice = endDevice = 0; + } + } + } + + printf("Running on...\n\n"); + + for (int currentDevice = startDevice; currentDevice <= endDevice; + currentDevice++) { + cudaDeviceProp deviceProp; + cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice); + + if (error_id == cudaSuccess) { + printf(" Device %d: %s\n", currentDevice, deviceProp.name); + + if (deviceProp.computeMode == cudaComputeModeProhibited) { + fprintf(stderr, + "Error: device is running in , no " + "threads can use ::cudaSetDevice().\n"); + checkCudaErrors(cudaSetDevice(currentDevice)); + + exit(EXIT_FAILURE); + } + } else { + printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, + cudaGetErrorString(error_id)); + checkCudaErrors(cudaSetDevice(currentDevice)); + + exit(EXIT_FAILURE); + } + } + + if (getCmdLineArgumentString(argc, argv, "mode", &modeStr)) { + // figure out the mode + if (strcmp(modeStr, "quick") == 0) { + printf(" Quick Mode\n\n"); + mode = QUICK_MODE; + } else if (strcmp(modeStr, "shmoo") == 0) { + printf(" Shmoo Mode\n\n"); + mode = SHMOO_MODE; + } else if (strcmp(modeStr, "range") == 0) { + printf(" Range Mode\n\n"); + mode = RANGE_MODE; + } else { + printf("Invalid mode - valid modes are quick, range, or shmoo\n"); + printf("See --help for more information\n"); + return -3000; + } + } else { + // default mode - quick + printf(" Quick Mode\n\n"); + mode = QUICK_MODE; + } + + if (checkCmdLineFlag(argc, argv, "htod")) { + htod = true; + } + + if (checkCmdLineFlag(argc, argv, "dtoh")) { + dtoh = true; + } + + if (checkCmdLineFlag(argc, argv, "dtod")) { + dtod = true; + } + +#if CUDART_VERSION >= 2020 + + if (checkCmdLineFlag(argc, argv, "wc")) { + wc = true; + } + +#endif + + if (checkCmdLineFlag(argc, argv, "cputiming")) { + bDontUseGPUTiming = true; + } + + if (!htod && !dtoh && !dtod) { + // default: All + htod = true; + dtoh = true; + dtod = true; + } + + if 
(RANGE_MODE == mode) { + if (checkCmdLineFlag(argc, (const char **)argv, "start")) { + start = getCmdLineArgumentInt(argc, argv, "start"); + + if (start <= 0) { + printf("Illegal argument - start must be greater than zero\n"); + return -4000; + } + } else { + printf("Must specify a starting size in range mode\n"); + printf("See --help for more information\n"); + return -5000; + } + + if (checkCmdLineFlag(argc, (const char **)argv, "end")) { + end = getCmdLineArgumentInt(argc, argv, "end"); + + if (end <= 0) { + printf("Illegal argument - end must be greater than zero\n"); + return -6000; + } + + if (start > end) { + printf("Illegal argument - start is greater than end\n"); + return -7000; + } + } else { + printf("Must specify an end size in range mode.\n"); + printf("See --help for more information\n"); + return -8000; + } + + if (checkCmdLineFlag(argc, argv, "increment")) { + increment = getCmdLineArgumentInt(argc, argv, "increment"); + + if (increment <= 0) { + printf("Illegal argument - increment must be greater than zero\n"); + return -9000; + } + } else { + printf("Must specify an increment in user mode\n"); + printf("See --help for more information\n"); + return -10000; + } + } + + if (htod) { + testBandwidth((unsigned int)start, (unsigned int)end, + (unsigned int)increment, mode, HOST_TO_DEVICE, printmode, + memMode, startDevice, endDevice, wc); + } + + if (dtoh) { + testBandwidth((unsigned int)start, (unsigned int)end, + (unsigned int)increment, mode, DEVICE_TO_HOST, printmode, + memMode, startDevice, endDevice, wc); + } + + if (dtod) { + testBandwidth((unsigned int)start, (unsigned int)end, + (unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode, + memMode, startDevice, endDevice, wc); + } + + // Ensure that we reset all CUDA Devices in question + for (int nDevice = startDevice; nDevice <= endDevice; nDevice++) { + cudaSetDevice(nDevice); + } + + return 0; +} + +/////////////////////////////////////////////////////////////////////////////// +// Run a 
bandwidth test +/////////////////////////////////////////////////////////////////////////////// +void testBandwidth(unsigned int start, unsigned int end, unsigned int increment, + testMode mode, memcpyKind kind, printMode printmode, + memoryMode memMode, int startDevice, int endDevice, + bool wc) { + switch (mode) { + case QUICK_MODE: + testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, + endDevice, wc); + break; + + case RANGE_MODE: + testBandwidthRange(start, end, increment, kind, printmode, memMode, + startDevice, endDevice, wc); + break; + + case SHMOO_MODE: + testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc); + break; + + default: + break; + } +} + +////////////////////////////////////////////////////////////////////// +// Run a quick mode bandwidth test +////////////////////////////////////////////////////////////////////// +void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode, + memoryMode memMode, int startDevice, int endDevice, + bool wc) { + testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, + startDevice, endDevice, wc); +} + +/////////////////////////////////////////////////////////////////////// +// Run a range mode bandwidth test +////////////////////////////////////////////////////////////////////// +void testBandwidthRange(unsigned int start, unsigned int end, + unsigned int increment, memcpyKind kind, + printMode printmode, memoryMode memMode, + int startDevice, int endDevice, bool wc) { + // count the number of copies we're going to run + unsigned int count = 1 + ((end - start) / increment); + + unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); + double *bandwidths = (double *)malloc(count * sizeof(double)); + + // Before calculating the cumulative bandwidth, initialize bandwidths array to + // NULL + for (unsigned int i = 0; i < count; i++) { + bandwidths[i] = 0.0; + } + + // Use the device asked by the user + for (int 
currentDevice = startDevice; currentDevice <= endDevice; + currentDevice++) { + cudaSetDevice(currentDevice); + + // run each of the copies + for (unsigned int i = 0; i < count; i++) { + memSizes[i] = start + i * increment; + + switch (kind) { + case DEVICE_TO_HOST: + bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc); + break; + + case HOST_TO_DEVICE: + bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc); + break; + + case DEVICE_TO_DEVICE: + bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]); + break; + } + } + } // Complete the bandwidth computation on all the devices + + // print results + if (printmode == CSV) { + printResultsCSV(memSizes, bandwidths, count, kind, memMode, + (1 + endDevice - startDevice), wc); + } else { + printResultsReadable(memSizes, bandwidths, count, kind, memMode, + (1 + endDevice - startDevice), wc); + } + + // clean up + free(memSizes); + free(bandwidths); +} + +////////////////////////////////////////////////////////////////////////////// +// Intense shmoo mode - covers a large range of values with varying increments +////////////////////////////////////////////////////////////////////////////// +void testBandwidthShmoo(memcpyKind kind, printMode printmode, + memoryMode memMode, int startDevice, int endDevice, + bool wc) { + // count the number of copies to make + unsigned int count = + 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) + + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) + + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) + + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) + + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) + + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) + + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB); + + unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int)); + double *bandwidths = (double *)malloc(count * sizeof(double)); + + // Before calculating 
the cumulative bandwidth, initialize bandwidths array to + // NULL + for (unsigned int i = 0; i < count; i++) { + bandwidths[i] = 0.0; + } + + // Use the device asked by the user + for (int currentDevice = startDevice; currentDevice <= endDevice; + currentDevice++) { + cudaSetDevice(currentDevice); + // Run the shmoo + int iteration = 0; + unsigned int memSize = 0; + + while (memSize <= SHMOO_MEMSIZE_MAX) { + if (memSize < SHMOO_LIMIT_20KB) { + memSize += SHMOO_INCREMENT_1KB; + } else if (memSize < SHMOO_LIMIT_50KB) { + memSize += SHMOO_INCREMENT_2KB; + } else if (memSize < SHMOO_LIMIT_100KB) { + memSize += SHMOO_INCREMENT_10KB; + } else if (memSize < SHMOO_LIMIT_1MB) { + memSize += SHMOO_INCREMENT_100KB; + } else if (memSize < SHMOO_LIMIT_16MB) { + memSize += SHMOO_INCREMENT_1MB; + } else if (memSize < SHMOO_LIMIT_32MB) { + memSize += SHMOO_INCREMENT_2MB; + } else { + memSize += SHMOO_INCREMENT_4MB; + } + + memSizes[iteration] = memSize; + + switch (kind) { + case DEVICE_TO_HOST: + bandwidths[iteration] += + testDeviceToHostTransfer(memSizes[iteration], memMode, wc); + break; + + case HOST_TO_DEVICE: + bandwidths[iteration] += + testHostToDeviceTransfer(memSizes[iteration], memMode, wc); + break; + + case DEVICE_TO_DEVICE: + bandwidths[iteration] += + testDeviceToDeviceTransfer(memSizes[iteration]); + break; + } + + iteration++; + printf("."); + fflush(0); + } + } // Complete the bandwidth computation on all the devices + + // print results + printf("\n"); + + if (CSV == printmode) { + printResultsCSV(memSizes, bandwidths, count, kind, memMode, + (1 + endDevice - startDevice), wc); + } else { + printResultsReadable(memSizes, bandwidths, count, kind, memMode, + (1 + endDevice - startDevice), wc); + } + + // clean up + free(memSizes); + free(bandwidths); +} + +/////////////////////////////////////////////////////////////////////////////// +// test the bandwidth of a device to host memcopy of a specific size 
+/////////////////////////////////////////////////////////////////////////////// +float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, + bool wc) { + StopWatchInterface *timer = NULL; + float elapsedTimeInMs = 0.0f; + float bandwidthInGBs = 0.0f; + unsigned char *h_idata = NULL; + unsigned char *h_odata = NULL; + cudaEvent_t start, stop; + + sdkCreateTimer(&timer); + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + + // allocate host memory + if (PINNED == memMode) { + // pinned memory mode - use special function to get OS-pinned memory +#if CUDART_VERSION >= 2020 + checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize, + (wc) ? cudaHostAllocWriteCombined : 0)); + checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, + (wc) ? cudaHostAllocWriteCombined : 0)); +#else + checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize)); + checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); +#endif + } else { + // pageable memory mode - use malloc + h_idata = (unsigned char *)malloc(memSize); + h_odata = (unsigned char *)malloc(memSize); + + if (h_idata == 0 || h_odata == 0) { + fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); + exit(EXIT_FAILURE); + } + } + + // initialize the memory + for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { + h_idata[i] = (unsigned char)(i & 0xff); + } + + // allocate device memory + unsigned char *d_idata; + checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); + + // initialize the device memory + checkCudaErrors( + cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); + + // copy data from GPU to Host + if (PINNED == memMode) { + if (bDontUseGPUTiming) sdkStartTimer(&timer); + checkCudaErrors(cudaEventRecord(start, 0)); + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { + checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize, + cudaMemcpyDeviceToHost, 0)); + } + checkCudaErrors(cudaEventRecord(stop, 0)); + 
checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); + if (bDontUseGPUTiming) { + sdkStopTimer(&timer); + elapsedTimeInMs = sdkGetTimerValue(&timer); + sdkResetTimer(&timer); + } + } else { + elapsedTimeInMs = 0; + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { + sdkStartTimer(&timer); + checkCudaErrors( + cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost)); + sdkStopTimer(&timer); + elapsedTimeInMs += sdkGetTimerValue(&timer); + sdkResetTimer(&timer); + memset(flush_buf, i, FLUSH_SIZE); + } + } + + // calculate bandwidth in GB/s + double time_s = elapsedTimeInMs / 1e3; + bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; + bandwidthInGBs = bandwidthInGBs / time_s; + // clean up memory + checkCudaErrors(cudaEventDestroy(stop)); + checkCudaErrors(cudaEventDestroy(start)); + sdkDeleteTimer(&timer); + + if (PINNED == memMode) { + checkCudaErrors(cudaFreeHost(h_idata)); + checkCudaErrors(cudaFreeHost(h_odata)); + } else { + free(h_idata); + free(h_odata); + } + + checkCudaErrors(cudaFree(d_idata)); + + return bandwidthInGBs; +} + +/////////////////////////////////////////////////////////////////////////////// +//! test the bandwidth of a host to device memcopy of a specific size +/////////////////////////////////////////////////////////////////////////////// +float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, + bool wc) { + StopWatchInterface *timer = NULL; + float elapsedTimeInMs = 0.0f; + float bandwidthInGBs = 0.0f; + cudaEvent_t start, stop; + sdkCreateTimer(&timer); + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + + // allocate host memory + unsigned char *h_odata = NULL; + + if (PINNED == memMode) { +#if CUDART_VERSION >= 2020 + // pinned memory mode - use special function to get OS-pinned memory + checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, + (wc) ? 
cudaHostAllocWriteCombined : 0)); +#else + // pinned memory mode - use special function to get OS-pinned memory + checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize)); +#endif + } else { + // pageable memory mode - use malloc + h_odata = (unsigned char *)malloc(memSize); + + if (h_odata == 0) { + fprintf(stderr, "Not enough memory available on host to run test!\n"); + exit(EXIT_FAILURE); + } + } + + unsigned char *h_cacheClear1 = (unsigned char *)malloc(CACHE_CLEAR_SIZE); + unsigned char *h_cacheClear2 = (unsigned char *)malloc(CACHE_CLEAR_SIZE); + + if (h_cacheClear1 == 0 || h_cacheClear2 == 0) { + fprintf(stderr, "Not enough memory available on host to run test!\n"); + exit(EXIT_FAILURE); + } + + // initialize the memory + for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { + h_odata[i] = (unsigned char)(i & 0xff); + } + + for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++) { + h_cacheClear1[i] = (unsigned char)(i & 0xff); + h_cacheClear2[i] = (unsigned char)(0xff - (i & 0xff)); + } + + // allocate device memory + unsigned char *d_idata; + checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); + + // copy host memory to device memory + if (PINNED == memMode) { + if (bDontUseGPUTiming) sdkStartTimer(&timer); + checkCudaErrors(cudaEventRecord(start, 0)); + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { + checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize, + cudaMemcpyHostToDevice, 0)); + } + checkCudaErrors(cudaEventRecord(stop, 0)); + checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); + if (bDontUseGPUTiming) { + sdkStopTimer(&timer); + elapsedTimeInMs = sdkGetTimerValue(&timer); + sdkResetTimer(&timer); + } + } else { + elapsedTimeInMs = 0; + for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { + sdkStartTimer(&timer); + checkCudaErrors( + cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice)); + sdkStopTimer(&timer); + 
elapsedTimeInMs += sdkGetTimerValue(&timer); + sdkResetTimer(&timer); + memset(flush_buf, i, FLUSH_SIZE); + } + } + + // calculate bandwidth in GB/s + double time_s = elapsedTimeInMs / 1e3; + bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; + bandwidthInGBs = bandwidthInGBs / time_s; + // clean up memory + checkCudaErrors(cudaEventDestroy(stop)); + checkCudaErrors(cudaEventDestroy(start)); + sdkDeleteTimer(&timer); + + if (PINNED == memMode) { + checkCudaErrors(cudaFreeHost(h_odata)); + } else { + free(h_odata); + } + + free(h_cacheClear1); + free(h_cacheClear2); + checkCudaErrors(cudaFree(d_idata)); + + return bandwidthInGBs; +} + +/////////////////////////////////////////////////////////////////////////////// +//! test the bandwidth of a device to device memcopy of a specific size +/////////////////////////////////////////////////////////////////////////////// +float testDeviceToDeviceTransfer(unsigned int memSize) { + StopWatchInterface *timer = NULL; + float elapsedTimeInMs = 0.0f; + float bandwidthInGBs = 0.0f; + cudaEvent_t start, stop; + + sdkCreateTimer(&timer); + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + + // allocate host memory + unsigned char *h_idata = (unsigned char *)malloc(memSize); + + if (h_idata == 0) { + fprintf(stderr, "Not enough memory avaialable on host to run test!\n"); + exit(EXIT_FAILURE); + } + + // initialize the host memory + for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) { + h_idata[i] = (unsigned char)(i & 0xff); + } + + // allocate device memory + unsigned char *d_idata; + checkCudaErrors(cudaMalloc((void **)&d_idata, memSize)); + unsigned char *d_odata; + checkCudaErrors(cudaMalloc((void **)&d_odata, memSize)); + + // initialize memory + checkCudaErrors( + cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice)); + + // run the memcopy + sdkStartTimer(&timer); + checkCudaErrors(cudaEventRecord(start, 0)); + + for (unsigned int i = 0; i < 
MEMCOPY_ITERATIONS; i++) { + checkCudaErrors( + cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice)); + } + + checkCudaErrors(cudaEventRecord(stop, 0)); + + // Since device to device memory copies are non-blocking, + // cudaDeviceSynchronize() is required in order to get + // proper timing. + checkCudaErrors(cudaDeviceSynchronize()); + + // get the total elapsed time in ms + sdkStopTimer(&timer); + checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop)); + + if (bDontUseGPUTiming) { + elapsedTimeInMs = sdkGetTimerValue(&timer); + } + + // calculate bandwidth in GB/s + double time_s = elapsedTimeInMs / 1e3; + bandwidthInGBs = (2.0f * memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9; + bandwidthInGBs = bandwidthInGBs / time_s; + + // clean up memory + sdkDeleteTimer(&timer); + free(h_idata); + checkCudaErrors(cudaEventDestroy(stop)); + checkCudaErrors(cudaEventDestroy(start)); + checkCudaErrors(cudaFree(d_idata)); + checkCudaErrors(cudaFree(d_odata)); + + return bandwidthInGBs; +} + +///////////////////////////////////////////////////////// +// print results in an easily read format +//////////////////////////////////////////////////////// +void printResultsReadable(unsigned int *memSizes, double *bandwidths, + unsigned int count, memcpyKind kind, + memoryMode memMode, int iNumDevs, bool wc) { + printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs); + printf(" %s Memory Transfers\n", sMemoryMode[memMode]); + + if (wc) { + printf(" Write-Combined Memory Writes are Enabled"); + } + + printf(" Transfer Size (Bytes)\tBandwidth(GB/s)\n"); + unsigned int i; + + for (i = 0; i < (count - 1); i++) { + printf(" %u\t\t\t%s%.1f\n", memSizes[i], + (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]); + } + + printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], + (memSizes[i] < 10000) ? 
"\t" : "", bandwidths[i]); +} + +/////////////////////////////////////////////////////////////////////////// +// print results in a database format +/////////////////////////////////////////////////////////////////////////// +void printResultsCSV(unsigned int *memSizes, double *bandwidths, + unsigned int count, memcpyKind kind, memoryMode memMode, + int iNumDevs, bool wc) { + std::string sConfig; + + // log config information + if (kind == DEVICE_TO_DEVICE) { + sConfig += "D2D"; + } else { + if (kind == DEVICE_TO_HOST) { + sConfig += "D2H"; + } else if (kind == HOST_TO_DEVICE) { + sConfig += "H2D"; + } + + if (memMode == PAGEABLE) { + sConfig += "-Paged"; + } else if (memMode == PINNED) { + sConfig += "-Pinned"; + + if (wc) { + sConfig += "-WriteCombined"; + } + } + } + + unsigned int i; + double dSeconds = 0.0; + + for (i = 0; i < count; i++) { + dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1 << 20)); + printf( + "bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u " + "bytes, NumDevsUsed = %d\n", + sConfig.c_str(), bandwidths[i], dSeconds, memSizes[i], iNumDevs); + } +} + +/////////////////////////////////////////////////////////////////////////// +// Print help screen +/////////////////////////////////////////////////////////////////////////// +void printHelp(void) { + printf("Usage: bandwidthTest [OPTION]...\n"); + printf( + "Test the bandwidth for device to host, host to device, and device to " + "device transfers\n"); + printf("\n"); + printf( + "Example: measure the bandwidth of device to host pinned memory copies " + "in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n"); + printf( + "./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 " + "--increment=1024 --dtoh\n"); + + printf("\n"); + printf("Options:\n"); + printf("--help\tDisplay this help menu\n"); + printf("--csv\tPrint results as a CSV\n"); + printf("--device=[deviceno]\tSpecify the device device to be used\n"); + printf(" all - compute 
cumulative bandwidth on all the devices\n"); + printf(" 0,1,2,...,n - Specify any particular device to be used\n"); + printf("--memory=[MEMMODE]\tSpecify which memory mode to use\n"); + printf(" pageable - pageable memory\n"); + printf(" pinned - non-pageable system memory\n"); + printf("--mode=[MODE]\tSpecify the mode to use\n"); + printf(" quick - performs a quick measurement\n"); + printf(" range - measures a user-specified range of values\n"); + printf(" shmoo - performs an intense shmoo of a large range of values\n"); + + printf("--htod\tMeasure host to device transfers\n"); + printf("--dtoh\tMeasure device to host transfers\n"); + printf("--dtod\tMeasure device to device transfers\n"); +#if CUDART_VERSION >= 2020 + printf("--wc\tAllocate pinned memory as write-combined\n"); +#endif + printf("--cputiming\tForce CPU-based timing always\n"); + + printf("Range mode options\n"); + printf("--start=[SIZE]\tStarting transfer size in bytes\n"); + printf("--end=[SIZE]\tEnding transfer size in bytes\n"); + printf("--increment=[SIZE]\tIncrement size in bytes\n"); +} diff --git a/Samples/bandwidthTest/bandwidthTest_vs2012.sln b/Samples/bandwidthTest/bandwidthTest_vs2012.sln new file mode 100644 index 000000000..12f46461b --- /dev/null +++ b/Samples/bandwidthTest/bandwidthTest_vs2012.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + 
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/bandwidthTest/bandwidthTest_vs2012.vcxproj b/Samples/bandwidthTest/bandwidthTest_vs2012.vcxproj new file mode 100644 index 000000000..f567519c5 --- /dev/null +++ b/Samples/bandwidthTest/bandwidthTest_vs2012.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + bandwidthTest_vs2012 + bandwidthTest + + + + + Application + MultiByte + v110 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/bandwidthTest.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/bandwidthTest/bandwidthTest_vs2013.sln b/Samples/bandwidthTest/bandwidthTest_vs2013.sln new file mode 100644 index 000000000..63178742c --- /dev/null +++ b/Samples/bandwidthTest/bandwidthTest_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", 
"bandwidthTest_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/bandwidthTest/bandwidthTest_vs2013.vcxproj b/Samples/bandwidthTest/bandwidthTest_vs2013.vcxproj new file mode 100644 index 000000000..04c193b5a --- /dev/null +++ b/Samples/bandwidthTest/bandwidthTest_vs2013.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + bandwidthTest_vs2013 + bandwidthTest + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/bandwidthTest.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + 
+ + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/bandwidthTest/bandwidthTest_vs2015.sln b/Samples/bandwidthTest/bandwidthTest_vs2015.sln new file mode 100644 index 000000000..749f041e9 --- /dev/null +++ b/Samples/bandwidthTest/bandwidthTest_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/bandwidthTest/bandwidthTest_vs2015.vcxproj b/Samples/bandwidthTest/bandwidthTest_vs2015.vcxproj new file mode 100644 index 000000000..e4fda9091 --- /dev/null +++ b/Samples/bandwidthTest/bandwidthTest_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + bandwidthTest_vs2015 + bandwidthTest + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + 
cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/bandwidthTest.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/bandwidthTest/bandwidthTest_vs2017.sln b/Samples/bandwidthTest/bandwidthTest_vs2017.sln new file mode 100644 index 000000000..53bbf3deb --- /dev/null +++ b/Samples/bandwidthTest/bandwidthTest_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bandwidthTest", "bandwidthTest_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj b/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj new file mode 100644 index 000000000..f17a5e2c1 --- /dev/null +++ b/Samples/bandwidthTest/bandwidthTest_vs2017.vcxproj 
@@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + bandwidthTest_vs2017 + bandwidthTest + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/bandwidthTest.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/conjugateGradientCudaGraphs/Makefile b/Samples/conjugateGradientCudaGraphs/Makefile index 0130308ec..98e29ee19 100644 --- a/Samples/conjugateGradientCudaGraphs/Makefile +++ b/Samples/conjugateGradientCudaGraphs/Makefile @@ -246,7 +246,11 @@ LIBRARIES := ################################################################################ # Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 30 35 37 50 52 60 61 70 72 75 +else SMS ?= 30 35 37 50 52 60 61 70 75 +endif ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) @@ -264,7 +268,7 @@ GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) endif endif -LIBRARIES += -lcublas_static -lcusparse_static -lculibos +LIBRARIES += -lcublas_static 
-lcublasLt_static -lcusparse_static -lculibos ifeq ($(SAMPLE_ENABLED),0) EXEC ?= @echo "[@]" diff --git a/Samples/conjugateGradientCudaGraphs/NsightEclipse.xml b/Samples/conjugateGradientCudaGraphs/NsightEclipse.xml index 8d06fc36c..8362bbd9c 100644 --- a/Samples/conjugateGradientCudaGraphs/NsightEclipse.xml +++ b/Samples/conjugateGradientCudaGraphs/NsightEclipse.xml @@ -31,6 +31,7 @@ cublas_static + cublasLt_static cusparse_static culibos @@ -55,6 +56,7 @@ sm60 sm61 sm70 + sm72 sm75 diff --git a/Samples/conjugateGradientCudaGraphs/README.md b/Samples/conjugateGradientCudaGraphs/README.md index 5a829f54a..62346a124 100644 --- a/Samples/conjugateGradientCudaGraphs/README.md +++ b/Samples/conjugateGradientCudaGraphs/README.md @@ -10,7 +10,7 @@ Linear Algebra, CUBLAS Library, CUSPARSE Library ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch, ## Prerequisites -Download and install the [CUDA Toolkit 
10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu index b6b83fbac..61e755f5f 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu @@ -323,7 +323,7 @@ int main(int argc, char **argv) { checkCudaErrors(cudaStreamCreate(&streamForGraph)); checkCudaErrors(cublasSetStream(cublasHandle, stream1)); checkCudaErrors(cusparseSetStream(cusparseHandle, stream1)); - checkCudaErrors(cudaStreamBeginCapture(stream1)); + checkCudaErrors(cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal)); r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_r0, d_b); cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE); diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.vcxproj index 4a583fc93..52c7d2deb 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.vcxproj +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.vcxproj index eeb90636a..1659a3ff1 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.vcxproj +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git 
a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.vcxproj index 69312b053..9d6d02069 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.vcxproj +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj index 13de64fc7..099dc0182 100644 --- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj +++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/conjugateGradientMultiBlockCG/Makefile b/Samples/conjugateGradientMultiBlockCG/Makefile index 2092da3bc..7c05f18f8 100644 --- a/Samples/conjugateGradientMultiBlockCG/Makefile +++ b/Samples/conjugateGradientMultiBlockCG/Makefile @@ -264,7 +264,11 @@ LIBRARIES := ################################################################################ # Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 60 61 70 72 75 +else SMS ?= 60 61 70 75 +endif ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/conjugateGradientMultiBlockCG/NsightEclipse.xml b/Samples/conjugateGradientMultiBlockCG/NsightEclipse.xml index 067f3f078..a7b88e1e1 100644 --- a/Samples/conjugateGradientMultiBlockCG/NsightEclipse.xml +++ b/Samples/conjugateGradientMultiBlockCG/NsightEclipse.xml @@ -42,6 +42,7 @@ sm60 sm61 sm70 + sm72 sm75 diff --git a/Samples/conjugateGradientMultiBlockCG/README.md b/Samples/conjugateGradientMultiBlockCG/README.md index 101b692a6..fcfad8eff 100644 --- a/Samples/conjugateGradientMultiBlockCG/README.md +++ 
b/Samples/conjugateGradientMultiBlockCG/README.md @@ -10,7 +10,7 @@ Unified Memory, Linear Algebra, Cooperative Groups, MultiBlock Cooperative Group ## Supported SM Architectures -[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ x86_64, ppc64le ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2012.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2012.vcxproj index fdf29d912..f30303498 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2012.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2013.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2013.vcxproj index 3ff37342b..f1efeff6e 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2013.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2015.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2015.vcxproj index 0a5ad1500..e6f5de81e 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2015.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj index 69e2bf9f0..f6764ec82 100644 --- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj +++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/conjugateGradientMultiDeviceCG/Makefile b/Samples/conjugateGradientMultiDeviceCG/Makefile index cfeb87836..62dc80daa 100644 --- a/Samples/conjugateGradientMultiDeviceCG/Makefile +++ 
b/Samples/conjugateGradientMultiDeviceCG/Makefile @@ -264,7 +264,11 @@ LIBRARIES := ################################################################################ # Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 60 61 70 72 75 +else SMS ?= 60 61 70 75 +endif ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/conjugateGradientMultiDeviceCG/NsightEclipse.xml b/Samples/conjugateGradientMultiDeviceCG/NsightEclipse.xml index b17237fcc..19570b770 100644 --- a/Samples/conjugateGradientMultiDeviceCG/NsightEclipse.xml +++ b/Samples/conjugateGradientMultiDeviceCG/NsightEclipse.xml @@ -49,6 +49,7 @@ sm60 sm61 sm70 + sm72 sm75 diff --git a/Samples/conjugateGradientMultiDeviceCG/README.md b/Samples/conjugateGradientMultiDeviceCG/README.md index cc989f40c..bc5087306 100644 --- a/Samples/conjugateGradientMultiDeviceCG/README.md +++ b/Samples/conjugateGradientMultiDeviceCG/README.md @@ -10,7 +10,7 @@ Unified Memory, Linear Algebra, Cooperative Groups, MultiDevice Cooperative Grou ## Supported SM Architectures -[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cudaMemAdvise, cudaMemPrefetchAsync, cudaLaunchCooperativeKernelMultiDevice, cud ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu index f71233b4a..e823911cd 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu @@ -415,7 +415,7 @@ void getIdenticalGPUs(int num_of_gpus, std::set &identicalGPUs) { identicalGPUs.erase(it); } if (!deviceProp.cooperativeMultiDeviceLaunch || - !deviceProp.concurrentManagedAccess) { + !deviceProp.managedMemory) { identicalGPUs.erase(it); } it++; @@ -450,8 +450,7 @@ int main(int argc, char **argv) { if (identicalGPUs.size() <= 1) { printf( "No Two or more GPUs with same architecture capable of " - "cooperativeMultiDeviceLaunch & concurrentManagedAccess found. " - "\nWaiving the sample\n"); + "cooperativeMultiDeviceLaunch & managedMemory found. 
\nWaiving the sample\n"); exit(EXIT_WAIVED); } diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2012.vcxproj b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2012.vcxproj index 3e2d33774..8b59da3aa 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2012.vcxproj +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2013.vcxproj b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2013.vcxproj index c8afcd5e0..7cf463c25 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2013.vcxproj +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.vcxproj b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.vcxproj index 8c4961b14..37a092c69 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.vcxproj +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj index a1198775f..d08c49928 100644 --- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj +++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/cudaTensorCoreGemm/Makefile b/Samples/cudaTensorCoreGemm/Makefile index b44fe0822..8a0b5024b 100644 --- a/Samples/cudaTensorCoreGemm/Makefile 
+++ b/Samples/cudaTensorCoreGemm/Makefile @@ -246,12 +246,6 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif -# This sample is not supported on aarch64 -ifeq ($(TARGET_ARCH),aarch64) - $(info >>> WARNING - cudaTensorCoreGemm is not supported on aarch64 - waiving sample <<<) - SAMPLE_ENABLED := 0 -endif - ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) @@ -264,7 +258,11 @@ LIBRARIES := ################################################################################ # Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 70 72 75 +else SMS ?= 70 75 +endif ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/cudaTensorCoreGemm/NsightEclipse.xml b/Samples/cudaTensorCoreGemm/NsightEclipse.xml index 73020e63b..b8b24e8cc 100644 --- a/Samples/cudaTensorCoreGemm/NsightEclipse.xml +++ b/Samples/cudaTensorCoreGemm/NsightEclipse.xml @@ -43,12 +43,16 @@ In addition to that, it demonstrates the use of the new CUDA function attribute 1:CUDA Basic Topics sm70 + sm72 sm75 x86_64 linux + + aarch64 + windows7 diff --git a/Samples/cudaTensorCoreGemm/README.md b/Samples/cudaTensorCoreGemm/README.md index 4cc8e332e..83c833f55 100644 --- a/Samples/cudaTensorCoreGemm/README.md +++ b/Samples/cudaTensorCoreGemm/README.md @@ -14,7 +14,7 @@ Matrix Multiply, WMMA, Tensor Cores ## Supported SM Architectures -[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -22,7 +22,7 @@ Linux, Windows ## Supported CPU Architecture -x86_64, ppc64le +x86_64, ppc64le, aarch64 ## CUDA APIs involved @@ -31,7 +31,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, ## Prerequisites -Download and install the 
[CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run @@ -52,9 +52,9 @@ $ cd $ make ``` The samples makefiles can take advantage of certain options: -* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le. +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64. By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
-`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=aarch64`
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. * **dbg=1** - build with debug symbols ``` diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu index d2ce38ecf..8a945d2a9 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu @@ -180,16 +180,16 @@ using namespace nvcuda; -__host__ void init_host_matrices(float *a, float *b, float *c) { +__host__ void init_host_matrices(half *a, half *b, float *c) { for (int i = 0; i < M_GLOBAL; i++) { for (int j = 0; j < K_GLOBAL; j++) { - a[i * K_GLOBAL + j] = static_cast(rand() % 3); + a[i * K_GLOBAL + j] = (half)(rand() % 3); } } for (int i = 0; i < N_GLOBAL; i++) { for (int j = 0; j < K_GLOBAL; j++) { - b[i * K_GLOBAL + j] = static_cast(rand() % 3); + b[i * K_GLOBAL + j] = (half)(rand() % 3); } } @@ -198,26 +198,6 @@ __host__ void init_host_matrices(float *a, float *b, float *c) { } } -__global__ void init_device_matrices(const float *A_h, const float *B_h, - const float *C_h, half *A, half *B, - float *C, float *D) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M_GLOBAL * K_GLOBAL; - i += gridDim.x * blockDim.x) - A[i] = __float2half(A_h[i]); - - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < N_GLOBAL * K_GLOBAL; - i += gridDim.x * blockDim.x) - B[i] = __float2half(B_h[i]); - - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M_GLOBAL * N_GLOBAL; - i += gridDim.x * blockDim.x) - C[i] = C_h[i]; - - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M_GLOBAL * N_GLOBAL; - i += gridDim.x * blockDim.x) - D[i] = 0; -} - __global__ void compute_gemm(const half *A, const half *B, const float *C, float *D, float alpha, float beta) { extern __shared__ half shmem[][CHUNK_K * K + SKEW_HALF]; @@ -486,7 +466,7 @@ __global__ void simple_wmma_gemm(half *a, half *b, float *c, float *d, int m_ld, } } -__host__ void matMultiplyOnHost(float *A, 
float *B, float *C, float alpha, +__host__ void matMultiplyOnHost(half *A, half *B, float *C, float alpha, float beta, int numARows, int numAColumns, int numBRows, int numBColumns, int numCRows, int numCColumns) { @@ -495,7 +475,7 @@ __host__ void matMultiplyOnHost(float *A, float *B, float *C, float alpha, float temp = 0.0; for (int k = 0; k < numAColumns; k++) { - temp += A[i * numAColumns + k] * B[j * numBRows + k]; + temp += (float)A[i * numAColumns + k] * (float)B[j * numBRows + k]; } C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j]; @@ -514,7 +494,7 @@ int main(int argc, char **argv) { // Tensor cores require a GPU of Volta (SM7X) architecture or higher. if (deviceProp.major < 7) { printf( - "cudaTensorCoreGemm requires requires SM 7.0 or higher to use Tensor " + "cudaTensorCoreGemm requires SM 7.0 or higher to use Tensor " "Cores. Exiting...\n"); exit(EXIT_WAIVED); } @@ -523,25 +503,20 @@ int main(int argc, char **argv) { printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES); printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES); - float *A_h = NULL; - float *B_h = NULL; + half *A_h = NULL; + half *B_h = NULL; float *C_h = NULL; #if CPU_DEBUG float *result_hD = NULL; float *result_host = NULL; #endif - checkCudaErrors(cudaMallocManaged(reinterpret_cast(&A_h), - sizeof(float) * M_GLOBAL * K_GLOBAL)); - checkCudaErrors(cudaMallocManaged(reinterpret_cast(&B_h), - sizeof(float) * K_GLOBAL * N_GLOBAL)); - checkCudaErrors(cudaMallocManaged(reinterpret_cast(&C_h), - sizeof(float) * M_GLOBAL * N_GLOBAL)); + A_h = (half *)malloc(sizeof(half) * M_GLOBAL * K_GLOBAL); + B_h = (half *)malloc(sizeof(half) * K_GLOBAL * N_GLOBAL); + C_h = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); #if CPU_DEBUG - checkCudaErrors(cudaMallocManaged((void **)&result_hD, - sizeof(float) * M_GLOBAL * N_GLOBAL)); - checkCudaErrors(cudaMallocManaged((void **)&result_host, - sizeof(float) * M_GLOBAL * N_GLOBAL)); + result_hD = (float *)malloc(sizeof(float) * M_GLOBAL * 
N_GLOBAL); + result_host = (float *)malloc(sizeof(float) * M_GLOBAL * N_GLOBAL); #endif half *A = NULL; @@ -567,11 +542,13 @@ int main(int argc, char **argv) { printf("Preparing data for GPU...\n"); - checkKernelErrors( - (init_device_matrices<<>>(A_h, B_h, C_h, A, B, C, D))); - - checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaMemcpy(A, A_h, sizeof(half) * M_GLOBAL * K_GLOBAL, + cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(B, B_h, sizeof(half) * N_GLOBAL * K_GLOBAL, + cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(C, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL, + cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemset(D, 0, sizeof(float) * M_GLOBAL * N_GLOBAL)); enum { // Compute the right amount of shared memory to request. @@ -650,6 +627,8 @@ int main(int argc, char **argv) { printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i], result_host[i]); } + free(result_hD); + free(result_host); #endif float milliseconds = 0; @@ -662,9 +641,9 @@ int main(int argc, char **argv) { (milliseconds / 1000.)) / 1e12); - checkCudaErrors(cudaFree(reinterpret_cast(A_h))); - checkCudaErrors(cudaFree(reinterpret_cast(B_h))); - checkCudaErrors(cudaFree(reinterpret_cast(C_h))); + free(A_h); + free(B_h); + free(C_h); checkCudaErrors(cudaFree(reinterpret_cast(A))); checkCudaErrors(cudaFree(reinterpret_cast(B))); checkCudaErrors(cudaFree(reinterpret_cast(C))); diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2012.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2012.vcxproj index bf77cf937..29f45dec8 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2012.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2013.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2013.vcxproj index d8afbc904..757336c45 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2013.vcxproj +++ 
b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2015.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2015.vcxproj index b6e3fb600..1c2ffc7e5 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2015.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj index da8345fc7..9a4d3f30c 100644 --- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj +++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/deviceQuery/Makefile b/Samples/deviceQuery/Makefile index 09ef57d6e..7d2ff2c09 100644 --- a/Samples/deviceQuery/Makefile +++ b/Samples/deviceQuery/Makefile @@ -246,7 +246,11 @@ LIBRARIES := ################################################################################ # Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 30 35 37 50 52 60 61 70 72 75 +else SMS ?= 30 35 37 50 52 60 61 70 75 +endif ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/deviceQuery/NsightEclipse.xml b/Samples/deviceQuery/NsightEclipse.xml index e93f08649..04bfe94a9 100644 --- a/Samples/deviceQuery/NsightEclipse.xml +++ b/Samples/deviceQuery/NsightEclipse.xml @@ -39,6 +39,7 @@ sm60 sm61 sm70 + sm72 sm75 diff --git a/Samples/deviceQuery/README.md b/Samples/deviceQuery/README.md index 29182b74b..e0e217df9 100644 --- a/Samples/deviceQuery/README.md +++ b/Samples/deviceQuery/README.md @@ -10,7 +10,7 @@ CUDA Runtime API, Device Query ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 
](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ cudaSetDevice, cudaGetDeviceCount, cudaGetDeviceProperties, cudaDriverGetVersion ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/deviceQuery/deviceQuery_vs2012.vcxproj b/Samples/deviceQuery/deviceQuery_vs2012.vcxproj index 43281e98f..27e817015 100644 --- a/Samples/deviceQuery/deviceQuery_vs2012.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/deviceQuery/deviceQuery_vs2013.vcxproj b/Samples/deviceQuery/deviceQuery_vs2013.vcxproj index 73e0e3ee0..1433399fb 100644 --- a/Samples/deviceQuery/deviceQuery_vs2013.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/deviceQuery/deviceQuery_vs2015.vcxproj b/Samples/deviceQuery/deviceQuery_vs2015.vcxproj index 60fb078b0..45024d450 100644 --- a/Samples/deviceQuery/deviceQuery_vs2015.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/deviceQuery/deviceQuery_vs2017.vcxproj b/Samples/deviceQuery/deviceQuery_vs2017.vcxproj index b6c0f4782..b5d52c6e1 100644 --- a/Samples/deviceQuery/deviceQuery_vs2017.vcxproj +++ b/Samples/deviceQuery/deviceQuery_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/immaTensorCoreGemm/Makefile b/Samples/immaTensorCoreGemm/Makefile new file mode 100644 index 000000000..5236c7f41 --- /dev/null +++ b/Samples/immaTensorCoreGemm/Makefile @@ -0,0 +1,318 @@ +################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 
+endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= 
aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq 
($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - immaTensorCoreGemm is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - immaTensorCoreGemm is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 72 75 +else +SMS ?= 75 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - 
waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ALL_CCFLAGS += -maxrregcount=255 + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: immaTensorCoreGemm + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +immaTensorCoreGemm.o:immaTensorCoreGemm.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +immaTensorCoreGemm: immaTensorCoreGemm.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./immaTensorCoreGemm + +clean: + rm -f immaTensorCoreGemm immaTensorCoreGemm.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/immaTensorCoreGemm + +clobber: clean diff --git a/Samples/immaTensorCoreGemm/NsightEclipse.xml b/Samples/immaTensorCoreGemm/NsightEclipse.xml new file mode 100644 index 000000000..d87551ca0 --- /dev/null +++ b/Samples/immaTensorCoreGemm/NsightEclipse.xml @@ -0,0 +1,64 @@ + + + + immaTensorCoreGemm + + -maxrregcount=255 + + + cudaMallocManaged + cudaDeviceSynchronize + cudaFuncSetAttribute + cudaEventCreate + cudaEventRecord + cudaEventSynchronize + cudaEventElapsedTime + cudaFree + + + whole + + ./ + ../ + ../../common/inc + + + Matrix 
Multiply + WMMA + Tensor Cores + + + + + + + + true + immaTensorCoreGemm.cu + + 1:CUDA Basic Topics + + sm72 + sm75 + + + x86_64 + linux + + + aarch64 + + + windows7 + + + ppc64le + linux + + + + 7.2 + + Tensor Core GEMM Integer MMA + exe + diff --git a/Samples/immaTensorCoreGemm/README.md b/Samples/immaTensorCoreGemm/README.md new file mode 100644 index 000000000..6d52d628b --- /dev/null +++ b/Samples/immaTensorCoreGemm/README.md @@ -0,0 +1,70 @@ +# immaTensorCoreGemm - Tensor Core GEMM Integer MMA + +## Description + +CUDA sample demonstrating a integer GEMM computation using the Warp Matrix Multiply and Accumulate (WMMA) API for integer introduced in CUDA 10. This sample demonstrates the use of the CUDA WMMA API employing the Tensor Cores introduced in the Volta chip family for faster matrix operations. In addition to that, it demonstrates the use of the new CUDA function attribute cudaFuncAttributeMaxDynamicSharedMemorySize that allows the application to reserve an extended amount of shared memory than it is available by default. + +## Key Concepts + +Matrix Multiply, WMMA, Tensor Cores + +## Supported SM Architectures + +[SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows + +## Supported CPU Architecture + +x86_64, ppc64le, aarch64 + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate, cudaEventRecord, cudaEventSynchronize, cudaEventElapsedTime, cudaFree + +## Prerequisites + +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. 
Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details." + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
+`$ make TARGET_ARCH=ppc64le`
+`$ make TARGET_ARCH=aarch64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm.cu b/Samples/immaTensorCoreGemm/immaTensorCoreGemm.cu new file mode 100644 index 000000000..08369aa42 --- /dev/null +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm.cu @@ -0,0 +1,655 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// CUDA sample demonstrating a integer GEMM computation using the Warp Matrix +// Multiply and Accumulate API. + +// In this program, the compute_gemm kernel computes the result of a matrix +// multiplication and addition: D = alpha * A * B + beta * C. The dimensions of +// both C and D matrices are M_GLOBAL x N_GLOBAL. The A matrix is M_GLOBAL x +// K_GLOBAL (row-major), the B matrix is K_GLOBAL x N_GLOBAL (column-major). In +// that kernel, each CTA computes one 128 x 128 tile of the resulting matrix per +// iteration. When the tile is computed, the CTA stores it to the global memory +// and begins a new iteration, selecting a new 128 x 128 tile to compute. +// Each CTA consists of eight warps. For the 128 x 128 tile, each warp computes +// eight 16 x 16 subtiles, organized in a 2 x 4 two-dimensional array. Warps +// compute the 16 x 16 subtiles using nvcuda::wmma::mma_sync operations by +// moving through the K_GLOBAL dimension of the A and B matrices and +// accumulating the intermediate result in the local thread state. + +// There are a number of simple optimizations used in the algorithm: +// - The CTA copies the 128 x 128 tile of the C matrix from the global memory to +// shared memory. After that is done, each warp loads the C matrix fragments +// from shared memory, thus avoiding a random global memory access. 
+// - On each internal iteration, the CTA copies a portion of the A and B +// matrices from +// global memory to shared memory. After that, all warps in the CTA reuse the +// A and B data from shared memory, thus reducing the number of data copies +// from global memory. +// - The portions of the A and B matrices are stored in shared memory with an +// additional +// padding (skew) to reduce the number of shared memory access bank conflicts. +// (See a detailed explanation near the SKEW_HALF macro definition.) +// - When the CTA finishes computing the tiles of the resulting matrix, each +// warp stores +// its subtiles to shared memory. The CTA then copies the shared memory +// contents to global memory, again avoiding redundant random global memory +// accesses. +// - Note that the CTA tile size is chosen to maximize the GPU register +// utilization, +// but carefully enough to avoid local memory use. + +#include +#include +#include +#include + +// helper functions and utilities to work with CUDA +#include +#include + +// Externally configurable parameters. + +#ifndef CPU_DEBUG +// Set this to 1 to verify the correctness of the GPU-computed matrix. +#define CPU_DEBUG 0 +#endif + +#ifndef SHARED_MEMORY_LIMIT_64K +// Set this to 0 to use more than 64 Kb of shared memory to cache data, to +// improve the performance of the computations on GPU. +// Note that you need a GPU that can have more than 64 Kb of shared memory +// per multiprocessor. +#define SHARED_MEMORY_LIMIT_64K 1 +#endif + +// GPU configuration. + +#define WARP_SIZE 32 + +// MMA matrix tile dimensions. + +#define M 16 +#define N 16 +#define K 16 + +#define WMMA_M 16 +#define WMMA_N 16 +#define WMMA_K 16 + +// GEMM configuration. + +#define M_TILES 256 +#define N_TILES 256 +#define K_TILES 256 + +#define M_GLOBAL (M * M_TILES) +#define N_GLOBAL (N * N_TILES) +#define K_GLOBAL (K * K_TILES) + +#define C_LAYOUT wmma::mem_row_major + +// Implementation constants. 
+ +#define WARPS_PER_BLOCK 8 +#define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK) + +#if SHARED_MEMORY_LIMIT_64K +// With only 64 Kb shared memory available, we can fit two 8-tile chunks of +// the A and B matrix data, that are 16 * 16 * 8 * 8 * 2 = 32 Kb each +// (i.e. two 8x8 arrays of tiles of 16x16 uint8_t-typed elements per CTA). +// But we cannot account the 8 Kb total skew overhead, without which the +// performance would be severely impacted. So we choose to reduce the chunk size +// in half, i.e. the amount of A and B matrix data we cache in shared memory. +// Accordingly, this doubles the number of outer iterations across the global K +// dimension, which only slightly impacts the performance. +#define CHUNK_K 8 +#else +#define CHUNK_K 16 +#endif + +#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(uint8_t)) +#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4)) +#define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES) +#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP) + +#define BLOCK_ROW_WARPS 2 +#define BLOCK_COL_WARPS 4 + +#define WARP_ROW_TILES 4 +#define WARP_COL_TILES 2 + +#define BLOCK_ROW_TILES (WARP_ROW_TILES * BLOCK_ROW_WARPS) +#define BLOCK_COL_TILES (WARP_COL_TILES * BLOCK_COL_WARPS) + +#define GLOBAL_MEM_STRIDE N_GLOBAL + +#define SHMEM_STRIDE (N * BLOCK_ROW_TILES) +#define SHMEM_OFFSET (N * WARP_ROW_TILES) + +// The macro below is used to shift rows of the A matrix and columns of the B +// matrix in shared memory to minimize possible bank conflicts. Before +// performing the nvcuda::wmma::mma_sync operation, the warp must load the +// matrix data using the nvcuda::wmma::load_matrix_sync operation. Although the +// memory access pattern is not specified for that function, each lane in the +// warp can read one or multiple matrix elements from different matrix rows or +// columns. For shared memory, such access can result in bank conflicts if +// different rows / columns of the matrix map to the same bank. 
By shifting each +// row and column by a few bytes, we make sure that they map to different banks, +// thus reducing the number of possible bank conflicts. The number of 16 +// one-byte "uint8_t" elements is chosen as the minimum possible shift because +// we must keep each row and column 128-bit aligned, as required by +// nvcuda::wmma::load_matrix_sync. +#define SKEW_UINT8 16 + +#define checkKernelErrors(expr) \ + do { \ + expr; \ + \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + printf("Line %d: '%s' failed: %s\n", __LINE__, #expr, \ + cudaGetErrorString(__err)); \ + abort(); \ + } \ + } while (0) + +using namespace nvcuda; + +__host__ void init_host_matrices(uint8_t *a, uint8_t *b, int *c) { + for (int i = 0; i < M_GLOBAL; i++) { + for (int j = 0; j < K_GLOBAL; j++) { + a[i * K_GLOBAL + j] = (uint8_t)(rand() % 3); + } + } + + for (int i = 0; i < N_GLOBAL; i++) { + for (int j = 0; j < K_GLOBAL; j++) { + b[i * K_GLOBAL + j] = (uint8_t)(rand() % 3); + } + } + + for (int t = 0; t < M_GLOBAL * N_GLOBAL; t++) { + c[t] = (rand() % 3); + } +} + +__global__ void compute_gemm_imma(const uint8_t *A, const uint8_t *B, + const int *C, int *D, int alpha, int beta) { + extern __shared__ uint8_t shmem[][CHUNK_K * K + SKEW_UINT8]; + + // Warp and lane identification. + const unsigned int warpId = threadIdx.x / WARP_SIZE; + const unsigned int laneId = threadIdx.x % WARP_SIZE; + + // Offset in shared memory from which the B matrix is stored. + const size_t shmem_idx_b_off = BLOCK_COL_TILES * M; + + // This pointer is used to access the C and D matrix tiles this warp computes. + int *shmem_warp_tile_ptr = (int *)&shmem[0][0] + + (warpId / 2) * SHMEM_STRIDE * K * 2 + + (warpId % 2) * SHMEM_OFFSET; + + // This pointer is used to stream the C and D matrices block-wide tile to and + // from shared memory. 
+ int *shmem_warp_stream_ptr = (int *)&shmem[0][0] + warpId * SHMEM_STRIDE * K; + + // Adjust the beta scaler, as it'll be multiplied by alpha at the end of + // each tile computation. Technically this is not generally correct (may + // result in a loss of precision). Zero still needs to be specially handled + // though. + beta /= alpha; + + // Each CTA slides along the 128 x 128 tiles from the top left corner of the + // matrix to the right and down, and selects the next tile to compute. Once + // there's no such tile, all warps in this CTA exit. + for (unsigned int block_pos = blockIdx.x;; block_pos += gridDim.x) { + const unsigned int block_tile_i = + ((block_pos * BLOCK_ROW_TILES) / N_TILES) * (BLOCK_COL_TILES); + const unsigned int block_tile_j = (block_pos * BLOCK_COL_TILES) % N_TILES; + + // Stop when there are no more D matrix tiles to compute in this CTA. + if (block_tile_i >= M_TILES) { + break; + } + + // This warp's pointer to the C matrix data to copy memory from to shared + // memory. + const size_t gmem_idx = + (block_tile_i + warpId) * M * GLOBAL_MEM_STRIDE + block_tile_j * N; + const int *src_gmem_warp_stream_ptr = &C[gmem_idx]; + + // Stream multiple C tiles to shared memory. +#pragma unroll + for (int i = 0; i < K; i++) { + typedef int4 copy_t; + + *((copy_t *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId) = + *((copy_t *)(src_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + + laneId); + } + + __syncthreads(); + + // These fragments will accumulate the result of A and B matrix fragment + // multiplications along the K_GLOBAL dimension. + wmma::fragment c[WARP_COL_TILES] + [WARP_ROW_TILES]; + + // Load the C matrix tiles into fragments from shared memory. 
+#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + const int *tile_ptr = + shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N; + + wmma::load_matrix_sync(c[i][j], tile_ptr, SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Scale the C matrix. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + for (int t = 0; t < c[i][j].num_elements; t++) { + c[i][j].x[t] *= beta; + } + } + } + + // Select what warp copies what matrix to shared memory. + // Warps 0-3 copy the A matrix, warps 4-7 copy the B matrix. + const uint8_t *warp_ptr = (warpId < 4) ? (&A[block_tile_i * M * K_GLOBAL] + + M * K_GLOBAL * (warpId % 4) * 2) + : (&B[block_tile_j * N * K_GLOBAL] + + N * K_GLOBAL * (warpId % 4) * 2); + + // Go through the global K dimension by a fixed step at a time. +#pragma unroll + for (int tile_k = 0; tile_k < K_TILES; tile_k += CHUNK_K) { + // Copy slices of the A and B matrices to shared memory. + // The first half of the warps in the CTA copy the A matrix, the rest copy + // the B matrix. + size_t shmem_idx = + warpId < (WARPS_PER_BLOCK / 2) + ? (M * (warpId % (WARPS_PER_BLOCK / 2)) * 2) + : (N * (warpId % (WARPS_PER_BLOCK / 2)) * 2 + shmem_idx_b_off); + + // First half of the warp copies the first row / column of the matrix, + // the second half of the warp copies the next. + int4 *lane_ptr = (int4 *)(warp_ptr + tile_k * K + + (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL) + + (laneId % CHUNK_COPY_LINE_LANES); + + // Shift the second half of the warp to the next row / column in the + // shared memory. + shmem_idx += laneId / CHUNK_COPY_LINE_LANES; + +#pragma unroll + for (int i = 0; i < ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2; + i++) { + // Copy 16 bytes at once in each lane. 
+ *((int4 *)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) = + *lane_ptr; + + // Advance the global memory pointer and the shared memory index. + lane_ptr = (int4 *)((uint8_t *)lane_ptr + + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP); + shmem_idx += CHUNK_COPY_LINES_PER_WARP; + } + + __syncthreads(); + + // Compute a grid of C matrix tiles in each warp. +#pragma unroll + for (int k_step = 0; k_step < CHUNK_K; k_step++) { + wmma::fragment + a[WARP_COL_TILES]; + wmma::fragment + b[WARP_ROW_TILES]; + +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { + size_t shmem_idx_a = (warpId / 2) * M * 2 + (i * M); + const uint8_t *tile_ptr = &shmem[shmem_idx_a][k_step * K]; + + wmma::load_matrix_sync(a[i], tile_ptr, K * CHUNK_K + SKEW_UINT8); + +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { + if (i == 0) { + // Load the B matrix fragment once, because it is going to be + // reused against the other A matrix fragments. + size_t shmem_idx_b = shmem_idx_b_off + + (WARP_ROW_TILES * N) * (warpId % 2) + + (j * N); + const uint8_t *tile_ptr = &shmem[shmem_idx_b][k_step * K]; + + wmma::load_matrix_sync(b[j], tile_ptr, K * CHUNK_K + SKEW_UINT8); + } + + wmma::mma_sync(c[i][j], a[i], b[j], c[i][j]); + } + } + } + + __syncthreads(); + } + + // Store the D fragments to shared memory. +#pragma unroll + for (int i = 0; i < WARP_COL_TILES; i++) { +#pragma unroll + for (int j = 0; j < WARP_ROW_TILES; j++) { +#pragma unroll + // Uniform, point-wise transformations of ALL fragment elements by ALL + // threads in the warp are well-defined even though element indices + // within fragment storage are not defined. + for (int t = 0; t < c[i][j].num_elements; t++) c[i][j].x[t] *= alpha; + + int *tile_ptr = shmem_warp_tile_ptr + i * SHMEM_STRIDE * K + j * N; + + wmma::store_matrix_sync(tile_ptr, c[i][j], SHMEM_STRIDE, C_LAYOUT); + } + } + + __syncthreads(); + + // Now that shared memory contains all the D tiles, stream them to global + // memory. 
+ int *dst_gmem_warp_stream_ptr = &D[gmem_idx]; + +#pragma unroll + for (int i = 0; i < K; i++) { + *((int4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) = + *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId); + } + + __syncthreads(); + } +} + +// Performs an MxNxK GEMM (C=alpha*A*B + beta*C) assuming: +// 1) Matrices are packed in memory. +// 2) M, N and K are multiples of 16. +// 3) Neither A nor B are transposed. +// Note: This is a less performant version of the compute_gemm_imma kernel. It +// is designed for +// demonstration purposes only to show the CUDA WMMA API use without +// relying on availability of the shared memory. +__global__ void simple_wmma_gemm_imma(const uint8_t *a, const uint8_t *b, + const int *c, int *d, int m_ld, int n_ld, + int k_ld, int alpha, int beta) { + // Leading dimensions. Packed with no transpositions. + int lda = m_ld; + int ldb = k_ld; + int ldc = n_ld; + + // Tile using a 2D grid + int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize; + int warpN = (blockIdx.y * blockDim.y + threadIdx.y); + + // Declare the fragments + wmma::fragment + a_frag; + wmma::fragment + b_frag; + wmma::fragment acc_frag; + wmma::fragment c_frag; + + wmma::fill_fragment(acc_frag, 0.0f); + + // Loop over k + for (int i = 0; i < k_ld; i += WMMA_K) { + int aCol = i; + int aRow = warpM * WMMA_M; + + int bCol = i; + int bRow = warpN * WMMA_N; + + // Bounds checking + if (aRow < m_ld && aCol < k_ld && bRow < k_ld && bCol < n_ld) { + // Load the inputs + wmma::load_matrix_sync(a_frag, a + aCol + aRow * lda, lda); + wmma::load_matrix_sync(b_frag, b + bCol + bRow * ldb, ldb); + + // Perform the matrix multiplication + wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); + } + } + + // Load in the current value of c, scale it by beta, and add this our result + // scaled by alpha + int cCol = warpN * WMMA_N; + int cRow = warpM * WMMA_M; + + if (cRow < m_ld && cCol < n_ld) { + wmma::load_matrix_sync(c_frag, c + cCol + cRow * 
ldc, ldc, + wmma::mem_row_major); + + for (int i = 0; i < c_frag.num_elements; i++) { + c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i]; + } + + // Store the output + wmma::store_matrix_sync(d + cCol + cRow * ldc, c_frag, ldc, + wmma::mem_row_major); + } +} + +__host__ void matMultiplyOnHost(uint8_t *A, uint8_t *B, int *C, int alpha, + int beta, int numARows, int numAColumns, + int numBRows, int numBColumns, int numCRows, + int numCColumns) { + for (int i = 0; i < numCRows; i++) { + for (int j = 0; j < numCColumns; j++) { + int temp = 0; + + for (int k = 0; k < numAColumns; k++) { + temp += A[i * numAColumns + k] * B[j * numBRows + k]; + } + + C[i * numCColumns + j] = temp * alpha + beta * C[i * numCColumns + j]; + } + } +} + +int main(int argc, char **argv) { + printf("Initializing...\n"); + + int dev = findCudaDevice(argc, (const char **)argv); + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + + // Tensor cores require a GPU of Volta (SM72) architecture or higher. + if (deviceProp.major < 7 || (deviceProp.major <= 7 && deviceProp.minor < 2)) { + printf( + "immaTensorCoreGemm requires SM 7.2 or higher to use Tensor Cores. 
" + "Exiting...\n"); + exit(EXIT_WAIVED); + } + + printf("M: %d (%d x %d)\n", M_GLOBAL, M, M_TILES); + printf("N: %d (%d x %d)\n", N_GLOBAL, N, N_TILES); + printf("K: %d (%d x %d)\n", K_GLOBAL, K, K_TILES); + + uint8_t *A_h = NULL; + uint8_t *B_h = NULL; + int *C_h = NULL; +#if CPU_DEBUG + int *result_hD = NULL; + int *result_host = NULL; +#endif + + A_h = (uint8_t *)malloc(sizeof(uint8_t) * M_GLOBAL * K_GLOBAL); + B_h = (uint8_t *)malloc(sizeof(uint8_t) * K_GLOBAL * N_GLOBAL); + C_h = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL); +#if CPU_DEBUG + result_hD = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL); + result_host = (int *)malloc(sizeof(int) * M_GLOBAL * N_GLOBAL); +#endif + + uint8_t *A = NULL; + uint8_t *B = NULL; + int *C = NULL; + int *D = NULL; + + checkCudaErrors( + cudaMalloc(reinterpret_cast(&A), sizeof(uint8_t) * M_GLOBAL * K_GLOBAL)); + checkCudaErrors( + cudaMalloc(reinterpret_cast(&B), sizeof(uint8_t) * N_GLOBAL * K_GLOBAL)); + checkCudaErrors(cudaMalloc(reinterpret_cast(&C), sizeof(int) * M_GLOBAL * N_GLOBAL)); + checkCudaErrors(cudaMalloc(reinterpret_cast(&D), sizeof(int) * M_GLOBAL * N_GLOBAL)); + + assert(((unsigned long long)A) % 128 == 0); + assert(((unsigned long long)B) % 128 == 0); + assert(((unsigned long long)C) % 128 == 0); + assert(((unsigned long long)D) % 128 == 0); + + init_host_matrices(A_h, B_h, C_h); + + checkCudaErrors(cudaMemcpy(A, A_h, sizeof(uint8_t) * M_GLOBAL * K_GLOBAL, + cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(B, B_h, sizeof(uint8_t) * N_GLOBAL * K_GLOBAL, + cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(C, C_h, sizeof(int) * M_GLOBAL * N_GLOBAL, + cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemset(D, 0, sizeof(int) * M_GLOBAL * N_GLOBAL)); + + printf("Preparing data for GPU...\n"); + + assert(((unsigned long long)A) % 128 == 0); + assert(((unsigned long long)B) % 128 == 0); + assert(((unsigned long long)C) % 128 == 0); + assert(((unsigned long long)D) % 128 == 0); + + enum { + // Compute 
the right amount of shared memory to request. + // We need shared memory to hold per-CTA C and D matrix tiles, and to cache + // per-CTA chunks + // of the A and B matrices. Therefore, the right amount to request is the + // maximum of those + // two numbers. + SHMEM_SZ = MAX(sizeof(uint8_t) * (BLOCK_COL_TILES * M) * + (CHUNK_K * K + SKEW_UINT8) * 2, + M * (BLOCK_ROW_WARPS * WARP_ROW_TILES) * N * + (BLOCK_COL_WARPS * WARP_COL_TILES) * sizeof(int)) + }; + + printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL); + + int alpha = 1; + int beta = 1; + + cudaEvent_t start, stop; + + checkCudaErrors(cudaEventCreate(&start)); + checkCudaErrors(cudaEventCreate(&stop)); + checkCudaErrors(cudaEventRecord(start)); + + // If enough shared memory available on the GPU use high performant kernel + if (deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) { + printf("Computing... using high performance kernel compute_gemm_imma \n"); + + checkCudaErrors(cudaFuncSetAttribute( + compute_gemm_imma, cudaFuncAttributeMaxDynamicSharedMemorySize, + SHMEM_SZ)); + checkKernelErrors( + (compute_gemm_imma<<>>(A, B, C, D, alpha, beta))); +#if CPU_DEBUG + checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(int) * M_GLOBAL * N_GLOBAL, + cudaMemcpyDeviceToHost)); +#endif + } else { + dim3 gridDim; + dim3 blockDim; + + // blockDim.x must be a multiple of warpSize + // 128x4 means we have 16 warps and a block computes a 64x64 output tile + blockDim.x = 128; + blockDim.y = 4; + + gridDim.x = (M_GLOBAL + (WMMA_M * blockDim.x / 32 - 1)) / + (WMMA_M * blockDim.x / 32); + gridDim.y = (N_GLOBAL + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y); + + printf("Computing... 
using simple_wmma_gemm_imma kernel\n"); + simple_wmma_gemm_imma<<>>(A, B, C, D, M_GLOBAL, N_GLOBAL, + K_GLOBAL, alpha, beta); +#if CPU_DEBUG + checkCudaErrors(cudaMemcpy(result_hD, D, sizeof(int) * M_GLOBAL * N_GLOBAL, + cudaMemcpyDeviceToHost)); +#endif + } + + checkCudaErrors(cudaEventRecord(stop)); + checkCudaErrors(cudaEventSynchronize(stop)); + +#if CPU_DEBUG + printf("Verifying correctness of the computations...\n"); + + memcpy(result_host, C_h, sizeof(int) * M_GLOBAL * N_GLOBAL); + + matMultiplyOnHost(A_h, B_h, result_host, alpha, beta, M_GLOBAL, K_GLOBAL, + K_GLOBAL, N_GLOBAL, M_GLOBAL, N_GLOBAL); + + for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) { + if (abs(result_hD[i] - result_host[i]) > 0) { + printf("mismatch i=%d result_hD=%d result_host=%d\n", i, result_hD[i], + result_host[i]); + } + } + free(result_host); + free(result_hD); +#endif + + float milliseconds = 0; + + checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop)); + + printf("Time: %f ms\n", milliseconds); + printf("TOPS: %.2f\n", (((double)M_GLOBAL * N_GLOBAL * K_GLOBAL * 2)/(milliseconds/1000.)) / 1e12); + + free(A_h); + free(B_h); + free(C_h); + checkCudaErrors(cudaFree(reinterpret_cast(A))); + checkCudaErrors(cudaFree(reinterpret_cast(B))); + checkCudaErrors(cudaFree(reinterpret_cast(C))); + checkCudaErrors(cudaFree(reinterpret_cast(D))); + + return EXIT_SUCCESS; +} diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2012.sln b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2012.sln new file mode 100644 index 000000000..fccd0a255 --- /dev/null +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2012.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "immaTensorCoreGemm", "immaTensorCoreGemm_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + 
Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2012.vcxproj b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2012.vcxproj new file mode 100644 index 000000000..d9a369da1 --- /dev/null +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2012.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + immaTensorCoreGemm_vs2012 + immaTensorCoreGemm + + + + + Application + MultiByte + v110 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/immaTensorCoreGemm.exe + + + compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2013.sln b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2013.sln new file mode 100644 index 
000000000..e389a7073 --- /dev/null +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "immaTensorCoreGemm", "immaTensorCoreGemm_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2013.vcxproj b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2013.vcxproj new file mode 100644 index 000000000..a4d885456 --- /dev/null +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2013.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + immaTensorCoreGemm_vs2013 + immaTensorCoreGemm + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + 
$(OutDir)/immaTensorCoreGemm.exe + + + compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2015.sln b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2015.sln new file mode 100644 index 000000000..b630ed5ee --- /dev/null +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "immaTensorCoreGemm", "immaTensorCoreGemm_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2015.vcxproj b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2015.vcxproj new file mode 100644 index 000000000..e4a8b3e1b --- /dev/null +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2015.vcxproj @@ -0,0 +1,107 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + immaTensorCoreGemm_vs2015 + immaTensorCoreGemm + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + 
+ + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/immaTensorCoreGemm.exe + + + compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.sln b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.sln new file mode 100644 index 000000000..5579f261d --- /dev/null +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "immaTensorCoreGemm", "immaTensorCoreGemm_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj 
b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj new file mode 100644 index 000000000..0c260dca6 --- /dev/null +++ b/Samples/immaTensorCoreGemm/immaTensorCoreGemm_vs2017.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + immaTensorCoreGemm_vs2017 + immaTensorCoreGemm + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/immaTensorCoreGemm.exe + + + compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + diff --git a/Samples/matrixMul/Makefile b/Samples/matrixMul/Makefile index e5ade9c22..c4ab15e5d 100644 --- a/Samples/matrixMul/Makefile +++ b/Samples/matrixMul/Makefile @@ -246,7 +246,11 @@ LIBRARIES := ################################################################################ # Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 30 35 37 50 52 60 61 70 72 75 +else SMS ?= 30 35 37 50 52 60 61 70 75 +endif ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/matrixMul/NsightEclipse.xml b/Samples/matrixMul/NsightEclipse.xml index 38ea6b03d..364b84c10 100644 --- a/Samples/matrixMul/NsightEclipse.xml +++ b/Samples/matrixMul/NsightEclipse.xml 
@@ -46,6 +46,7 @@ sm60 sm61 sm70 + sm72 sm75 diff --git a/Samples/matrixMul/README.md b/Samples/matrixMul/README.md index ae3bdf716..36927ba4d 100644 --- a/Samples/matrixMul/README.md +++ b/Samples/matrixMul/README.md @@ -10,7 +10,7 @@ CUDA Runtime API, Linear Algebra ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/matrixMul/matrixMul_vs2012.vcxproj b/Samples/matrixMul/matrixMul_vs2012.vcxproj index 109801176..365318d58 100644 --- a/Samples/matrixMul/matrixMul_vs2012.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/matrixMul/matrixMul_vs2013.vcxproj b/Samples/matrixMul/matrixMul_vs2013.vcxproj index fc8f15800..373420eba 100644 --- a/Samples/matrixMul/matrixMul_vs2013.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/matrixMul/matrixMul_vs2015.vcxproj b/Samples/matrixMul/matrixMul_vs2015.vcxproj index 135f764cb..65f334121 100644 --- a/Samples/matrixMul/matrixMul_vs2015.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/matrixMul/matrixMul_vs2017.vcxproj b/Samples/matrixMul/matrixMul_vs2017.vcxproj index d145fa07b..5b9e1e586 100644 --- a/Samples/matrixMul/matrixMul_vs2017.vcxproj +++ b/Samples/matrixMul/matrixMul_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/matrixMulDrv/README.md b/Samples/matrixMulDrv/README.md index e22bee66b..5096f5241 100644 --- a/Samples/matrixMulDrv/README.md +++ b/Samples/matrixMulDrv/README.md @@ -10,7 +10,7 @@ CUDA Driver API, Matrix Multiply ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 
](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj index e13d4bad9..51d15d5d2 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -106,6 +106,6 @@ - + diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj index 827374d79..ed0d8bee9 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -106,6 +106,6 @@ - + diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj index 6bff35672..638d1bb57 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -106,6 +106,6 @@ - + diff --git a/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj b/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj index bd5d40780..c04b34878 100644 --- a/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj +++ b/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -107,6 +107,6 @@ 
- + diff --git a/Samples/nvJPEG/Makefile b/Samples/nvJPEG/Makefile new file mode 100644 index 000000000..0d7452f3c --- /dev/null +++ b/Samples/nvJPEG/Makefile @@ -0,0 +1,301 @@ +################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 
+endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= 
aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq 
($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - nvJPEG is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - nvJPEG is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on aarch64 +ifeq ($(TARGET_ARCH),aarch64) + $(info >>> WARNING - nvJPEG is not supported on aarch64 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +LIBRARIES += -lnvjpeg + +ifeq 
($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: nvJPEG + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +nvJPEG.o:nvJPEG.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +nvJPEG: nvJPEG.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./nvJPEG + +clean: + rm -f nvJPEG nvJPEG.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/nvJPEG + +clobber: clean diff --git a/Samples/nvJPEG/NsightEclipse.xml b/Samples/nvJPEG/NsightEclipse.xml new file mode 100644 index 000000000..1485c97b8 --- /dev/null +++ b/Samples/nvJPEG/NsightEclipse.xml @@ -0,0 +1,58 @@ + + + + nvJPEG + + whole + + ./ + ../ + ../../common/inc + + + Image Decoding + NVJPEG Library + + + NVJPEG + JPEG Decoding + + + nvjpeg + + + + true + nvJPEG.cpp + + -i ../../../../Samples/nvJPEG/images/ + + + NVJPEG + + + 1:CUDA Basic Topics + 3:JPEG Decoding + + sm30 + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm72 + sm75 + + + x86_64 + linux + + + + 3.0 + + NVJPEG simple + exe + diff --git a/Samples/nvJPEG/README.md b/Samples/nvJPEG/README.md new file mode 100644 index 000000000..19454c318 --- /dev/null +++ b/Samples/nvJPEG/README.md @@ -0,0 +1,61 @@ +# nvJPEG - NVJPEG simple + +## Description + +A CUDA Sample that demonstrates single and batched decoding of jpeg images using NVJPEG Library. 
+ +## Key Concepts + +Image Decoding, NVJPEG Library + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux + +## Supported CPU Architecture + +x86_64 + +## CUDA APIs involved + +## Dependencies needed to build/run +[NVJPEG](../../README.md#nvjpeg) + +## Prerequisites + +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in [Dependencies]() section above are installed. + +## Build and Run + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + diff --git a/Samples/nvJPEG/images/img1.jpg b/Samples/nvJPEG/images/img1.jpg new file mode 100644 index 000000000..7413faf05 Binary files /dev/null and b/Samples/nvJPEG/images/img1.jpg differ diff --git a/Samples/nvJPEG/images/img2.jpg b/Samples/nvJPEG/images/img2.jpg new file mode 100644 index 000000000..d5b53b266 Binary files /dev/null and b/Samples/nvJPEG/images/img2.jpg differ diff --git a/Samples/nvJPEG/images/img3.jpg b/Samples/nvJPEG/images/img3.jpg new file mode 100644 index 000000000..41b1d9b48 Binary files /dev/null and b/Samples/nvJPEG/images/img3.jpg differ diff --git a/Samples/nvJPEG/images/img4.jpg b/Samples/nvJPEG/images/img4.jpg new file mode 100644 index 000000000..17fa69ef2 Binary files /dev/null and b/Samples/nvJPEG/images/img4.jpg differ diff --git a/Samples/nvJPEG/images/img5.jpg b/Samples/nvJPEG/images/img5.jpg new file mode 100644 index 000000000..148ccb415 Binary files /dev/null and b/Samples/nvJPEG/images/img5.jpg differ diff --git a/Samples/nvJPEG/images/img6.jpg b/Samples/nvJPEG/images/img6.jpg new file mode 100644 index 000000000..a58ef7c5f Binary files /dev/null and b/Samples/nvJPEG/images/img6.jpg differ diff --git a/Samples/nvJPEG/images/img7.jpg b/Samples/nvJPEG/images/img7.jpg new file mode 100644 index 
000000000..f49d9122e Binary files /dev/null and b/Samples/nvJPEG/images/img7.jpg differ diff --git a/Samples/nvJPEG/images/img8.jpg b/Samples/nvJPEG/images/img8.jpg new file mode 100644 index 000000000..ab51a789b Binary files /dev/null and b/Samples/nvJPEG/images/img8.jpg differ diff --git a/Samples/nvJPEG/nvJPEG.cpp b/Samples/nvJPEG/nvJPEG.cpp new file mode 100644 index 000000000..b52f52b08 --- /dev/null +++ b/Samples/nvJPEG/nvJPEG.cpp @@ -0,0 +1,559 @@ +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// This sample needs at least CUDA 10.0. It demonstrates usages of the nvJPEG +// library nvJPEG supports single and multiple image(batched) decode. Multiple +// images can be decoded using the API for batch mode + +#include +#include "nvJPEG_helper.hxx" + +int dev_malloc(void **p, size_t s) { return (int)cudaMalloc(p, s); } + +int dev_free(void *p) { return (int)cudaFree(p); } + +typedef std::vector FileNames; +typedef std::vector > FileData; + +struct decode_params_t { + std::string input_dir; + int batch_size; + int total_images; + int dev; + int warmup; + + nvjpegJpegState_t nvjpeg_state; + nvjpegHandle_t nvjpeg_handle; + cudaStream_t stream; + + nvjpegOutputFormat_t fmt; + bool write_decoded; + std::string output_dir; + + bool pipelined; + bool batched; +}; + +int read_next_batch(FileNames &image_names, int batch_size, + FileNames::iterator &cur_iter, FileData &raw_data, + std::vector &raw_len, FileNames ¤t_names) { + int counter = 0; + + while (counter < batch_size) { + if (cur_iter == image_names.end()) { + std::cerr << "Image list is too short to fill the batch, adding files " + "from the beginning of the image list" + << std::endl; + cur_iter = image_names.begin(); + } + + if (image_names.size() == 0) { + std::cerr << "No valid images left in the input list, exit" << std::endl; + return EXIT_FAILURE; + } + + // Read an image from disk. 
+ std::ifstream input(cur_iter->c_str(), + std::ios::in | std::ios::binary | std::ios::ate); + if (!(input.is_open())) { + std::cerr << "Cannot open image: " << *cur_iter + << ", removing it from image list" << std::endl; + image_names.erase(cur_iter); + continue; + } + + // Get the size + std::streamsize file_size = input.tellg(); + input.seekg(0, std::ios::beg); + // resize if buffer is too small + if (raw_data[counter].size() < file_size) { + raw_data[counter].resize(file_size); + } + if (!input.read(raw_data[counter].data(), file_size)) { + std::cerr << "Cannot read from file: " << *cur_iter + << ", removing it from image list" << std::endl; + image_names.erase(cur_iter); + continue; + } + raw_len[counter] = file_size; + + current_names[counter] = *cur_iter; + + counter++; + cur_iter++; + } + return EXIT_SUCCESS; +} + +// prepare buffers for RGBi output format +int prepare_buffers(FileData &file_data, std::vector &file_len, + std::vector &img_width, std::vector &img_height, + std::vector &ibuf, + std::vector &isz, FileNames ¤t_names, + decode_params_t ¶ms) { + int widths[NVJPEG_MAX_COMPONENT]; + int heights[NVJPEG_MAX_COMPONENT]; + int channels; + nvjpegChromaSubsampling_t subsampling; + + for (int i = 0; i < file_data.size(); i++) { + checkCudaErrors(nvjpegGetImageInfo( + params.nvjpeg_handle, (unsigned char *)file_data[i].data(), file_len[i], + &channels, &subsampling, widths, heights)); + + img_width[i] = widths[0]; + img_height[i] = heights[0]; + + std::cout << "Processing: " << current_names[i] << std::endl; + std::cout << "Image is " << channels << " channels." 
<< std::endl; + for (int c = 0; c < channels; c++) { + std::cout << "Channel #" << c << " size: " << widths[c] << " x " + << heights[c] << std::endl; + } + + switch (subsampling) { + case NVJPEG_CSS_444: + std::cout << "YUV 4:4:4 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_440: + std::cout << "YUV 4:4:0 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_422: + std::cout << "YUV 4:2:2 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_420: + std::cout << "YUV 4:2:0 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_411: + std::cout << "YUV 4:1:1 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_410: + std::cout << "YUV 4:1:0 chroma subsampling" << std::endl; + break; + case NVJPEG_CSS_GRAY: + std::cout << "Grayscale JPEG " << std::endl; + break; + case NVJPEG_CSS_UNKNOWN: + std::cout << "Unknown chroma subsampling" << std::endl; + return EXIT_FAILURE; + } + + int mul = 1; + // in the case of interleaved RGB output, write only to single channel, but + // 3 samples at once + if (params.fmt == NVJPEG_OUTPUT_RGBI || params.fmt == NVJPEG_OUTPUT_BGRI) { + channels = 1; + mul = 3; + } + // in the case of rgb create 3 buffers with sizes of original image + else if (params.fmt == NVJPEG_OUTPUT_RGB || + params.fmt == NVJPEG_OUTPUT_BGR) { + channels = 3; + widths[1] = widths[2] = widths[0]; + heights[1] = heights[2] = heights[0]; + } + + // realloc output buffer if required + for (int c = 0; c < channels; c++) { + int aw = mul * widths[c]; + int ah = heights[c]; + int sz = aw * ah; + ibuf[i].pitch[c] = aw; + if (sz > isz[i].pitch[c]) { + if (ibuf[i].channel[c]) { + checkCudaErrors(cudaFree(ibuf[i].channel[c])); + } + checkCudaErrors(cudaMalloc(&ibuf[i].channel[c], sz)); + isz[i].pitch[c] = sz; + } + } + } + return EXIT_SUCCESS; +} + +void release_buffers(std::vector &ibuf) { + for (int i = 0; i < ibuf.size(); i++) { + for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) + if (ibuf[i].channel[c]) 
checkCudaErrors(cudaFree(ibuf[i].channel[c])); + } +} + +int decode_images(const FileData &img_data, const std::vector &img_len, + std::vector &out, decode_params_t ¶ms, + double &time) { + checkCudaErrors(cudaStreamSynchronize(params.stream)); + nvjpegStatus_t err; + StopWatchInterface *timer = NULL; + sdkCreateTimer(&timer); + + if (!params.batched) { + if (!params.pipelined) // decode one image at a time + { + int thread_idx = 0; + sdkStartTimer(&timer); + for (int i = 0; i < params.batch_size; i++) { + checkCudaErrors(nvjpegDecode(params.nvjpeg_handle, params.nvjpeg_state, + (const unsigned char *)img_data[i].data(), + img_len[i], params.fmt, &out[i], + params.stream)); + checkCudaErrors(cudaStreamSynchronize(params.stream)); + } + } else { + int thread_idx = 0; + sdkStartTimer(&timer); + for (int i = 0; i < params.batch_size; i++) { + checkCudaErrors( + nvjpegDecodePhaseOne(params.nvjpeg_handle, params.nvjpeg_state, + (const unsigned char *)img_data[i].data(), + img_len[i], params.fmt, params.stream)); + checkCudaErrors(cudaStreamSynchronize(params.stream)); + checkCudaErrors(nvjpegDecodePhaseTwo( + params.nvjpeg_handle, params.nvjpeg_state, params.stream)); + checkCudaErrors(nvjpegDecodePhaseThree( + params.nvjpeg_handle, params.nvjpeg_state, &out[i], params.stream)); + } + checkCudaErrors(cudaStreamSynchronize(params.stream)); + } + } else { + std::vector raw_inputs; + for (int i = 0; i < params.batch_size; i++) { + raw_inputs.push_back((const unsigned char *)img_data[i].data()); + } + + if (!params.pipelined) // decode multiple images in a single batch + { + sdkStartTimer(&timer); + checkCudaErrors(nvjpegDecodeBatched( + params.nvjpeg_handle, params.nvjpeg_state, raw_inputs.data(), + img_len.data(), out.data(), params.stream)); + checkCudaErrors(cudaStreamSynchronize(params.stream)); + } else { + int thread_idx = 0; + for (int i = 0; i < params.batch_size; i++) { + checkCudaErrors(nvjpegDecodeBatchedPhaseOne( + params.nvjpeg_handle, params.nvjpeg_state, 
raw_inputs[i], + img_len[i], i, thread_idx, params.stream)); + } + checkCudaErrors(nvjpegDecodeBatchedPhaseTwo( + params.nvjpeg_handle, params.nvjpeg_state, params.stream)); + checkCudaErrors(nvjpegDecodeBatchedPhaseThree(params.nvjpeg_handle, + params.nvjpeg_state, + out.data(), params.stream)); + checkCudaErrors(cudaStreamSynchronize(params.stream)); + } + } + sdkStopTimer(&timer); + time = sdkGetAverageTimerValue(&timer)/1000.0f; + + return EXIT_SUCCESS; +} + +int write_images(std::vector &iout, std::vector &widths, + std::vector &heights, decode_params_t ¶ms, + FileNames &filenames) { + for (int i = 0; i < params.batch_size; i++) { + // Get the file name, without extension. + // This will be used to rename the output file. + size_t position = filenames[i].rfind("/"); + std::string sFileName = + (std::string::npos == position) + ? filenames[i] + : filenames[i].substr(position + 1, filenames[i].size()); + position = sFileName.rfind("."); + sFileName = (std::string::npos == position) ? 
sFileName + : sFileName.substr(0, position); + std::string fname(params.output_dir + "/" + sFileName + ".bmp"); + + int err; + if (params.fmt == NVJPEG_OUTPUT_RGB || params.fmt == NVJPEG_OUTPUT_BGR) { + err = writeBMP(fname.c_str(), iout[i].channel[0], iout[i].pitch[0], + iout[i].channel[1], iout[i].pitch[1], iout[i].channel[2], + iout[i].pitch[2], widths[i], heights[i]); + } else if (params.fmt == NVJPEG_OUTPUT_RGBI || + params.fmt == NVJPEG_OUTPUT_BGRI) { + // Write BMP from interleaved data + err = writeBMPi(fname.c_str(), iout[i].channel[0], iout[i].pitch[0], + widths[i], heights[i]); + } + if (err) { + std::cout << "Cannot write output file: " << fname << std::endl; + return EXIT_FAILURE; + } + std::cout << "Done writing decoded image to file: " << fname << std::endl; + } +} + +double process_images(FileNames &image_names, decode_params_t ¶ms, + double &total) { + // vector for storing raw files and file lengths + FileData file_data(params.batch_size); + std::vector file_len(params.batch_size); + FileNames current_names(params.batch_size); + std::vector widths(params.batch_size); + std::vector heights(params.batch_size); + // we wrap over image files to process total_images of files + FileNames::iterator file_iter = image_names.begin(); + + // stream for decoding + checkCudaErrors( + cudaStreamCreateWithFlags(¶ms.stream, cudaStreamNonBlocking)); + + int total_processed = 0; + + // output buffers + std::vector iout(params.batch_size); + // output buffer sizes, for convenience + std::vector isz(params.batch_size); + + for (int i = 0; i < iout.size(); i++) { + for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) { + iout[i].channel[c] = NULL; + iout[i].pitch[c] = 0; + isz[i].pitch[c] = 0; + } + } + + double test_time = 0; + int warmup = 0; + while (total_processed < params.total_images) { + if (read_next_batch(image_names, params.batch_size, file_iter, file_data, + file_len, current_names)) + return EXIT_FAILURE; + + if (prepare_buffers(file_data, file_len, widths, 
heights, iout, isz, + current_names, params)) + return EXIT_FAILURE; + + double time; + if (decode_images(file_data, file_len, iout, params, time)) + return EXIT_FAILURE; + if (warmup < params.warmup) { + warmup++; + } else { + total_processed += params.batch_size; + test_time += time; + } + + if (params.write_decoded) + write_images(iout, widths, heights, params, current_names); + } + total = test_time; + + release_buffers(iout); + + checkCudaErrors(cudaStreamDestroy(params.stream)); + + return EXIT_SUCCESS; +} + +// parse parameters +int findParamIndex(const char **argv, int argc, const char *parm) { + int count = 0; + int index = -1; + + for (int i = 0; i < argc; i++) { + if (strncmp(argv[i], parm, 100) == 0) { + index = i; + count++; + } + } + + if (count == 0 || count == 1) { + return index; + } else { + std::cout << "Error, parameter " << parm + << " has been specified more than once, exiting\n" + << std::endl; + return -1; + } + + return -1; +} + +int main(int argc, const char *argv[]) { + int pidx; + + if ((pidx = findParamIndex(argv, argc, "-h")) != -1 || + (pidx = findParamIndex(argv, argc, "--help")) != -1) { + std::cout << "Usage: " << argv[0] + << " -i images_dir [-b batch_size] [-t total_images] [-device= " + "device_id] [-w warmup_iterations] [-o output_dir] " + "[-pipelined] [-batched] [-fmt output_format]\n"; + std::cout << "Parameters: " << std::endl; + std::cout << "\timages_dir\t:\tPath to single image or directory of images" + << std::endl; + std::cout << "\tbatch_size\t:\tDecode images from input by batches of " + "specified size" + << std::endl; + std::cout << "\ttotal_images\t:\tDecode this much images, if there are " + "less images \n" + << "\t\t\t\t\tin the input than total images, decoder will loop " + "over the input" + << std::endl; + std::cout << "\tdevice_id\t:\tWhich device to use for decoding" + << std::endl; + std::cout << "\twarmup_iterations\t:\tRun this amount of batches first " + "without measuring performance" + << std::endl; 
+ std::cout + << "\toutput_dir\t:\tWrite decoded images as BMPs to this directory" + << std::endl; + std::cout << "\tpipelined\t:\tUse decoding in phases" << std::endl; + std::cout << "\tbatched\t\t:\tUse batched interface" << std::endl; + std::cout << "\toutput_format\t:\tnvJPEG output format for decoding. One " + "of [rgb, rgbi, bgr, bgri, yuv, y, unchanged]" + << std::endl; + return EXIT_SUCCESS; + } + + decode_params_t params; + + params.input_dir = "./"; + if ((pidx = findParamIndex(argv, argc, "-i")) != -1) { + params.input_dir = argv[pidx + 1]; + } else { + std::cerr << "Please specify input directory with encoded images" + << std::endl; + return EXIT_WAIVED; + } + + params.batch_size = 1; + if ((pidx = findParamIndex(argv, argc, "-b")) != -1) { + params.batch_size = std::atoi(argv[pidx + 1]); + } + + params.total_images = -1; + if ((pidx = findParamIndex(argv, argc, "-t")) != -1) { + params.total_images = std::atoi(argv[pidx + 1]); + } + + params.dev = 0; + params.dev = findCudaDevice(argc, argv); + + params.warmup = 0; + if ((pidx = findParamIndex(argv, argc, "-w")) != -1) { + params.warmup = std::atoi(argv[pidx + 1]); + } + + params.batched = false; + if ((pidx = findParamIndex(argv, argc, "-batched")) != -1) { + params.batched = true; + } + + params.pipelined = false; + if ((pidx = findParamIndex(argv, argc, "-pipelined")) != -1) { + params.pipelined = true; + } + + params.fmt = NVJPEG_OUTPUT_RGB; + if ((pidx = findParamIndex(argv, argc, "-fmt")) != -1) { + std::string sfmt = argv[pidx + 1]; + if (sfmt == "rgb") + params.fmt = NVJPEG_OUTPUT_RGB; + else if (sfmt == "bgr") + params.fmt = NVJPEG_OUTPUT_BGR; + else if (sfmt == "rgbi") + params.fmt = NVJPEG_OUTPUT_RGBI; + else if (sfmt == "bgri") + params.fmt = NVJPEG_OUTPUT_BGRI; + else if (sfmt == "yuv") + params.fmt = NVJPEG_OUTPUT_YUV; + else if (sfmt == "y") + params.fmt = NVJPEG_OUTPUT_Y; + else if (sfmt == "unchanged") + params.fmt = NVJPEG_OUTPUT_UNCHANGED; + else { + std::cout << "Unknown format: " 
<< sfmt << std::endl; + return EXIT_FAILURE; + } + } + + params.write_decoded = false; + if ((pidx = findParamIndex(argv, argc, "-o")) != -1) { + params.output_dir = argv[pidx + 1]; + if (params.fmt != NVJPEG_OUTPUT_RGB && params.fmt != NVJPEG_OUTPUT_BGR && + params.fmt != NVJPEG_OUTPUT_RGBI && params.fmt != NVJPEG_OUTPUT_BGRI) { + std::cout << "We can write ony BMPs, which require output format be " + "either RGB/BGR or RGBi/BGRi" + << std::endl; + return EXIT_FAILURE; + } + params.write_decoded = true; + } + + cudaDeviceProp props; + checkCudaErrors(cudaGetDeviceProperties(&props, params.dev)); + + printf("Using GPU %d (%s, %d SMs, %d th/SM max, CC %d.%d, ECC %s)\n", + params.dev, props.name, props.multiProcessorCount, + props.maxThreadsPerMultiProcessor, props.major, props.minor, + props.ECCEnabled ? "on" : "off"); + + nvjpegDevAllocator_t dev_allocator = {&dev_malloc, &dev_free}; + checkCudaErrors(nvjpegCreate(NVJPEG_BACKEND_DEFAULT, &dev_allocator, + ¶ms.nvjpeg_handle)); + checkCudaErrors( + nvjpegJpegStateCreate(params.nvjpeg_handle, ¶ms.nvjpeg_state)); + checkCudaErrors( + nvjpegDecodeBatchedInitialize(params.nvjpeg_handle, params.nvjpeg_state, + params.batch_size, 1, params.fmt)); + + // read source images + FileNames image_names; + readInput(params.input_dir, image_names); + + if (params.total_images == -1) { + params.total_images = image_names.size(); + } else if (params.total_images % params.batch_size) { + params.total_images = + ((params.total_images) / params.batch_size) * params.batch_size; + std::cout << "Changing total_images number to " << params.total_images + << " to be multiple of batch_size - " << params.batch_size + << std::endl; + } + + std::cout << "Decoding images in directory: " << params.input_dir + << ", total " << params.total_images << ", batchsize " + << params.batch_size << std::endl; + + double total; + if (process_images(image_names, params, total)) return EXIT_FAILURE; + std::cout << "Total decoding time: " << total << std::endl; 
+ std::cout << "Avg decoding time per image: " << total / params.total_images + << std::endl; + std::cout << "Avg images per sec: " << params.total_images / total + << std::endl; + std::cout << "Avg decoding time per batch: " + << total / ((params.total_images + params.batch_size - 1) / + params.batch_size) + << std::endl; + + checkCudaErrors(nvjpegJpegStateDestroy(params.nvjpeg_state)); + checkCudaErrors(nvjpegDestroy(params.nvjpeg_handle)); + + return EXIT_SUCCESS; +} diff --git a/Samples/nvJPEG/nvJPEG_helper.hxx b/Samples/nvJPEG/nvJPEG_helper.hxx new file mode 100644 index 000000000..9bff952ca --- /dev/null +++ b/Samples/nvJPEG/nvJPEG_helper.hxx @@ -0,0 +1,338 @@ +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// This sample needs at least CUDA 10.0. +// It demonstrates usages of the nvJPEG library + +#ifndef NV_JPEG_EXAMPLE +#define NV_JPEG_EXAMPLE + +#include "cuda_runtime.h" +#include "nvjpeg.h" +#include "helper_cuda.h" +#include "helper_timer.h" + +#include +#include +#include +#include +#include +#include + +#include // strcmpi +#include // timings + +#include // linux dir traverse +#include +#include +#include + +// write bmp, input - RGB, device +int writeBMP(const char *filename, const unsigned char *d_chanR, int pitchR, + const unsigned char *d_chanG, int pitchG, + const unsigned char *d_chanB, int pitchB, int width, int height) { + unsigned int headers[13]; + FILE *outfile; + int extrabytes; + int paddedsize; + int x; + int y; + int n; + int red, green, blue; + + std::vector vchanR(height * width); + std::vector vchanG(height * width); + std::vector vchanB(height * width); + unsigned char *chanR = vchanR.data(); + unsigned char *chanG = vchanG.data(); + unsigned char *chanB = vchanB.data(); + checkCudaErrors(cudaMemcpy2D(chanR, (size_t)width, d_chanR, (size_t)pitchR, + width, height, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy2D(chanG, (size_t)width, d_chanG, (size_t)pitchR, + width, height, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy2D(chanB, (size_t)width, d_chanB, (size_t)pitchR, + width, height, cudaMemcpyDeviceToHost)); + + extrabytes = + 4 - ((width * 3) % 4); // How many bytes of padding to 
add to each + // horizontal line - the size of which must + // be a multiple of 4 bytes. + if (extrabytes == 4) extrabytes = 0; + + paddedsize = ((width * 3) + extrabytes) * height; + + // Headers... + // Note that the "BM" identifier in bytes 0 and 1 is NOT included in these + // "headers". + + headers[0] = paddedsize + 54; // bfSize (whole file size) + headers[1] = 0; // bfReserved (both) + headers[2] = 54; // bfOffbits + headers[3] = 40; // biSize + headers[4] = width; // biWidth + headers[5] = height; // biHeight + + // Would have biPlanes and biBitCount in position 6, but they're shorts. + // It's easier to write them out separately (see below) than pretend + // they're a single int, especially with endian issues... + + headers[7] = 0; // biCompression + headers[8] = paddedsize; // biSizeImage + headers[9] = 0; // biXPelsPerMeter + headers[10] = 0; // biYPelsPerMeter + headers[11] = 0; // biClrUsed + headers[12] = 0; // biClrImportant + + if (!(outfile = fopen(filename, "wb"))) { + std::cerr << "Cannot open file: " << filename << std::endl; + return 1; + } + + // + // Headers begin... + // When printing ints and shorts, we write out 1 character at a time to avoid + // endian issues. + // + fprintf(outfile, "BM"); + + for (n = 0; n <= 5; n++) { + fprintf(outfile, "%c", headers[n] & 0x000000FF); + fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8); + fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16); + fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24); + } + + // These next 4 characters are for the biPlanes and biBitCount fields. 
+ + fprintf(outfile, "%c", 1); + fprintf(outfile, "%c", 0); + fprintf(outfile, "%c", 24); + fprintf(outfile, "%c", 0); + + for (n = 7; n <= 12; n++) { + fprintf(outfile, "%c", headers[n] & 0x000000FF); + fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8); + fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16); + fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24); + } + + // + // Headers done, now write the data... + // + + for (y = height - 1; y >= 0; + y--) // BMP image format is written from bottom to top... + { + for (x = 0; x <= width - 1; x++) { + red = chanR[y * width + x]; + green = chanG[y * width + x]; + blue = chanB[y * width + x]; + + if (red > 255) red = 255; + if (red < 0) red = 0; + if (green > 255) green = 255; + if (green < 0) green = 0; + if (blue > 255) blue = 255; + if (blue < 0) blue = 0; + // Also, it's written in (b,g,r) format... + + fprintf(outfile, "%c", blue); + fprintf(outfile, "%c", green); + fprintf(outfile, "%c", red); + } + if (extrabytes) // See above - BMP lines must be of lengths divisible by 4. + { + for (n = 1; n <= extrabytes; n++) { + fprintf(outfile, "%c", 0); + } + } + } + + fclose(outfile); + return 0; +} + +// write bmp, input - RGB, device +int writeBMPi(const char *filename, const unsigned char *d_RGB, int pitch, + int width, int height) { + unsigned int headers[13]; + FILE *outfile; + int extrabytes; + int paddedsize; + int x; + int y; + int n; + int red, green, blue; + + std::vector vchanRGB(height * width * 3); + unsigned char *chanRGB = vchanRGB.data(); + checkCudaErrors(cudaMemcpy2D(chanRGB, (size_t)width * 3, d_RGB, (size_t)pitch, + width * 3, height, cudaMemcpyDeviceToHost)); + + extrabytes = + 4 - ((width * 3) % 4); // How many bytes of padding to add to each + // horizontal line - the size of which must + // be a multiple of 4 bytes. + if (extrabytes == 4) extrabytes = 0; + + paddedsize = ((width * 3) + extrabytes) * height; + + // Headers... 
+ // Note that the "BM" identifier in bytes 0 and 1 is NOT included in these + // "headers". + headers[0] = paddedsize + 54; // bfSize (whole file size) + headers[1] = 0; // bfReserved (both) + headers[2] = 54; // bfOffbits + headers[3] = 40; // biSize + headers[4] = width; // biWidth + headers[5] = height; // biHeight + + // Would have biPlanes and biBitCount in position 6, but they're shorts. + // It's easier to write them out separately (see below) than pretend + // they're a single int, especially with endian issues... + + headers[7] = 0; // biCompression + headers[8] = paddedsize; // biSizeImage + headers[9] = 0; // biXPelsPerMeter + headers[10] = 0; // biYPelsPerMeter + headers[11] = 0; // biClrUsed + headers[12] = 0; // biClrImportant + + if (!(outfile = fopen(filename, "wb"))) { + std::cerr << "Cannot open file: " << filename << std::endl; + return 1; + } + + // + // Headers begin... + // When printing ints and shorts, we write out 1 character at a time to avoid + // endian issues. + // + + fprintf(outfile, "BM"); + + for (n = 0; n <= 5; n++) { + fprintf(outfile, "%c", headers[n] & 0x000000FF); + fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8); + fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16); + fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24); + } + + // These next 4 characters are for the biPlanes and biBitCount fields. + + fprintf(outfile, "%c", 1); + fprintf(outfile, "%c", 0); + fprintf(outfile, "%c", 24); + fprintf(outfile, "%c", 0); + + for (n = 7; n <= 12; n++) { + fprintf(outfile, "%c", headers[n] & 0x000000FF); + fprintf(outfile, "%c", (headers[n] & 0x0000FF00) >> 8); + fprintf(outfile, "%c", (headers[n] & 0x00FF0000) >> 16); + fprintf(outfile, "%c", (headers[n] & (unsigned int)0xFF000000) >> 24); + } + + // + // Headers done, now write the data... + // + for (y = height - 1; y >= 0; + y--) // BMP image format is written from bottom to top... 
+ { + for (x = 0; x <= width - 1; x++) { + red = chanRGB[(y * width + x) * 3]; + green = chanRGB[(y * width + x) * 3 + 1]; + blue = chanRGB[(y * width + x) * 3 + 2]; + + if (red > 255) red = 255; + if (red < 0) red = 0; + if (green > 255) green = 255; + if (green < 0) green = 0; + if (blue > 255) blue = 255; + if (blue < 0) blue = 0; + // Also, it's written in (b,g,r) format... + + fprintf(outfile, "%c", blue); + fprintf(outfile, "%c", green); + fprintf(outfile, "%c", red); + } + if (extrabytes) // See above - BMP lines must be of lengths divisible by 4. + { + for (n = 1; n <= extrabytes; n++) { + fprintf(outfile, "%c", 0); + } + } + } + + fclose(outfile); + return 0; +} + +int readInput(const std::string &sInputPath, + std::vector &filelist) { + int error_code = 1; + struct stat s; + + if (stat(sInputPath.c_str(), &s) == 0) { + if (s.st_mode & S_IFREG) { + filelist.push_back(sInputPath); + } else if (s.st_mode & S_IFDIR) { + // processing each file in directory + DIR *dir_handle; + struct dirent *dir; + dir_handle = opendir(sInputPath.c_str()); + std::vector filenames; + if (dir_handle) { + error_code = 0; + while ((dir = readdir(dir_handle)) != NULL) { + if (dir->d_type == DT_REG) { + std::string sFileName = sInputPath + dir->d_name; + filelist.push_back(sFileName); + } else if (dir->d_type == DT_DIR) { + std::string sname = dir->d_name; + if (sname != "." 
&& sname != "..") { + readInput(sInputPath + sname + "/", filelist); + } + } + } + closedir(dir_handle); + } else { + std::cout << "Cannot open input directory: " << sInputPath << std::endl; + return error_code; + } + } else { + std::cout << "Cannot open input: " << sInputPath << std::endl; + return error_code; + } + } else { + std::cout << "Cannot find input path " << sInputPath << std::endl; + return error_code; + } + + return 0; +} + +#endif diff --git a/Samples/p2pBandwidthLatencyTest/Makefile b/Samples/p2pBandwidthLatencyTest/Makefile index 07a4c9d53..5492855b7 100644 --- a/Samples/p2pBandwidthLatencyTest/Makefile +++ b/Samples/p2pBandwidthLatencyTest/Makefile @@ -246,7 +246,11 @@ LIBRARIES := ################################################################################ # Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 30 35 37 50 52 60 61 70 72 75 +else SMS ?= 30 35 37 50 52 60 61 70 75 +endif ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/p2pBandwidthLatencyTest/NsightEclipse.xml b/Samples/p2pBandwidthLatencyTest/NsightEclipse.xml index 383a2972f..7ce8df089 100644 --- a/Samples/p2pBandwidthLatencyTest/NsightEclipse.xml +++ b/Samples/p2pBandwidthLatencyTest/NsightEclipse.xml @@ -48,6 +48,7 @@ sm60 sm61 sm70 + sm72 sm75 diff --git a/Samples/p2pBandwidthLatencyTest/README.md b/Samples/p2pBandwidthLatencyTest/README.md index 47f0ca013..b0e56d105 100644 --- a/Samples/p2pBandwidthLatencyTest/README.md +++ b/Samples/p2pBandwidthLatencyTest/README.md @@ -10,7 +10,7 @@ Performance Strategies, Asynchronous Data Transfers, Unified Virtual Address Spa ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 
](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ cudaDeviceCanAccessPeer, cudaDeviceEnablePeerAccess, cudaDeviceDisablePeerAccess ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.vcxproj index c763b1dab..bd0efa07a 100644 --- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.vcxproj +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.vcxproj index 2fa7ab331..5375cc4d4 100644 --- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.vcxproj +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.vcxproj index e6db55686..e46756666 100644 --- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.vcxproj +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj index 7b8ec8f63..412d8f979 100644 --- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj +++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/reduction/Makefile b/Samples/reduction/Makefile new file mode 100644 index 000000000..621426128 --- /dev/null +++ b/Samples/reduction/Makefile @@ -0,0 +1,307 @@ +################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= $(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) 
+endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= 
aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq 
($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release +endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 30 35 37 50 52 60 61 70 72 75 +else +SMS ?= 30 35 37 50 52 60 61 70 75 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM 
:= $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target rules +all: build + +build: reduction + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +reduction.o:reduction.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +reduction_kernel.o:reduction_kernel.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +reduction: reduction.o reduction_kernel.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./reduction + +clean: + rm -f reduction reduction.o reduction_kernel.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/reduction + +clobber: clean diff --git a/Samples/reduction/NsightEclipse.xml b/Samples/reduction/NsightEclipse.xml new file mode 100644 index 000000000..48c36bb77 --- /dev/null +++ b/Samples/reduction/NsightEclipse.xml @@ -0,0 +1,67 @@ + + + + reduction + + whole + + ./ + ../ + ../../common/inc + + + Data-Parallel Algorithms + Performance Strategies + + + CUDA + GPGPU + Parallel Reduction + + + + + + true + reduction.cpp + + 1:CUDA Advanced Topics + 1:Data-Parallel Algorithms + 1:Performance Strategies + + sm30 + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm72 + sm75 + + + x86_64 + linux + + + windows7 + + + x86_64 + macosx + + + arm + + + ppc64le + linux + + + + all + + CUDA Parallel Reduction + exe + diff --git a/Samples/reduction/README.md b/Samples/reduction/README.md new file mode 100644 index 000000000..7b6a7e8c6 --- 
/dev/null +++ b/Samples/reduction/README.md @@ -0,0 +1,91 @@ +# reduction - CUDA Parallel Reduction + +## Description + +A parallel sum reduction that computes the sum of a large arrays of values. This sample demonstrates several important optimization strategies for Data-Parallel Algorithms like reduction. + +## Key Concepts + +Data-Parallel Algorithms, Performance Strategies + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows, MacOSX + +## Supported CPU Architecture + +x86_64, ppc64le, armv7l + +## CUDA APIs involved + +## Prerequisites + +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. +> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details." + +### Linux +The Linux samples are built using makefiles. 
To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +### Mac +The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` + +The samples makefiles can take advantage of certain options: + +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` + +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". + ``` + $ make SMS="A B ..." + ``` + +* **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. + ``` + $ make HOST_COMPILER=clang + ``` + +## References (for more details) + diff --git a/Samples/reduction/reduction.cpp b/Samples/reduction/reduction.cpp new file mode 100644 index 000000000..972ae3611 --- /dev/null +++ b/Samples/reduction/reduction.cpp @@ -0,0 +1,563 @@ +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Parallel reduction + + This sample shows how to perform a reduction operation on an array of values + to produce a single value. + + Reductions are a very common computation in parallel algorithms. Any time + an array of values needs to be reduced to a single value using a binary + associative operator, a reduction can be used. 
Example applications include + statistics computations such as mean and standard deviation, and image + processing applications such as finding the total luminance of an + image. + + This code performs sum reductions, but any associative operator such as + min() or max() could also be used. + + It assumes the input size is a power of 2. + + COMMAND LINE ARGUMENTS + + "--shmoo": Test performance for 1 to 32M elements with each of the 7 + different kernels + "--n=": Specify the number of elements to reduce (default + 1048576) + "--threads=": Specify the number of threads per block (default 128) + "--kernel=": Specify which kernel to run (0-6, default 6) + "--maxblocks=": Specify the maximum number of thread blocks to launch + (kernel 6 only, default 64) + "--cpufinal": Read back the per-block results and do final sum of block + sums on CPU (default false) + "--cputhresh=": The threshold of number of blocks sums below which to + perform a CPU final reduction (default 1) + "-type=": The datatype for the reduction, where T is "int", + "float", or "double" (default int) +*/ + +// CUDA Runtime +#include + +// Utilities and system includes +#include +#include +#include + +// includes, project +#include "reduction.h" + +enum ReduceType { REDUCE_INT, REDUCE_FLOAT, REDUCE_DOUBLE }; + +//////////////////////////////////////////////////////////////////////////////// +// declaration, forward +template +bool runTest(int argc, char **argv, ReduceType datatype); + +#define MAX_BLOCK_DIM_SIZE 65535 + +#ifdef WIN32 +#define strcasecmp strcmpi +#endif + +extern "C" bool isPow2(unsigned int x) { return ((x & (x - 1)) == 0); } + +const char *getReduceTypeString(const ReduceType type) { + switch (type) { + case REDUCE_INT: + return "int"; + case REDUCE_FLOAT: + return "float"; + case REDUCE_DOUBLE: + return "double"; + default: + return "unknown"; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main 
+//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) { + printf("%s Starting...\n\n", argv[0]); + + char *typeInput = 0; + getCmdLineArgumentString(argc, (const char **)argv, "type", &typeInput); + + ReduceType datatype = REDUCE_INT; + + if (0 != typeInput) { + if (!strcasecmp(typeInput, "float")) { + datatype = REDUCE_FLOAT; + } else if (!strcasecmp(typeInput, "double")) { + datatype = REDUCE_DOUBLE; + } else if (strcasecmp(typeInput, "int")) { + printf("Type %s is not recognized. Using default type int.\n\n", + typeInput); + } + } + + cudaDeviceProp deviceProp; + int dev; + + dev = findCudaDevice(argc, (const char **)argv); + + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + + printf("Using Device %d: %s\n\n", dev, deviceProp.name); + checkCudaErrors(cudaSetDevice(dev)); + + printf("Reducing array of type %s\n\n", getReduceTypeString(datatype)); + + bool bResult = false; + + switch (datatype) { + default: + case REDUCE_INT: + bResult = runTest(argc, argv, datatype); + break; + + case REDUCE_FLOAT: + bResult = runTest(argc, argv, datatype); + break; + + case REDUCE_DOUBLE: + bResult = runTest(argc, argv, datatype); + break; + } + + printf(bResult ? "Test passed\n" : "Test failed!\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +//! Compute sum reduction on CPU +//! We use Kahan summation for an accurate sum of large arrays. +//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm +//! +//! @param data pointer to input data +//! 
@param size number of input data elements +//////////////////////////////////////////////////////////////////////////////// +template +T reduceCPU(T *data, int size) { + T sum = data[0]; + T c = (T)0.0; + + for (int i = 1; i < size; i++) { + T y = data[i] - c; + T t = sum + y; + c = (t - sum) - y; + sum = t; + } + + return sum; +} + +unsigned int nextPow2(unsigned int x) { + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return ++x; +} + +#ifndef MIN +#define MIN(x, y) ((x < y) ? x : y) +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Compute the number of threads and blocks to use for the given reduction +// kernel For the kernels >= 3, we set threads / block to the minimum of +// maxThreads and n/2. For kernels < 3, we set to the minimum of maxThreads and +// n. For kernel 6, we observe the maximum specified number of blocks, because +// each thread in that kernel can process a variable number of elements. +//////////////////////////////////////////////////////////////////////////////// +void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, + int maxThreads, int &blocks, int &threads) { + // get device capability, to avoid block/grid size exceed the upper bound + cudaDeviceProp prop; + int device; + checkCudaErrors(cudaGetDevice(&device)); + checkCudaErrors(cudaGetDeviceProperties(&prop, device)); + + if (whichKernel < 3) { + threads = (n < maxThreads) ? nextPow2(n) : maxThreads; + blocks = (n + threads - 1) / threads; + } else { + threads = (n < maxThreads * 2) ? 
nextPow2((n + 1) / 2) : maxThreads; + blocks = (n + (threads * 2 - 1)) / (threads * 2); + } + + if ((float)threads * blocks > + (float)prop.maxGridSize[0] * prop.maxThreadsPerBlock) { + printf("n is too large, please choose a smaller number!\n"); + } + + if (blocks > prop.maxGridSize[0]) { + printf( + "Grid size <%d> exceeds the device capability <%d>, set block size as " + "%d (original %d)\n", + blocks, prop.maxGridSize[0], threads * 2, threads); + + blocks /= 2; + threads *= 2; + } + + if (whichKernel == 6) { + blocks = MIN(maxBlocks, blocks); + } +} + +//////////////////////////////////////////////////////////////////////////////// +// This function performs a reduction of the input data multiple times and +// measures the average reduction time. +//////////////////////////////////////////////////////////////////////////////// +template +T benchmarkReduce(int n, int numThreads, int numBlocks, int maxThreads, + int maxBlocks, int whichKernel, int testIterations, + bool cpuFinalReduction, int cpuFinalThreshold, + StopWatchInterface *timer, T *h_odata, T *d_idata, + T *d_odata) { + T gpu_result = 0; + bool needReadBack = true; + + T *d_intermediateSums; + checkCudaErrors( + cudaMalloc((void **)&d_intermediateSums, sizeof(T) * numBlocks)); + + for (int i = 0; i < testIterations; ++i) { + gpu_result = 0; + + cudaDeviceSynchronize(); + sdkStartTimer(&timer); + + // execute the kernel + reduce(n, numThreads, numBlocks, whichKernel, d_idata, d_odata); + + // check if kernel execution generated an error + getLastCudaError("Kernel execution failed"); + + if (cpuFinalReduction) { + // sum partial sums from each block on CPU + // copy result from device to host + checkCudaErrors(cudaMemcpy(h_odata, d_odata, numBlocks * sizeof(T), + cudaMemcpyDeviceToHost)); + + for (int i = 0; i < numBlocks; i++) { + gpu_result += h_odata[i]; + } + + needReadBack = false; + } else { + // sum partial block sums on GPU + int s = numBlocks; + int kernel = whichKernel; + + while (s > 
cpuFinalThreshold) { + int threads = 0, blocks = 0; + getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, + threads); + checkCudaErrors(cudaMemcpy(d_intermediateSums, d_odata, s * sizeof(T), + cudaMemcpyDeviceToDevice)); + reduce(s, threads, blocks, kernel, d_intermediateSums, d_odata); + + if (kernel < 3) { + s = (s + threads - 1) / threads; + } else { + s = (s + (threads * 2 - 1)) / (threads * 2); + } + } + + if (s > 1) { + // copy result from device to host + checkCudaErrors(cudaMemcpy(h_odata, d_odata, s * sizeof(T), + cudaMemcpyDeviceToHost)); + + for (int i = 0; i < s; i++) { + gpu_result += h_odata[i]; + } + + needReadBack = false; + } + } + + cudaDeviceSynchronize(); + sdkStopTimer(&timer); + } + + if (needReadBack) { + // copy final sum from device to host + checkCudaErrors( + cudaMemcpy(&gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost)); + } + checkCudaErrors(cudaFree(d_intermediateSums)); + return gpu_result; +} + +//////////////////////////////////////////////////////////////////////////////// +// This function calls benchmarkReduce multiple times for a range of array sizes +// and prints a report in CSV (comma-separated value) format that can be used +// for generating a "shmoo" plot showing the performance for each kernel +// variation over a wide range of input sizes. 
+//////////////////////////////////////////////////////////////////////////////// +template +void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, + ReduceType datatype) { + // create random input data on CPU + unsigned int bytes = maxN * sizeof(T); + + T *h_idata = (T *)malloc(bytes); + + for (int i = 0; i < maxN; i++) { + // Keep the numbers small so we don't get truncation error in the sum + if (datatype == REDUCE_INT) { + h_idata[i] = (T)(rand() & 0xFF); + } else { + h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; + } + } + + int maxNumBlocks = MIN(maxN / maxThreads, MAX_BLOCK_DIM_SIZE); + + // allocate mem for the result on host side + T *h_odata = (T *)malloc(maxNumBlocks * sizeof(T)); + + // allocate device memory and data + T *d_idata = NULL; + T *d_odata = NULL; + + checkCudaErrors(cudaMalloc((void **)&d_idata, bytes)); + checkCudaErrors(cudaMalloc((void **)&d_odata, maxNumBlocks * sizeof(T))); + + // copy data directly to device memory + checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_odata, h_idata, maxNumBlocks * sizeof(T), + cudaMemcpyHostToDevice)); + + // warm-up + for (int kernel = 0; kernel < 7; kernel++) { + reduce(maxN, maxThreads, maxNumBlocks, kernel, d_idata, d_odata); + } + + int testIterations = 100; + + StopWatchInterface *timer = 0; + sdkCreateTimer(&timer); + + // print headers + printf( + "Time in milliseconds for various numbers of elements for each " + "kernel\n\n\n"); + printf("Kernel"); + + for (int i = minN; i <= maxN; i *= 2) { + printf(", %d", i); + } + + for (int kernel = 0; kernel < 7; kernel++) { + printf("\n%d", kernel); + + for (int i = minN; i <= maxN; i *= 2) { + sdkResetTimer(&timer); + int numBlocks = 0; + int numThreads = 0; + getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, + numThreads); + + float reduceTime; + + if (numBlocks <= MAX_BLOCK_DIM_SIZE) { + benchmarkReduce(i, numThreads, numBlocks, maxThreads, maxBlocks, kernel, + 
testIterations, false, 1, timer, h_odata, d_idata, + d_odata); + reduceTime = sdkGetAverageTimerValue(&timer); + } else { + reduceTime = -1.0; + } + + printf(", %.5f", reduceTime); + } + } + + // cleanup + sdkDeleteTimer(&timer); + free(h_idata); + free(h_odata); + + checkCudaErrors(cudaFree(d_idata)); + checkCudaErrors(cudaFree(d_odata)); +} + +//////////////////////////////////////////////////////////////////////////////// +// The main function which runs the reduction test. +//////////////////////////////////////////////////////////////////////////////// +template +bool runTest(int argc, char **argv, ReduceType datatype) { + int size = 1 << 24; // number of elements to reduce + int maxThreads = 256; // number of threads per block + int whichKernel = 6; + int maxBlocks = 64; + bool cpuFinalReduction = false; + int cpuFinalThreshold = 1; + + if (checkCmdLineFlag(argc, (const char **)argv, "n")) { + size = getCmdLineArgumentInt(argc, (const char **)argv, "n"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { + maxThreads = getCmdLineArgumentInt(argc, (const char **)argv, "threads"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "kernel")) { + whichKernel = getCmdLineArgumentInt(argc, (const char **)argv, "kernel"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "maxblocks")) { + maxBlocks = getCmdLineArgumentInt(argc, (const char **)argv, "maxblocks"); + } + + printf("%d elements\n", size); + printf("%d threads (max)\n", maxThreads); + + cpuFinalReduction = checkCmdLineFlag(argc, (const char **)argv, "cpufinal"); + + if (checkCmdLineFlag(argc, (const char **)argv, "cputhresh")) { + cpuFinalThreshold = + getCmdLineArgumentInt(argc, (const char **)argv, "cputhresh"); + } + + bool runShmoo = checkCmdLineFlag(argc, (const char **)argv, "shmoo"); + + if (runShmoo) { + shmoo(1, 33554432, maxThreads, maxBlocks, datatype); + } else { + // create random input data on CPU + unsigned int bytes = size * sizeof(T); + + T *h_idata = (T 
*)malloc(bytes); + + for (int i = 0; i < size; i++) { + // Keep the numbers small so we don't get truncation error in the sum + if (datatype == REDUCE_INT) { + h_idata[i] = (T)(rand() & 0xFF); + } else { + h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; + } + } + + int numBlocks = 0; + int numThreads = 0; + getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, + numThreads); + + if (numBlocks == 1) { + cpuFinalThreshold = 1; + } + + // allocate mem for the result on host side + T *h_odata = (T *)malloc(numBlocks * sizeof(T)); + + printf("%d blocks\n\n", numBlocks); + + // allocate device memory and data + T *d_idata = NULL; + T *d_odata = NULL; + + checkCudaErrors(cudaMalloc((void **)&d_idata, bytes)); + checkCudaErrors(cudaMalloc((void **)&d_odata, numBlocks * sizeof(T))); + + // copy data directly to device memory + checkCudaErrors( + cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks * sizeof(T), + cudaMemcpyHostToDevice)); + + // warm-up + reduce(size, numThreads, numBlocks, whichKernel, d_idata, d_odata); + + int testIterations = 100; + + StopWatchInterface *timer = 0; + sdkCreateTimer(&timer); + + T gpu_result = 0; + + gpu_result = + benchmarkReduce(size, numThreads, numBlocks, maxThreads, maxBlocks, + whichKernel, testIterations, cpuFinalReduction, + cpuFinalThreshold, timer, h_odata, d_idata, d_odata); + + double reduceTime = sdkGetAverageTimerValue(&timer) * 1e-3; + printf( + "Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, " + "NumDevsUsed = %d, Workgroup = %u\n", + 1.0e-9 * ((double)bytes) / reduceTime, reduceTime, size, 1, numThreads); + + // compute reference solution + T cpu_result = reduceCPU(h_idata, size); + + int precision = 0; + double threshold = 0; + double diff = 0; + + if (datatype == REDUCE_INT) { + printf("\nGPU result = %d\n", (int)gpu_result); + printf("CPU result = %d\n\n", (int)cpu_result); + } else { + if (datatype == 
REDUCE_FLOAT) { + precision = 8; + threshold = 1e-8 * size; + } else { + precision = 12; + threshold = 1e-12 * size; + } + + printf("\nGPU result = %.*f\n", precision, (double)gpu_result); + printf("CPU result = %.*f\n\n", precision, (double)cpu_result); + + diff = fabs((double)gpu_result - (double)cpu_result); + } + + // cleanup + sdkDeleteTimer(&timer); + free(h_idata); + free(h_odata); + + checkCudaErrors(cudaFree(d_idata)); + checkCudaErrors(cudaFree(d_odata)); + + if (datatype == REDUCE_INT) { + return (gpu_result == cpu_result); + } else { + return (diff < threshold); + } + } + + return true; +} diff --git a/Samples/reduction/reduction.h b/Samples/reduction/reduction.h new file mode 100644 index 000000000..9727bd58a --- /dev/null +++ b/Samples/reduction/reduction.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef __REDUCTION_H__ +#define __REDUCTION_H__ + +template +void reduce(int size, int threads, int blocks, + int whichKernel, T *d_idata, T *d_odata); + +#endif diff --git a/Samples/reduction/reduction_kernel.cu b/Samples/reduction/reduction_kernel.cu new file mode 100644 index 000000000..0aaee929e --- /dev/null +++ b/Samples/reduction/reduction_kernel.cu @@ -0,0 +1,666 @@ +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Parallel reduction kernels +*/ + +#ifndef _REDUCE_KERNEL_H_ +#define _REDUCE_KERNEL_H_ + +#include +#include + +namespace cg = cooperative_groups; + +// Utility class used to avoid linker errors with extern +// unsized shared memory arrays with templated type +template +struct SharedMemory { + __device__ inline operator T *() { + extern __shared__ int __smem[]; + return (T *)__smem; + } + + __device__ inline operator const T *() const { + extern __shared__ int __smem[]; + return (T *)__smem; + } +}; + +// specialize for double to avoid unaligned memory +// access compile errors +template <> +struct SharedMemory { + __device__ inline operator double *() { + extern __shared__ double __smem_d[]; + return (double *)__smem_d; + } + + __device__ inline operator const double *() const { + extern __shared__ double __smem_d[]; + return (double *)__smem_d; + } +}; + +/* + Parallel sum reduction using shared memory + - takes log(n) steps for n input elements + - uses n threads + - only works for power-of-2 arrays +*/ + +/* This reduction interleaves which threads are active by using the modulo + operator. 
This operator is very expensive on GPUs, and the interleaved + inactivity means that no whole warps are active, which is also very + inefficient */ +template +__global__ void reduce0(T *g_idata, T *g_odata, unsigned int n) { + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + T *sdata = SharedMemory(); + + // load shared mem + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + + sdata[tid] = (i < n) ? g_idata[i] : 0; + + cg::sync(cta); + + // do reduction in shared mem + for (unsigned int s = 1; s < blockDim.x; s *= 2) { + // modulo arithmetic is slow! + if ((tid % (2 * s)) == 0) { + sdata[tid] += sdata[tid + s]; + } + + cg::sync(cta); + } + + // write result for this block to global mem + if (tid == 0) g_odata[blockIdx.x] = sdata[0]; +} + +/* This version uses contiguous threads, but its interleaved + addressing results in many shared memory bank conflicts. +*/ +template +__global__ void reduce1(T *g_idata, T *g_odata, unsigned int n) { + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + T *sdata = SharedMemory(); + + // load shared mem + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + + sdata[tid] = (i < n) ? g_idata[i] : 0; + + cg::sync(cta); + + // do reduction in shared mem + for (unsigned int s = 1; s < blockDim.x; s *= 2) { + int index = 2 * s * tid; + + if (index < blockDim.x) { + sdata[index] += sdata[index + s]; + } + + cg::sync(cta); + } + + // write result for this block to global mem + if (tid == 0) g_odata[blockIdx.x] = sdata[0]; +} + +/* + This version uses sequential addressing -- no divergence or bank conflicts. 
+*/ +template +__global__ void reduce2(T *g_idata, T *g_odata, unsigned int n) { + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + T *sdata = SharedMemory(); + + // load shared mem + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + + sdata[tid] = (i < n) ? g_idata[i] : 0; + + cg::sync(cta); + + // do reduction in shared mem + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + sdata[tid] += sdata[tid + s]; + } + + cg::sync(cta); + } + + // write result for this block to global mem + if (tid == 0) g_odata[blockIdx.x] = sdata[0]; +} + +/* + This version uses n/2 threads -- + it performs the first level of reduction when reading from global memory. +*/ +template +__global__ void reduce3(T *g_idata, T *g_odata, unsigned int n) { + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + T *sdata = SharedMemory(); + + // perform first level of reduction, + // reading from global memory, writing to shared memory + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; + + T mySum = (i < n) ? g_idata[i] : 0; + + if (i + blockDim.x < n) mySum += g_idata[i + blockDim.x]; + + sdata[tid] = mySum; + cg::sync(cta); + + // do reduction in shared mem + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + sdata[tid] = mySum = mySum + sdata[tid + s]; + } + + cg::sync(cta); + } + + // write result for this block to global mem + if (tid == 0) g_odata[blockIdx.x] = mySum; +} + +/* + This version uses the warp shuffle operation if available to reduce + warp synchronization. When shuffle is not available the final warp's + worth of work is unrolled to reduce looping overhead. + + See + http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ + for additional information about using shuffle to perform a reduction + within a warp. 
+ + Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory. + In other words if blockSize <= 32, allocate 64*sizeof(T) bytes. + If blockSize > 32, allocate blockSize*sizeof(T) bytes. +*/ +template +__global__ void reduce4(T *g_idata, T *g_odata, unsigned int n) { + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + T *sdata = SharedMemory(); + + // perform first level of reduction, + // reading from global memory, writing to shared memory + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; + + T mySum = (i < n) ? g_idata[i] : 0; + + if (i + blockSize < n) mySum += g_idata[i + blockSize]; + + sdata[tid] = mySum; + cg::sync(cta); + + // do reduction in shared mem + for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) { + if (tid < s) { + sdata[tid] = mySum = mySum + sdata[tid + s]; + } + + cg::sync(cta); + } + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + if (cta.thread_rank() < 32) { + // Fetch final intermediate sum from 2nd warp + if (blockSize >= 64) mySum += sdata[tid + 32]; + // Reduce final warp using shuffle + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + mySum += tile32.shfl_down(mySum, offset); + } + } + + // write result for this block to global mem + if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum; +} + +/* + This version is completely unrolled, unless warp shuffle is available, then + shuffle is used within a loop. It uses a template parameter to achieve + optimal code for any (power of 2) number of threads. This requires a switch + statement in the host code to handle all the different thread block sizes at + compile time. When shuffle is available, it is used to reduce warp + synchronization. + + Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory. + In other words if blockSize <= 32, allocate 64*sizeof(T) bytes. + If blockSize > 32, allocate blockSize*sizeof(T) bytes. 
+*/ +template +__global__ void reduce5(T *g_idata, T *g_odata, unsigned int n) { + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + T *sdata = SharedMemory(); + + // perform first level of reduction, + // reading from global memory, writing to shared memory + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x * (blockSize * 2) + threadIdx.x; + + T mySum = (i < n) ? g_idata[i] : 0; + + if (i + blockSize < n) mySum += g_idata[i + blockSize]; + + sdata[tid] = mySum; + cg::sync(cta); + + // do reduction in shared mem + if ((blockSize >= 512) && (tid < 256)) { + sdata[tid] = mySum = mySum + sdata[tid + 256]; + } + + cg::sync(cta); + + if ((blockSize >= 256) && (tid < 128)) { + sdata[tid] = mySum = mySum + sdata[tid + 128]; + } + + cg::sync(cta); + + if ((blockSize >= 128) && (tid < 64)) { + sdata[tid] = mySum = mySum + sdata[tid + 64]; + } + + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + if (cta.thread_rank() < 32) { + // Fetch final intermediate sum from 2nd warp + if (blockSize >= 64) mySum += sdata[tid + 32]; + // Reduce final warp using shuffle + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + mySum += tile32.shfl_down(mySum, offset); + } + } + + // write result for this block to global mem + if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum; +} + +/* + This version adds multiple elements per thread sequentially. This reduces + the overall cost of the algorithm while keeping the work complexity O(n) and + the step complexity O(log n). (Brent's Theorem optimization) + + Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory. + In other words if blockSize <= 32, allocate 64*sizeof(T) bytes. + If blockSize > 32, allocate blockSize*sizeof(T) bytes. 
+*/ +template +__global__ void reduce6(T *g_idata, T *g_odata, unsigned int n) { + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + T *sdata = SharedMemory(); + + // perform first level of reduction, + // reading from global memory, writing to shared memory + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x; + unsigned int gridSize = blockSize * 2 * gridDim.x; + + T mySum = 0; + + // we reduce multiple elements per thread. The number is determined by the + // number of active thread blocks (via gridDim). More blocks will result + // in a larger gridSize and therefore fewer elements per thread + while (i < n) { + mySum += g_idata[i]; + + // ensure we don't read out of bounds -- this is optimized away for powerOf2 + // sized arrays + if (nIsPow2 || i + blockSize < n) mySum += g_idata[i + blockSize]; + + i += gridSize; + } + + // each thread puts its local sum into shared memory + sdata[tid] = mySum; + cg::sync(cta); + + // do reduction in shared mem + if ((blockSize >= 512) && (tid < 256)) { + sdata[tid] = mySum = mySum + sdata[tid + 256]; + } + + cg::sync(cta); + + if ((blockSize >= 256) && (tid < 128)) { + sdata[tid] = mySum = mySum + sdata[tid + 128]; + } + + cg::sync(cta); + + if ((blockSize >= 128) && (tid < 64)) { + sdata[tid] = mySum = mySum + sdata[tid + 64]; + } + + cg::sync(cta); + + cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta); + + if (cta.thread_rank() < 32) { + // Fetch final intermediate sum from 2nd warp + if (blockSize >= 64) mySum += sdata[tid + 32]; + // Reduce final warp using shuffle + for (int offset = tile32.size() / 2; offset > 0; offset /= 2) { + mySum += tile32.shfl_down(mySum, offset); + } + } + + // write result for this block to global mem + if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum; +} + +extern "C" bool isPow2(unsigned int x); + +//////////////////////////////////////////////////////////////////////////////// +// Wrapper 
function for kernel launch +//////////////////////////////////////////////////////////////////////////////// +template +void reduce(int size, int threads, int blocks, int whichKernel, T *d_idata, + T *d_odata) { + dim3 dimBlock(threads, 1, 1); + dim3 dimGrid(blocks, 1, 1); + + // when there is only one warp per block, we need to allocate two warps + // worth of shared memory so that we don't index shared memory out of bounds + int smemSize = + (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T); + + // choose which of the optimized versions of reduction to launch + switch (whichKernel) { + case 0: + reduce0<<>>(d_idata, d_odata, size); + break; + + case 1: + reduce1<<>>(d_idata, d_odata, size); + break; + + case 2: + reduce2<<>>(d_idata, d_odata, size); + break; + + case 3: + reduce3<<>>(d_idata, d_odata, size); + break; + + case 4: + switch (threads) { + case 512: + reduce4 + <<>>(d_idata, d_odata, size); + break; + + case 256: + reduce4 + <<>>(d_idata, d_odata, size); + break; + + case 128: + reduce4 + <<>>(d_idata, d_odata, size); + break; + + case 64: + reduce4 + <<>>(d_idata, d_odata, size); + break; + + case 32: + reduce4 + <<>>(d_idata, d_odata, size); + break; + + case 16: + reduce4 + <<>>(d_idata, d_odata, size); + break; + + case 8: + reduce4 + <<>>(d_idata, d_odata, size); + break; + + case 4: + reduce4 + <<>>(d_idata, d_odata, size); + break; + + case 2: + reduce4 + <<>>(d_idata, d_odata, size); + break; + + case 1: + reduce4 + <<>>(d_idata, d_odata, size); + break; + } + + break; + + case 5: + switch (threads) { + case 512: + reduce5 + <<>>(d_idata, d_odata, size); + break; + + case 256: + reduce5 + <<>>(d_idata, d_odata, size); + break; + + case 128: + reduce5 + <<>>(d_idata, d_odata, size); + break; + + case 64: + reduce5 + <<>>(d_idata, d_odata, size); + break; + + case 32: + reduce5 + <<>>(d_idata, d_odata, size); + break; + + case 16: + reduce5 + <<>>(d_idata, d_odata, size); + break; + + case 8: + reduce5 + <<>>(d_idata, d_odata, 
size); + break; + + case 4: + reduce5 + <<>>(d_idata, d_odata, size); + break; + + case 2: + reduce5 + <<>>(d_idata, d_odata, size); + break; + + case 1: + reduce5 + <<>>(d_idata, d_odata, size); + break; + } + + break; + + case 6: + default: + if (isPow2(size)) { + switch (threads) { + case 512: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 256: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 128: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 64: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 32: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 16: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 8: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 4: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 2: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 1: + reduce6 + <<>>(d_idata, d_odata, size); + break; + } + } else { + switch (threads) { + case 512: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 256: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 128: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 64: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 32: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 16: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 8: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 4: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 2: + reduce6 + <<>>(d_idata, d_odata, size); + break; + + case 1: + reduce6 + <<>>(d_idata, d_odata, size); + break; + } + } + + break; + } +} + +// Instantiate the reduction function for 3 types +template void reduce(int size, int threads, int blocks, int whichKernel, + int *d_idata, int *d_odata); + +template void reduce(int size, int threads, int blocks, int whichKernel, + float *d_idata, float *d_odata); + +template void reduce(int size, int threads, int blocks, int whichKernel, + double *d_idata, double 
*d_odata); + +#endif // #ifndef _REDUCE_KERNEL_H_ diff --git a/Samples/reduction/reduction_vs2012.sln b/Samples/reduction/reduction_vs2012.sln new file mode 100644 index 000000000..dbd1bfc4b --- /dev/null +++ b/Samples/reduction/reduction_vs2012.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "reduction", "reduction_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/reduction/reduction_vs2012.vcxproj b/Samples/reduction/reduction_vs2012.vcxproj new file mode 100644 index 000000000..9cc806f7e --- /dev/null +++ b/Samples/reduction/reduction_vs2012.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + reduction_vs2012 + reduction + + + + + Application + MultiByte + v110 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + 
cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/reduction.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + diff --git a/Samples/reduction/reduction_vs2013.sln b/Samples/reduction/reduction_vs2013.sln new file mode 100644 index 000000000..a04f9b383 --- /dev/null +++ b/Samples/reduction/reduction_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "reduction", "reduction_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/reduction/reduction_vs2013.vcxproj b/Samples/reduction/reduction_vs2013.vcxproj new file mode 100644 index 000000000..fc6ebd5ad --- /dev/null +++ b/Samples/reduction/reduction_vs2013.vcxproj @@ -0,0 +1,108 @@ + + + + 
$(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + reduction_vs2013 + reduction + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/reduction.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + diff --git a/Samples/reduction/reduction_vs2015.sln b/Samples/reduction/reduction_vs2015.sln new file mode 100644 index 000000000..528951f46 --- /dev/null +++ b/Samples/reduction/reduction_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "reduction", "reduction_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + 
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/reduction/reduction_vs2015.vcxproj b/Samples/reduction/reduction_vs2015.vcxproj new file mode 100644 index 000000000..72804717a --- /dev/null +++ b/Samples/reduction/reduction_vs2015.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + reduction_vs2015 + reduction + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/reduction.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + diff --git a/Samples/reduction/reduction_vs2017.sln b/Samples/reduction/reduction_vs2017.sln new file mode 100644 index 000000000..4dc8b9429 --- /dev/null +++ b/Samples/reduction/reduction_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "reduction", "reduction_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 
+EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/reduction/reduction_vs2017.vcxproj b/Samples/reduction/reduction_vs2017.vcxproj new file mode 100644 index 000000000..4f72d0b07 --- /dev/null +++ b/Samples/reduction/reduction_vs2017.vcxproj @@ -0,0 +1,109 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + reduction_vs2017 + reduction + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/reduction.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + 
+ + + + + + diff --git a/Samples/shfl_scan/Makefile b/Samples/shfl_scan/Makefile index fcb778ce6..6e5df239a 100644 --- a/Samples/shfl_scan/Makefile +++ b/Samples/shfl_scan/Makefile @@ -246,7 +246,11 @@ LIBRARIES := ################################################################################ # Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 30 35 37 50 52 60 61 70 72 75 +else SMS ?= 30 35 37 50 52 60 61 70 75 +endif ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/shfl_scan/NsightEclipse.xml b/Samples/shfl_scan/NsightEclipse.xml index f7e1801d7..4899f9797 100644 --- a/Samples/shfl_scan/NsightEclipse.xml +++ b/Samples/shfl_scan/NsightEclipse.xml @@ -42,6 +42,7 @@ sm60 sm61 sm70 + sm72 sm75 diff --git a/Samples/shfl_scan/README.md b/Samples/shfl_scan/README.md index f2726ddb1..b70d5531a 100644 --- a/Samples/shfl_scan/README.md +++ b/Samples/shfl_scan/README.md @@ -10,7 +10,7 @@ Data-Parallel Algorithms, Performance Strategies ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 
](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -24,7 +24,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/shfl_scan/shfl_scan_vs2012.vcxproj b/Samples/shfl_scan/shfl_scan_vs2012.vcxproj index ffb126f99..f96b4c239 100644 --- a/Samples/shfl_scan/shfl_scan_vs2012.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/shfl_scan/shfl_scan_vs2013.vcxproj b/Samples/shfl_scan/shfl_scan_vs2013.vcxproj index 08f212cb8..da2894823 100644 --- a/Samples/shfl_scan/shfl_scan_vs2013.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/shfl_scan/shfl_scan_vs2015.vcxproj b/Samples/shfl_scan/shfl_scan_vs2015.vcxproj index 0debbb65b..22742a7d7 100644 --- a/Samples/shfl_scan/shfl_scan_vs2015.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/shfl_scan/shfl_scan_vs2017.vcxproj b/Samples/shfl_scan/shfl_scan_vs2017.vcxproj index 935732987..ddde4c99a 100644 --- a/Samples/shfl_scan/shfl_scan_vs2017.vcxproj +++ b/Samples/shfl_scan/shfl_scan_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -104,6 +104,6 @@ - + diff --git a/Samples/simpleCUBLAS/NsightEclipse.xml b/Samples/simpleCUBLAS/NsightEclipse.xml index 06b96ee09..05c2a4fd2 100644 --- a/Samples/simpleCUBLAS/NsightEclipse.xml +++ b/Samples/simpleCUBLAS/NsightEclipse.xml @@ -41,6 +41,7 @@ sm60 sm61 sm70 + sm72 sm75 diff --git a/Samples/simpleCUBLAS/README.md b/Samples/simpleCUBLAS/README.md index 7acb91825..1dd206ac0 100644 --- a/Samples/simpleCUBLAS/README.md +++ b/Samples/simpleCUBLAS/README.md @@ -10,7 +10,7 @@ Image Processing, CUBLAS 
Library ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.vcxproj index 5b9f16016..36437c3ef 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.vcxproj index ba1a17c45..072145b89 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.vcxproj index 24a93d516..23992ba82 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj b/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj index d2beef3d5..b20a411e4 100644 --- a/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj +++ b/Samples/simpleCUBLAS/simpleCUBLAS_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/simpleCUBLASXT/NsightEclipse.xml b/Samples/simpleCUBLASXT/NsightEclipse.xml index 37cd88053..10f2775d3 100644 --- a/Samples/simpleCUBLASXT/NsightEclipse.xml +++ b/Samples/simpleCUBLASXT/NsightEclipse.xml @@ -40,6 +40,7 @@ sm60 sm61 sm70 + sm72 sm75 diff --git a/Samples/simpleCUBLASXT/README.md b/Samples/simpleCUBLASXT/README.md index 63260e6d6..59c2f4e47 100644 --- a/Samples/simpleCUBLASXT/README.md +++ b/Samples/simpleCUBLASXT/README.md @@ -10,7 +10,7 @@ CUBLAS-XT Library ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 
](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ x86_64, ppc64le, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2012.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2012.vcxproj index 60731c2af..fa170b48a 100644 --- a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2012.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2013.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2013.vcxproj index 861f66923..d46d0908a 100644 --- a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2013.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2015.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2015.vcxproj index 747e9a4bf..f1da4ca24 100644 --- a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2015.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj index 07c44de17..a33be8d22 100644 --- a/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj +++ b/Samples/simpleCUBLASXT/simpleCUBLASXT_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/simpleCUFFT/Makefile b/Samples/simpleCUFFT/Makefile index f2a261a9d..c235dab3e 100644 --- a/Samples/simpleCUFFT/Makefile +++ b/Samples/simpleCUFFT/Makefile @@ -246,7 +246,11 @@ LIBRARIES := ################################################################################ # Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 30 35 37 50 52 60 61 70 72 75 +else SMS ?= 30 35 37 50 52 60 61 70 75 +endif ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/simpleCUFFT/NsightEclipse.xml b/Samples/simpleCUFFT/NsightEclipse.xml index 27a3e9ffd..1d8ac3b1c 100644 --- 
a/Samples/simpleCUFFT/NsightEclipse.xml +++ b/Samples/simpleCUFFT/NsightEclipse.xml @@ -39,6 +39,7 @@ sm60 sm61 sm70 + sm72 sm75 diff --git a/Samples/simpleCUFFT/README.md b/Samples/simpleCUFFT/README.md index 95724675b..d128956aa 100644 --- a/Samples/simpleCUFFT/README.md +++ b/Samples/simpleCUFFT/README.md @@ -10,7 +10,7 @@ Image Processing, CUFFT Library ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2012.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2012.vcxproj index a29e47cfa..8eb2145f2 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2012.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2013.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2013.vcxproj index cb78fb43d..fc95fd17b 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2013.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2015.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2015.vcxproj index 4d4ef344b..bc54d35da 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2015.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj b/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj index 1f5b2dfc7..c2e1fe040 100644 --- a/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj +++ b/Samples/simpleCUFFT/simpleCUFFT_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/simpleCudaGraphs/Makefile b/Samples/simpleCudaGraphs/Makefile index 9341732af..5a76d9242 100644 --- a/Samples/simpleCudaGraphs/Makefile +++ b/Samples/simpleCudaGraphs/Makefile @@ -246,7 +246,11 @@ LIBRARIES := ################################################################################ # Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 30 35 37 50 52 60 61 70 72 75 +else SMS ?= 30 35 37 50 52 60 61 70 75 +endif ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/simpleCudaGraphs/NsightEclipse.xml b/Samples/simpleCudaGraphs/NsightEclipse.xml index 8c4e98703..9137ab803 100644 --- a/Samples/simpleCudaGraphs/NsightEclipse.xml +++ 
b/Samples/simpleCudaGraphs/NsightEclipse.xml @@ -49,6 +49,7 @@ sm60 sm61 sm70 + sm72 sm75 diff --git a/Samples/simpleCudaGraphs/README.md b/Samples/simpleCudaGraphs/README.md index a4a226b48..8fbbc65bc 100644 --- a/Samples/simpleCudaGraphs/README.md +++ b/Samples/simpleCudaGraphs/README.md @@ -10,7 +10,7 @@ CUDA Graphs, Stream Capture ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ cudaStreamBeginCapture, cudaStreamEndCapture, cudaLaunchHostFunc, cudaGraphCreat ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs.cu b/Samples/simpleCudaGraphs/simpleCudaGraphs.cu index 6db123f80..af6b4eb62 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs.cu +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs.cu @@ -302,7 +302,7 @@ void cudaGraphsUsingStreamCapture(float *inputVec_h, float *inputVec_d, checkCudaErrors(cudaStreamCreate(&streamForGraph)); checkCudaErrors(cudaEventCreate(&reduceKernelEvent)); - checkCudaErrors(cudaStreamBeginCapture(stream1)); + checkCudaErrors(cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal)); checkCudaErrors(cudaMemcpyAsync(inputVec_d, inputVec_h, sizeof(float) * inputSize, cudaMemcpyDefault, @@ -396,4 +396,4 @@ int main(int argc, char **argv) { checkCudaErrors(cudaFree(outputVec_d)); checkCudaErrors(cudaFree(result_d)); return EXIT_SUCCESS; -} \ No newline at end of file +} diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2012.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2012.vcxproj index ec603e11e..50897567d 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2012.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2013.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2013.vcxproj index def552f57..a72646dae 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2013.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2015.vcxproj b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2015.vcxproj index d36327423..135022ec1 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2015.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj 
b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj index 57c7e22a2..12042576a 100644 --- a/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj +++ b/Samples/simpleCudaGraphs/simpleCudaGraphs_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/simpleD3D12/NsightEclipse.xml b/Samples/simpleD3D12/NsightEclipse.xml index 3fdff74f3..902723ba3 100644 --- a/Samples/simpleD3D12/NsightEclipse.xml +++ b/Samples/simpleD3D12/NsightEclipse.xml @@ -48,6 +48,7 @@ sm60 sm61 sm70 + sm72 sm75 diff --git a/Samples/simpleD3D12/README.md b/Samples/simpleD3D12/README.md index 6e236a72f..52cc68036 100644 --- a/Samples/simpleD3D12/README.md +++ b/Samples/simpleD3D12/README.md @@ -10,7 +10,7 @@ Graphics Interop, CUDA DX12 Interop, Image Processing ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cudaWaitExternalSemaphoresAsync, cudaSignalExternalSemaphoresAsync, cudaImportEx ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your 
corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/simpleD3D12/simpleD3D12_vs2015.vcxproj b/Samples/simpleD3D12/simpleD3D12_vs2015.vcxproj index 522781dee..f90745454 100644 --- a/Samples/simpleD3D12/simpleD3D12_vs2015.vcxproj +++ b/Samples/simpleD3D12/simpleD3D12_vs2015.vcxproj @@ -38,7 +38,7 @@ - + @@ -119,6 +119,6 @@ - + diff --git a/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj b/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj index 94e39cbe9..64366956b 100644 --- a/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj +++ b/Samples/simpleD3D12/simpleD3D12_vs2017.vcxproj @@ -39,7 +39,7 @@ - + @@ -120,6 +120,6 @@ - + diff --git a/Samples/simpleIPC/Makefile b/Samples/simpleIPC/Makefile new file mode 100644 index 000000000..12396b36a --- /dev/null +++ b/Samples/simpleIPC/Makefile @@ -0,0 +1,325 @@ +################################################################################ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################ +# +# Makefile project only supported on Mac OS X and Linux Platforms) +# +################################################################################ + +# Location of the CUDA Toolkit +CUDA_PATH ?= /usr/local/cuda + +############################## +# start deprecated interface # +############################## +ifeq ($(x86_64),1) + $(info WARNING - x86_64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=x86_64 instead) + TARGET_ARCH ?= x86_64 +endif +ifeq ($(ARMv7),1) + $(info WARNING - ARMv7 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=armv7l instead) + TARGET_ARCH ?= armv7l +endif +ifeq ($(aarch64),1) + $(info WARNING - aarch64 variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=aarch64 instead) + TARGET_ARCH ?= aarch64 +endif +ifeq ($(ppc64le),1) + $(info WARNING - ppc64le variable has been deprecated) + $(info WARNING - please use TARGET_ARCH=ppc64le instead) + TARGET_ARCH ?= ppc64le +endif +ifneq ($(GCC),) + $(info WARNING - GCC variable has been deprecated) + $(info WARNING - please use HOST_COMPILER=$(GCC) instead) + HOST_COMPILER ?= 
$(GCC) +endif +ifneq ($(abi),) + $(error ERROR - abi variable has been removed) +endif +############################ +# end deprecated interface # +############################ + +# architecture +HOST_ARCH := $(shell uname -m) +TARGET_ARCH ?= $(HOST_ARCH) +ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l)) + ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le)) + TARGET_SIZE := 64 + else ifneq (,$(filter $(TARGET_ARCH),armv7l)) + TARGET_SIZE := 32 + endif + else + TARGET_SIZE := $(shell getconf LONG_BIT) + endif +else + $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!) +endif +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le)) + $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!) + endif +endif + +# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l +ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32) + TARGET_ARCH = armv7l +endif + +# operating system +HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]") +TARGET_OS ?= $(HOST_OS) +ifeq (,$(filter $(TARGET_OS),linux darwin qnx android)) + $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!) +endif + +# host compiler +ifeq ($(TARGET_OS),darwin) + ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' 
-f1` \>= 5),1) + HOST_COMPILER ?= clang++ + endif +else ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l) + ifeq ($(TARGET_OS),linux) + HOST_COMPILER ?= arm-linux-gnueabihf-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++ + else ifeq ($(TARGET_OS),android) + HOST_COMPILER ?= arm-linux-androideabi-g++ + endif + else ifeq ($(TARGET_ARCH),aarch64) + ifeq ($(TARGET_OS), linux) + HOST_COMPILER ?= aarch64-linux-gnu-g++ + else ifeq ($(TARGET_OS),qnx) + ifeq ($(QNX_HOST),) + $(error ERROR - QNX_HOST must be passed to the QNX host toolchain) + endif + ifeq ($(QNX_TARGET),) + $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain) + endif + export QNX_HOST + export QNX_TARGET + HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++ + else ifeq ($(TARGET_OS), android) + HOST_COMPILER ?= aarch64-linux-android-clang++ + endif + else ifeq ($(TARGET_ARCH),ppc64le) + HOST_COMPILER ?= powerpc64le-linux-gnu-g++ + endif +endif +HOST_COMPILER ?= g++ +NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER) + +# internal flags +NVCCFLAGS := -m${TARGET_SIZE} +CCFLAGS := +LDFLAGS := + +# build flags +ifeq ($(TARGET_OS),darwin) + LDFLAGS += -rpath $(CUDA_PATH)/lib + CCFLAGS += -arch $(HOST_ARCH) +else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux) + LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3 + CCFLAGS += -mfloat-abi=hard +else ifeq ($(TARGET_OS),android) + LDFLAGS += -pie + CCFLAGS += -fpie -fpic -fexceptions +endif + +ifneq ($(TARGET_ARCH),$(HOST_ARCH)) + ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) 
+ ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf + endif + endif + ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + ifneq ($(TARGET_FS),) + GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6) + ifeq ($(GCCVERSIONLTEQ46),1) + CCFLAGS += --sysroot=$(TARGET_FS) + endif + LDFLAGS += --sysroot=$(TARGET_FS) + LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib + LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu + LDFLAGS += --unresolved-symbols=ignore-in-shared-libs + CCFLAGS += -isystem=$(TARGET_FS)/usr/include + CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu + endif + endif +endif + +ifeq ($(TARGET_OS),qnx) + CCFLAGS += -DWIN_INTERFACE_CUSTOM + LDFLAGS += -lsocket +endif + +# Install directory of different arch +CUDA_INSTALL_TARGET_DIR := +ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android) + CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx) + CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/ +else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx) + CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/ +else ifeq ($(TARGET_ARCH),ppc64le) + CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/ +endif + +# Debug build flags +ifeq ($(dbg),1) + NVCCFLAGS += -g -G + BUILD_TYPE := debug +else + BUILD_TYPE := release 
+endif + +ALL_CCFLAGS := +ALL_CCFLAGS += $(NVCCFLAGS) +ALL_CCFLAGS += $(EXTRA_NVCCFLAGS) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS)) +ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS)) + +SAMPLE_ENABLED := 1 + +# This sample is not supported on Mac OSX +ifeq ($(TARGET_OS),darwin) + $(info >>> WARNING - simpleIPC is not supported on Mac OSX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on ARMv7 +ifeq ($(TARGET_ARCH),armv7l) + $(info >>> WARNING - simpleIPC is not supported on ARMv7 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +# This sample is not supported on aarch64 +ifeq ($(TARGET_ARCH),aarch64) + $(info >>> WARNING - simpleIPC is not supported on aarch64 - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + +ALL_LDFLAGS := +ALL_LDFLAGS += $(ALL_CCFLAGS) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) +ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS)) + +# Common includes and paths for CUDA +INCLUDES := -I../../Common +LIBRARIES := + +################################################################################ + +# Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 30 35 37 50 52 60 61 70 72 75 +else +SMS ?= 30 35 37 50 52 60 61 70 75 +endif + +ifeq ($(SMS),) +$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) +SAMPLE_ENABLED := 0 +endif + +ifeq ($(GENCODE_FLAGS),) +# Generate SASS code for each SM architecture listed in $(SMS) +$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm))) + +# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility +HIGHEST_SM := $(lastword $(sort $(SMS))) +ifneq ($(HIGHEST_SM),) +GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM) +endif +endif + +ifeq ($(SAMPLE_ENABLED),0) +EXEC ?= @echo "[@]" +endif + +################################################################################ + +# Target 
rules +all: build + +build: simpleIPC + +check.deps: +ifeq ($(SAMPLE_ENABLED),0) + @echo "Sample will be waived due to the above missing dependencies" +else + @echo "Sample is ready - all dependencies have been met" +endif + +helper_multiprocess.o:../../Common/helper_multiprocess.cpp + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +simpleIPC.o:simpleIPC.cu + $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + +simpleIPC: helper_multiprocess.o simpleIPC.o + $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) + $(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + $(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) + +run: build + $(EXEC) ./simpleIPC + +clean: + rm -f simpleIPC helper_multiprocess.o simpleIPC.o + rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/simpleIPC + +clobber: clean diff --git a/Samples/simpleIPC/NsightEclipse.xml b/Samples/simpleIPC/NsightEclipse.xml new file mode 100644 index 000000000..8ac258f20 --- /dev/null +++ b/Samples/simpleIPC/NsightEclipse.xml @@ -0,0 +1,74 @@ + + + + simpleIPC + + cudaIpcGetEventHandle + cudaIpcOpenMemHandle + cudaIpcCloseMemHandle + cudaMemcpyAsync + + + whole + + ../../Common/helper_multiprocess.cpp + + + ./ + ../ + ../../common/inc + + + CUDA Systems Integration + Peer to Peer + InterProcess Communication + + + GPGPU + + + + + + true + simpleIPC.cu + + IPC + + + 1:CUDA Basic Topics + 1:CUDA Systems Integration + + sm30 + sm35 + sm37 + sm50 + sm52 + sm60 + sm61 + sm70 + sm72 + sm75 + + ../../Common/helper_multiprocess.cpp + ../../Common/helper_multiprocess.h + + + + x86_64 + linux + + + ppc64le + linux + + + windows7 + + + + all + + simpleIPC + exe + diff --git a/Samples/simpleIPC/README.md b/Samples/simpleIPC/README.md new file mode 100644 index 000000000..0542a9e13 --- /dev/null +++ b/Samples/simpleIPC/README.md @@ -0,0 +1,74 @@ +# simpleIPC - simpleIPC + +## Description + +This CUDA Runtime API sample is 
a very basic sample that demonstrates Inter Process Communication with one process per GPU for computation. Requires Compute Capability 3.0 or higher and a Linux Operating System, or a Windows Operating System with TCC enabled GPUs + +## Key Concepts + +CUDA Systems Integration, Peer to Peer, InterProcess Communication + +## Supported SM Architectures + +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) + +## Supported OSes + +Linux, Windows + +## Supported CPU Architecture + +x86_64, ppc64le + +## CUDA APIs involved + +### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) +cudaIpcGetEventHandle, cudaIpcOpenMemHandle, cudaIpcCloseMemHandle, cudaMemcpyAsync + +## Dependencies needed to build/run +[IPC](../../README.md#ipc) + +## Prerequisites + +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Make sure the dependencies mentioned in [Dependencies]() section above are installed. + +## Build and Run + +### Windows +The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: +``` +*_vs.sln - for Visual Studio +``` +Each individual sample has its own set of solution files in its directory: + +To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. 
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details." + +### Linux +The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: +``` +$ cd +$ make +``` +The samples makefiles can take advantage of certain options: +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le. + By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
+ See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. +* **dbg=1** - build with debug symbols + ``` + $ make dbg=1 + ``` +* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. + ``` + $ make SMS="50 60" + ``` + +* **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. +``` + $ make HOST_COMPILER=g++ +``` + +## References (for more details) + diff --git a/Samples/simpleIPC/simpleIPC.cu b/Samples/simpleIPC/simpleIPC.cu new file mode 100644 index 000000000..5dd713529 --- /dev/null +++ b/Samples/simpleIPC/simpleIPC.cu @@ -0,0 +1,336 @@ +/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This sample demonstrates Inter Process Communication + * using one process per GPU for computation. + */ +#include +#include +#include +#include "helper_cuda.h" +#include "helper_multiprocess.h" +static const char shmName[] = "simpleIPCshm"; +// For direct NVLINK and PCI-E peers, at max 8 simultaneous peers are allowed +// For NVSWITCH connected peers like DGX-2, simultaneous peers are not limited +// in the same way. +#define MAX_DEVICES (32) +#define DATA_SIZE (64ULL << 20ULL) // 64MB + +#if defined(__linux__) +#define cpu_atomic_add32(a, x) __sync_add_and_fetch(a, x) +#elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#define cpu_atomic_add32(a, x) InterlockedAdd((volatile LONG *)a, x) +#else +#error Unsupported system +#endif + +typedef struct shmStruct_st { + size_t nprocesses; + int barrier; + int sense; + int devices[MAX_DEVICES]; + cudaIpcMemHandle_t memHandle[MAX_DEVICES]; + cudaIpcEventHandle_t eventHandle[MAX_DEVICES]; +} shmStruct; + +__global__ void simpleKernel(char *ptr, int sz, char val) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + for (; idx < sz; idx += (gridDim.x * blockDim.x)) { + ptr[idx] = val; + } +} + +static void barrierWait(volatile int *barrier, volatile int *sense, + unsigned int n) { + int count; + + // Check-in + count = cpu_atomic_add32(barrier, 1); + if (count == n) // Last one in + *sense = 1; + while (!*sense) + ; + + // Check-out + count = 
cpu_atomic_add32(barrier, -1); + if (count == 0) // Last one out + *sense = 0; + while (*sense) + ; +} + +static void childProcess(int id) { + volatile shmStruct *shm = NULL; + cudaStream_t stream; + sharedMemoryInfo info; + size_t procCount, i; + int blocks = 0; + int threads = 128; + cudaDeviceProp prop; + std::vector ptrs; + std::vector events; + std::vector verification_buffer(DATA_SIZE); + + if (sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) { + printf("Failed to create shared memory slab\n"); + exit(EXIT_FAILURE); + } + shm = (volatile shmStruct *)info.addr; + procCount = shm->nprocesses; + + printf("Process %d: Starting on device %d...\n", id, shm->devices[id]); + + checkCudaErrors(cudaSetDevice(shm->devices[id])); + checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id])); + checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &blocks, simpleKernel, threads, 0)); + blocks *= prop.multiProcessorCount; + + // Open and track all the allocations and events created in the master + // process for use later + for (i = 0; i < procCount; i++) { + void *ptr = NULL; + cudaEvent_t event; + + // Notice, we don't need to explicitly enable peer access for + // allocations on other devices. + checkCudaErrors( + cudaIpcOpenMemHandle(&ptr, *(cudaIpcMemHandle_t *)&shm->memHandle[i], + cudaIpcMemLazyEnablePeerAccess)); + checkCudaErrors(cudaIpcOpenEventHandle( + &event, *(cudaIpcEventHandle_t *)&shm->eventHandle[i])); + + ptrs.push_back(ptr); + events.push_back(event); + } + + // At each iteration of the loop, each sibling process will push work on + // their respective devices accessing the next peer mapped buffer allocated + // by the master process (these can come from other sibling processes as + // well). 
To coordinate each process' access, we force the stream to wait for + // the work already accessing this buffer asynchronously through IPC events, + // allowing the CPU processes to continue to queue more work. + for (i = 0; i < procCount; i++) { + size_t bufferId = (i + id) % procCount; + // Wait for the buffer to be accessed to be ready + checkCudaErrors(cudaStreamWaitEvent(stream, events[bufferId], 0)); + // Push a simple kernel on it + simpleKernel<<>>((char *)ptrs[bufferId], + DATA_SIZE, id); + checkCudaErrors(cudaGetLastError()); + // Signal that this buffer is ready for the next consumer + checkCudaErrors(cudaEventRecord(events[bufferId], stream)); + // Wait for all my sibling processes to push this stage of their work + // before proceeding to the next. This prevents siblings from racing + // ahead and clobbering the recorded event or waiting on the wrong + // recorded event. + barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount); + if (id == 0) { + printf("Step %lld done\n", (unsigned long long)i); + } + } + + // Now wait for my buffer to be ready so I can copy it locally and verify it + checkCudaErrors(cudaStreamWaitEvent(stream, events[id], 0)); + checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE, + cudaMemcpyDeviceToHost, stream)); + // And wait for all the queued up work to complete + checkCudaErrors(cudaStreamSynchronize(stream)); + + printf("Process %d: verifying...\n", id); + + // The contents should have the id of the sibling just after me + char compareId = (char)((id + 1) % procCount); + for (unsigned long long j = 0; j < DATA_SIZE; j++) { + if (verification_buffer[j] != compareId) { + printf("Process %d: Verification mismatch at %lld: %d != %d\n", id, j, + (int)verification_buffer[j], (int)compareId); + } + } + + // Clean up! 
+ for (i = 0; i < procCount; i++) { + checkCudaErrors(cudaIpcCloseMemHandle(ptrs[i])); + checkCudaErrors(cudaEventDestroy(events[i])); + } + + checkCudaErrors(cudaStreamDestroy(stream)); + + printf("Process %d complete!\n", id); +} + +static void parentProcess(char *app) { + sharedMemoryInfo info; + int devCount, i; + volatile shmStruct *shm = NULL; + std::vector ptrs; + std::vector events; + std::vector processes; + + checkCudaErrors(cudaGetDeviceCount(&devCount)); + + if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) { + printf("Failed to create shared memory slab\n"); + exit(EXIT_FAILURE); + } + shm = (volatile shmStruct *)info.addr; + memset((void *)shm, 0, sizeof(*shm)); + + // Pick all the devices that can access each other's memory for this test + // Keep in mind that CUDA has minimal support for fork() without a + // corresponding exec() in the child process, but in this case our + // spawnProcess will always exec, so no need to worry. + for (i = 0; i < devCount; i++) { + bool allPeers = true; + cudaDeviceProp prop; + checkCudaErrors(cudaGetDeviceProperties(&prop, i)); + + // CUDA IPC is only supported on devices with unified addressing + if (!prop.unifiedAddressing) { + printf("Device %d does not support unified addressing, skipping...\n", i); + continue; + } + // This sample requires two processes accessing each device, so we need + // to ensure exclusive or prohibited mode is not set + if (prop.computeMode != cudaComputeModeDefault) { + printf("Device %d is in an unsupported compute mode for this sample\n", + i); + continue; + } +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // CUDA IPC on Windows is only supported on TCC + if (!prop.tccDriver) { + printf("Device %d is not in TCC mode\n", i); + continue; + } +#endif + + for (int j = 0; j < shm->nprocesses; j++) { + int canAccessPeerIJ, canAccessPeerJI; + checkCudaErrors( + cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i)); + checkCudaErrors( + 
cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j])); + if (!canAccessPeerIJ || !canAccessPeerJI) { + allPeers = false; + break; + } + } + if (allPeers) { + // Enable peers here. This isn't necessary for IPC, but it will + // setup the peers for the device. For systems that only allow 8 + // peers per GPU at a time, this acts to remove devices from CanAccessPeer + for (int j = 0; j < shm->nprocesses; j++) { + checkCudaErrors(cudaSetDevice(i)); + checkCudaErrors(cudaDeviceEnablePeerAccess(shm->devices[j], 0)); + checkCudaErrors(cudaSetDevice(shm->devices[j])); + checkCudaErrors(cudaDeviceEnablePeerAccess(i, 0)); + } + shm->devices[shm->nprocesses++] = i; + if (shm->nprocesses >= MAX_DEVICES) break; + } else { + printf( + "Device %d is not peer capable with some other selected peers, " + "skipping\n", + i); + } + } + + if (shm->nprocesses == 0) { + printf("No CUDA devices support IPC\n"); + exit(EXIT_WAIVED); + } + + // Now allocate memory and an event for each process and fill the shared + // memory buffer with the IPC handles to communicate + for (i = 0; i < shm->nprocesses; i++) { + void *ptr = NULL; + cudaEvent_t event; + + checkCudaErrors(cudaSetDevice(shm->devices[i])); + checkCudaErrors(cudaMalloc(&ptr, DATA_SIZE)); + checkCudaErrors( + cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle[i], ptr)); + checkCudaErrors(cudaEventCreate( + &event, cudaEventDisableTiming | cudaEventInterprocess)); + checkCudaErrors(cudaIpcGetEventHandle( + (cudaIpcEventHandle_t *)&shm->eventHandle[i], event)); + + ptrs.push_back(ptr); + events.push_back(event); + } + + // Launch the child processes! 
+ for (i = 0; i < shm->nprocesses; i++) { + char devIdx[10]; + char *const args[] = {app, devIdx, NULL}; + Process process; + + SPRINTF(devIdx, "%d", i); + + if (spawnProcess(&process, app, args)) { + printf("Failed to create process\n"); + exit(EXIT_FAILURE); + } + + processes.push_back(process); + } + + // And wait for them to finish + for (i = 0; i < processes.size(); i++) { + if (waitProcess(&processes[i]) != EXIT_SUCCESS) { + printf("Process %d failed!\n", i); + exit(EXIT_FAILURE); + } + } + + // Clean up! + for (i = 0; i < shm->nprocesses; i++) { + checkCudaErrors(cudaSetDevice(shm->devices[i])); + checkCudaErrors(cudaEventSynchronize(events[i])); + checkCudaErrors(cudaEventDestroy(events[i])); + checkCudaErrors(cudaFree(ptrs[i])); + } + + sharedMemoryClose(&info); +} + +int main(int argc, char **argv) { +#if defined(__arm__) || defined(__aarch64__) + printf("Not supported on ARM\n"); + return EXIT_WAIVED; +#else + if (argc == 1) { + parentProcess(argv[0]); + } else { + childProcess(atoi(argv[1])); + } + return EXIT_SUCCESS; +#endif +} diff --git a/Samples/simpleIPC/simpleIPC_vs2012.sln b/Samples/simpleIPC/simpleIPC_vs2012.sln new file mode 100644 index 000000000..d66504013 --- /dev/null +++ b/Samples/simpleIPC/simpleIPC_vs2012.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2012 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleIPC", "simpleIPC_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = 
Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleIPC/simpleIPC_vs2012.vcxproj b/Samples/simpleIPC/simpleIPC_vs2012.vcxproj new file mode 100644 index 000000000..9c5d7e4f3 --- /dev/null +++ b/Samples/simpleIPC/simpleIPC_vs2012.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleIPC_vs2012 + simpleIPC + + + + + Application + MultiByte + v110 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleIPC.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + diff --git a/Samples/simpleIPC/simpleIPC_vs2013.sln b/Samples/simpleIPC/simpleIPC_vs2013.sln new file mode 100644 index 000000000..163848168 --- /dev/null +++ b/Samples/simpleIPC/simpleIPC_vs2013.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 13.00 +# Visual Studio 2013 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleIPC", "simpleIPC_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + 
GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleIPC/simpleIPC_vs2013.vcxproj b/Samples/simpleIPC/simpleIPC_vs2013.vcxproj new file mode 100644 index 000000000..dadc53993 --- /dev/null +++ b/Samples/simpleIPC/simpleIPC_vs2013.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleIPC_vs2013 + simpleIPC + + + + + Application + MultiByte + v120 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleIPC.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + diff --git 
a/Samples/simpleIPC/simpleIPC_vs2015.sln b/Samples/simpleIPC/simpleIPC_vs2015.sln new file mode 100644 index 000000000..d798eb745 --- /dev/null +++ b/Samples/simpleIPC/simpleIPC_vs2015.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 14.00 +# Visual Studio 2015 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleIPC", "simpleIPC_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleIPC/simpleIPC_vs2015.vcxproj b/Samples/simpleIPC/simpleIPC_vs2015.vcxproj new file mode 100644 index 000000000..7e2570cd0 --- /dev/null +++ b/Samples/simpleIPC/simpleIPC_vs2015.vcxproj @@ -0,0 +1,108 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleIPC_vs2015 + simpleIPC + + + + + Application + MultiByte + v140 + + + true + + + true + + + + + + + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + 
$(OutDir)/simpleIPC.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + diff --git a/Samples/simpleIPC/simpleIPC_vs2017.sln b/Samples/simpleIPC/simpleIPC_vs2017.sln new file mode 100644 index 000000000..93eb4ac5e --- /dev/null +++ b/Samples/simpleIPC/simpleIPC_vs2017.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2017 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simpleIPC", "simpleIPC_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 + {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/Samples/simpleIPC/simpleIPC_vs2017.vcxproj b/Samples/simpleIPC/simpleIPC_vs2017.vcxproj new file mode 100644 index 000000000..e4366cd2b --- /dev/null +++ b/Samples/simpleIPC/simpleIPC_vs2017.vcxproj @@ -0,0 +1,109 @@ + + + + $(VCTargetsPath)\BuildCustomizations + + + + Debug + x64 + + + Release + x64 + + + + {997E0757-EA74-4A4E-A0FC-47D8C8831A15} + simpleIPC_vs2017 + simpleIPC + + + + + Application + MultiByte + v141 + 10.0.15063.0 + + + true + + + true + + + + + 
+ + + + + + $(Platform)/$(Configuration)/ + $(IncludePath) + AllRules.ruleset + + + + + ../../bin/win64/$(Configuration)/ + + + + Level3 + WIN32;_MBCS;%(PreprocessorDefinitions) + ./;$(CudaToolkitDir)/include;../../Common; + + + Console + cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) + $(CudaToolkitLibDir); + $(OutDir)/simpleIPC.exe + + + compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75; + -Xcompiler "/wd 4819" %(AdditionalOptions) + ./;../../Common + WIN32 + + + + + Disabled + MultiThreadedDebug + + + true + Default + + + MTd + 64 + + + + + MaxSpeed + MultiThreaded + + + false + UseLinkTimeCodeGeneration + + + MT + 64 + + + + + + + + + + + + diff --git a/Samples/simpleVoteIntrinsics/Makefile b/Samples/simpleVoteIntrinsics/Makefile index 288a0289f..b953c2965 100644 --- a/Samples/simpleVoteIntrinsics/Makefile +++ b/Samples/simpleVoteIntrinsics/Makefile @@ -246,7 +246,11 @@ LIBRARIES := ################################################################################ # Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 30 35 37 50 52 60 61 70 72 75 +else SMS ?= 30 35 37 50 52 60 61 70 75 +endif ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/simpleVoteIntrinsics/NsightEclipse.xml b/Samples/simpleVoteIntrinsics/NsightEclipse.xml index 3e5cadfe9..f7c2618f9 100644 --- a/Samples/simpleVoteIntrinsics/NsightEclipse.xml +++ b/Samples/simpleVoteIntrinsics/NsightEclipse.xml @@ -40,6 +40,7 @@ sm60 sm61 sm70 + sm72 sm75 diff --git a/Samples/simpleVoteIntrinsics/README.md b/Samples/simpleVoteIntrinsics/README.md index b9a4d9e6e..1f11c718e 100644 --- a/Samples/simpleVoteIntrinsics/README.md +++ b/Samples/simpleVoteIntrinsics/README.md 
@@ -10,7 +10,7 @@ Vote Intrinsics ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -27,7 +27,7 @@ cudaMalloc, cudaFree, cudaMemcpy, cudaFreeHost ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2012.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2012.vcxproj index ef5ca0729..2fd4df8ca 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2012.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2013.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2013.vcxproj index c7dcd4658..01d3b586e 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2013.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2015.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2015.vcxproj index c58c4a889..b9e89adbc 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2015.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj index 184ad6a37..68cb2153d 100644 --- a/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj +++ b/Samples/simpleVoteIntrinsics/simpleVoteIntrinsics_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/simpleVulkan/Build_instructions.txt b/Samples/simpleVulkan/Build_instructions.txt index 2b19ed360..e7517d42b 100644 --- a/Samples/simpleVulkan/Build_instructions.txt +++ b/Samples/simpleVulkan/Build_instructions.txt @@ -10,9 +10,14 @@ To add the GLFW3 headers path -- In Property pages window go to "VC++ Directories" section. Here in "Include Directories" edit and add path to GLFW3 headers include directory location. 
** Make sure to add path to glfw3.dll in your PATH environment variable** - For Linux: -- Install the Vulkan SDK from https://www.lunarg.com/vulkan-sdk/ and follow environment setup instructions. -- Install GLFW3 library through your OS package repository. For example: apt-get for Ubuntu and dnf for RHEL/CentOS -- Install "libxcb1-dev" and "xorg-dev" as GLFW3 is depended on it --- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH \ No newline at end of file +-- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH + +For Linux aarch64(L4T): +-- Install GLFW3 library using "apt-get install libglfw3-dev" this will provide glfw3 +-- install above will also provide libvulkan-dev as dependencies +-- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH +-- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=', VULKAN_SDK_PATH in this scenario is typically "/usr" diff --git a/Samples/simpleVulkan/Makefile b/Samples/simpleVulkan/Makefile index f47458124..f8c353f16 100644 --- a/Samples/simpleVulkan/Makefile +++ b/Samples/simpleVulkan/Makefile @@ -246,9 +246,9 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif -# This sample is not supported on aarch64 -ifeq ($(TARGET_ARCH),aarch64) - $(info >>> WARNING - simpleVulkan is not supported on aarch64 - waiving sample <<<) +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - simpleVulkan is not supported on QNX - waiving sample <<<) SAMPLE_ENABLED := 0 endif @@ -301,7 +301,11 @@ ifeq ($(TARGET_OS),linux) endif # Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 30 35 37 50 52 60 61 70 72 75 +else SMS ?= 30 35 37 50 52 60 61 70 75 +endif ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) @@ -331,8 +335,6 @@ endif all: build build: simpleVulkan - $(EXEC) $(VULKAN_SDK_PATH)/bin/glslangValidator -V shader_sine.vert - $(EXEC) $(VULKAN_SDK_PATH)/bin/glslangValidator -V 
shader_sine.frag check.deps: ifeq ($(SAMPLE_ENABLED),0) @@ -355,7 +357,5 @@ run: build clean: rm -f simpleVulkan vulkanCUDASinewave.o rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/simpleVulkan - rm -rf vert.spv - rm -rf frag.spv clobber: clean diff --git a/Samples/simpleVulkan/NsightEclipse.xml b/Samples/simpleVulkan/NsightEclipse.xml index 7682326e5..34f7fae48 100644 --- a/Samples/simpleVulkan/NsightEclipse.xml +++ b/Samples/simpleVulkan/NsightEclipse.xml @@ -38,14 +38,6 @@ true - - $(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.vert - $(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.frag - $(VULKAN_SDK_PATH)/bin/glslangValidator -V shader_sine.vert - $(VULKAN_SDK_PATH)/bin/glslangValidator -V shader_sine.frag - rm -rf vert.spv - rm -rf frag.spv - vulkanCUDASinewave.cu X11 @@ -64,6 +56,7 @@ sm60 sm61 sm70 + sm72 sm75 @@ -73,6 +66,9 @@ windows7 + + aarch64 + all diff --git a/Samples/simpleVulkan/README.md b/Samples/simpleVulkan/README.md index 766f826ff..eb76cce8f 100644 --- a/Samples/simpleVulkan/README.md +++ b/Samples/simpleVulkan/README.md @@ -10,7 +10,7 @@ Graphics Interop, CUDA Vulkan Interop, Data Parallel Algorithms ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) 
[SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -18,7 +18,7 @@ Linux, Windows ## Supported CPU Architecture -x86_64 +x86_64, aarch64 ## CUDA APIs involved @@ -30,7 +30,7 @@ cudaImportExternalMemory, cudaExternalMemoryGetMappedBuffer, cudaImportExternalS ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run @@ -52,9 +52,9 @@ $ cd $ make ``` The samples makefiles can take advantage of certain options: -* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64. +* **TARGET_ARCH=** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, aarch64. By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
-`$ make TARGET_ARCH=x86_64`
+`$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=aarch64`
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. * **dbg=1** - build with debug symbols ``` diff --git a/Samples/simpleVulkan/shader_sine.frag b/Samples/simpleVulkan/shader_sine.frag index 1730b4f17..b096569ce 100644 --- a/Samples/simpleVulkan/shader_sine.frag +++ b/Samples/simpleVulkan/shader_sine.frag @@ -1,5 +1,6 @@ #version 450 #extension GL_ARB_separate_shader_objects : enable +#extension GL_NV_gpu_shader5 : enable layout(location = 0) in vec3 fragColor; diff --git a/Samples/simpleVulkan/shader_sine.vert b/Samples/simpleVulkan/shader_sine.vert index 801963439..849558b32 100644 --- a/Samples/simpleVulkan/shader_sine.vert +++ b/Samples/simpleVulkan/shader_sine.vert @@ -1,6 +1,6 @@ #version 450 #extension GL_ARB_separate_shader_objects : enable - +#extension GL_NV_gpu_shader5 : enable layout(binding = 0) uniform UniformBufferObject { mat4 model; diff --git a/Samples/simpleVulkan/simpleVulkan_vs2013.vcxproj b/Samples/simpleVulkan/simpleVulkan_vs2013.vcxproj index 0f630f8cb..7b35a5386 100644 --- a/Samples/simpleVulkan/simpleVulkan_vs2013.vcxproj +++ b/Samples/simpleVulkan/simpleVulkan_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -67,11 +67,6 @@ ./;../../Common WIN32 - - $(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.vert -$(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.frag - - @@ -117,6 +112,6 @@ $(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.frag - + diff --git a/Samples/simpleVulkan/simpleVulkan_vs2015.vcxproj b/Samples/simpleVulkan/simpleVulkan_vs2015.vcxproj index 0e4e9d642..b09f40862 100644 --- a/Samples/simpleVulkan/simpleVulkan_vs2015.vcxproj +++ b/Samples/simpleVulkan/simpleVulkan_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -67,11 +67,6 @@ ./;../../Common WIN32 - - $(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.vert -$(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.frag - - @@ -117,6 +112,6 @@ $(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.frag - + diff --git 
a/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj b/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj index 7c45957f3..c2f7bf783 100644 --- a/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj +++ b/Samples/simpleVulkan/simpleVulkan_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -68,11 +68,6 @@ ./;../../Common WIN32 - - $(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.vert -$(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.frag - - @@ -118,6 +113,6 @@ $(VULKAN_SDK)/Bin/glslangValidator.exe -V shader_sine.frag - + diff --git a/Samples/simpleVulkan/vulkanCUDASinewave.cu b/Samples/simpleVulkan/vulkanCUDASinewave.cu index a5755a279..fa266e93c 100644 --- a/Samples/simpleVulkan/vulkanCUDASinewave.cu +++ b/Samples/simpleVulkan/vulkanCUDASinewave.cu @@ -883,8 +883,8 @@ class vulkanCudaApp { } void createGraphicsPipeline() { - auto vertShaderCode = readFile("vert.spv"); - auto fragShaderCode = readFile("frag.spv"); + auto vertShaderCode = readFile("shader_sine.vert"); + auto fragShaderCode = readFile("shader_sine.frag"); VkShaderModule vertShaderModule; VkShaderModule fragShaderModule; diff --git a/Samples/systemWideAtomics/Makefile b/Samples/systemWideAtomics/Makefile index ac55d3197..162ab6010 100644 --- a/Samples/systemWideAtomics/Makefile +++ b/Samples/systemWideAtomics/Makefile @@ -264,7 +264,11 @@ LIBRARIES := ################################################################################ # Gencode arguments +ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 60 61 70 72 75 +else SMS ?= 60 61 70 75 +endif ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/systemWideAtomics/NsightEclipse.xml b/Samples/systemWideAtomics/NsightEclipse.xml index b977c6da6..5108b3afa 100644 --- a/Samples/systemWideAtomics/NsightEclipse.xml +++ b/Samples/systemWideAtomics/NsightEclipse.xml @@ -39,6 +39,7 @@ sm60 sm61 sm70 + sm72 sm75 diff --git a/Samples/systemWideAtomics/README.md 
b/Samples/systemWideAtomics/README.md index 462eb23cc..eaaa57ca7 100644 --- a/Samples/systemWideAtomics/README.md +++ b/Samples/systemWideAtomics/README.md @@ -10,7 +10,7 @@ Atomic Intrinsics, Unified Memory ## Supported SM Architectures -[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cudaMalloc, cudaFree, cudaMemcpy, cudaFreeHost ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/vectorAdd_nvrtc/Makefile b/Samples/vectorAdd_nvrtc/Makefile index 5561b18ab..a23cd7a58 100644 --- a/Samples/vectorAdd_nvrtc/Makefile +++ b/Samples/vectorAdd_nvrtc/Makefile @@ -242,6 +242,12 @@ ifeq ($(TARGET_ARCH),armv7l) SAMPLE_ENABLED := 0 endif +# This sample is not supported on QNX +ifeq ($(TARGET_OS),qnx) + $(info >>> WARNING - vectorAdd_nvrtc is not supported on QNX - waiving sample <<<) + SAMPLE_ENABLED := 0 +endif + ALL_LDFLAGS := ALL_LDFLAGS += $(ALL_CCFLAGS) ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS)) diff --git a/Samples/vectorAdd_nvrtc/README.md b/Samples/vectorAdd_nvrtc/README.md index aa9308d3e..db3c0dc84 100644 --- a/Samples/vectorAdd_nvrtc/README.md +++ b/Samples/vectorAdd_nvrtc/README.md @@ -10,7 +10,7 @@ CUDA Driver API, Vector Addition, Runtime Compilation ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -30,7 +30,7 @@ cuMemAlloc, cuMemFree, cuMemcpyHtoD, cuMemcpyDtoH ## Prerequisites -Download and install the [CUDA Toolkit 
10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2012.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2012.vcxproj index ae30b7a29..4ec987590 100644 --- a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2012.vcxproj +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2013.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2013.vcxproj index 99e6a833b..ee52dec69 100644 --- a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2013.vcxproj +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2015.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2015.vcxproj index 3ba3e4378..00564844b 100644 --- a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2015.vcxproj +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj index 5caded618..c9d8a55ba 100644 --- a/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj +++ b/Samples/vectorAdd_nvrtc/vectorAdd_nvrtc_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -103,6 +103,6 @@ - + diff --git a/Samples/warpAggregatedAtomicsCG/Makefile b/Samples/warpAggregatedAtomicsCG/Makefile index 5353f6fee..2cf3bbf5d 100644 --- a/Samples/warpAggregatedAtomicsCG/Makefile +++ b/Samples/warpAggregatedAtomicsCG/Makefile @@ -246,7 +246,11 @@ LIBRARIES := ################################################################################ # Gencode arguments +ifeq 
($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64)) +SMS ?= 30 35 37 50 52 60 61 70 72 75 +else SMS ?= 30 35 37 50 52 60 61 70 75 +endif ifeq ($(SMS),) $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<) diff --git a/Samples/warpAggregatedAtomicsCG/NsightEclipse.xml b/Samples/warpAggregatedAtomicsCG/NsightEclipse.xml index 49a896d3f..67bf4eede 100644 --- a/Samples/warpAggregatedAtomicsCG/NsightEclipse.xml +++ b/Samples/warpAggregatedAtomicsCG/NsightEclipse.xml @@ -34,6 +34,7 @@ sm60 sm61 sm70 + sm72 sm75 diff --git a/Samples/warpAggregatedAtomicsCG/README.md b/Samples/warpAggregatedAtomicsCG/README.md index dacffd13c..f958302d0 100644 --- a/Samples/warpAggregatedAtomicsCG/README.md +++ b/Samples/warpAggregatedAtomicsCG/README.md @@ -10,7 +10,7 @@ Cooperative Groups, Atomic Intrinsics ## Supported SM Architectures -[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) +[SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) ## Supported OSes @@ -24,7 +24,7 @@ x86_64, ppc64le, armv7l, aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your 
corresponding platform. +Download and install the [CUDA Toolkit 10.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2012.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2012.vcxproj index 4f0f38385..740dc1903 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2012.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2012.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2013.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2013.vcxproj index cc4187bee..8075939c0 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2013.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2013.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.vcxproj index cc614a67e..7a2c1c802 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2015.vcxproj @@ -33,7 +33,7 @@ - + @@ -102,6 +102,6 @@ - + diff --git a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj index a36350c56..2613a64f1 100644 --- a/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj +++ b/Samples/warpAggregatedAtomicsCG/warpAggregatedAtomicsCG_vs2017.vcxproj @@ -34,7 +34,7 @@ - + @@ -103,6 +103,6 @@ - +