diff --git a/.idea/runConfigurations/CudaSamples.xml b/.idea/runConfigurations/CudaSamples.xml
deleted file mode 100644
index 04419df..0000000
--- a/.idea/runConfigurations/CudaSamples.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/runConfigurations/asyncAPI.xml b/.idea/runConfigurations/asyncAPI.xml
deleted file mode 100644
index db32816..0000000
--- a/.idea/runConfigurations/asyncAPI.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/runConfigurations/cudaTensorCoreGemm.xml b/.idea/runConfigurations/cudaTensorCoreGemm.xml
deleted file mode 100644
index 57250fd..0000000
--- a/.idea/runConfigurations/cudaTensorCoreGemm.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/runConfigurations/simpleMPI.xml b/.idea/runConfigurations/simpleMPI.xml
deleted file mode 100644
index f1766a5..0000000
--- a/.idea/runConfigurations/simpleMPI.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c6e3c1..e9eb068 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,17 +11,10 @@ set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-arch=sm_75")
set(cuda_include_dirs = ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
set(cuda_samples_include = "${cuda_include_dirs}/../../../samples/common/inc/")
include_directories(${cuda_samples_include})
-
+include_directories(${cuda_include_dirs})
set(CUDA_VERBOSE_BUILD ON)
set(CUDA_SEPARABLE_COMPILATION ON) # Allow multiple CUDA files compilation
-add_executable(CudaSamples main.cu)
-
-set_target_properties(
- CudaSamples
- PROPERTIES
- CUDA_SEPARABLE_COMPILATION ON)
-
add_executable(asyncAPI asyncAPI.cu)
set(MPIEXEC_EXECUTABLE /opt/openmpi-cuda-4.0.3/bin/mpicxx)
#set(MPI_CXX_COMPILER /usr/local/bin/mpicxx)
@@ -43,8 +36,14 @@ target_link_libraries(simpleGL GL GLU glut)
set(VULKANSDK = /kws_space/vulkansdk_1.2.135.0/x86_64)
add_executable(simpleVulkan vulkanCUDASinewave.cu)
-target_include_directories(simpleVulkan PRIVATE "${VULKANSDK}/include" )
+target_include_directories(simpleVulkan PRIVATE "${VULKANSDK}/include")
target_link_directories(simpleVulkan PRIVATE "${VULKANSDK}/lib")
target_link_libraries(simpleVulkan vulkan glfw)
-#add_executable(interval )
\ No newline at end of file
+add_executable(deviceQuery deviceQuery.cpp)
+
+set_property(TARGET deviceQuery PROPERTY LINKER_LANGUAGE CUDA)
+target_link_libraries(deviceQuery cuda)
+
+add_executable(deviceQueryDrv deviceQueryDrv.cpp)
+target_link_libraries(deviceQueryDrv cuda)
\ No newline at end of file
diff --git a/deviceQuery.cpp b/deviceQuery.cpp
new file mode 100644
index 0000000..b3d5180
--- /dev/null
+++ b/deviceQuery.cpp
@@ -0,0 +1,325 @@
+//
+// Created by kwoodle on 6/9/20.
+//
+
+/*
+ * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+/* This sample queries the properties of the CUDA devices present in the system
+ * via CUDA Runtime API. */
+
+// std::system includes
+
+#include <memory>
+#include <iostream>
+
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+#include <helper_string.h>
+
+int *pArgc = NULL;
+char **pArgv = NULL;
+
+#if CUDART_VERSION < 5000
+
+// CUDA-C includes
+#include <cuda.h>
+
+// This function wraps the CUDA Driver API into a template function
+template <class T>
+inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
+ int device) {
+ CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
+
+ if (CUDA_SUCCESS != error) {
+ fprintf(
+ stderr,
+ "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
+ error, __FILE__, __LINE__);
+
+ exit(EXIT_FAILURE);
+ }
+}
+
+#endif /* CUDART_VERSION < 5000 */
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv) {
+ pArgc = &argc;
+ pArgv = argv;
+
+ printf("%s Starting...\n\n", argv[0]);
+ printf(
+ " CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
+
+ int deviceCount = 0;
+ cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
+
+ if (error_id != cudaSuccess) {
+ printf("cudaGetDeviceCount returned %d\n-> %s\n",
+           static_cast<int>(error_id), cudaGetErrorString(error_id));
+ printf("Result = FAIL\n");
+ exit(EXIT_FAILURE);
+ }
+
+ // This function call returns 0 if there are no CUDA capable devices.
+ if (deviceCount == 0) {
+ printf("There are no available device(s) that support CUDA\n");
+ } else {
+ printf("Detected %d CUDA Capable device(s)\n", deviceCount);
+ }
+
+ int dev, driverVersion = 0, runtimeVersion = 0;
+
+ for (dev = 0; dev < deviceCount; ++dev) {
+ cudaSetDevice(dev);
+ cudaDeviceProp deviceProp;
+ cudaGetDeviceProperties(&deviceProp, dev);
+
+ printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
+
+ // Console log
+ cudaDriverGetVersion(&driverVersion);
+ cudaRuntimeGetVersion(&runtimeVersion);
+ printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
+ driverVersion / 1000, (driverVersion % 100) / 10,
+ runtimeVersion / 1000, (runtimeVersion % 100) / 10);
+ printf(" CUDA Capability Major/Minor version number: %d.%d\n",
+ deviceProp.major, deviceProp.minor);
+
+ char msg[256];
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+ sprintf_s(msg, sizeof(msg),
+ " Total amount of global memory: %.0f MBytes "
+ "(%llu bytes)\n",
+              static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
+ (unsigned long long)deviceProp.totalGlobalMem);
+#else
+ snprintf(msg, sizeof(msg),
+ " Total amount of global memory: %.0f MBytes "
+ "(%llu bytes)\n",
+             static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
+ (unsigned long long) deviceProp.totalGlobalMem);
+#endif
+ printf("%s", msg);
+
+ printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
+ deviceProp.multiProcessorCount,
+ _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
+ _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
+ deviceProp.multiProcessorCount);
+ printf(
+ " GPU Max Clock rate: %.0f MHz (%0.2f "
+ "GHz)\n",
+ deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
+
+#if CUDART_VERSION >= 5000
+ // This is supported in CUDA 5.0 (runtime API device properties)
+ printf(" Memory Clock rate: %.0f Mhz\n",
+ deviceProp.memoryClockRate * 1e-3f);
+ printf(" Memory Bus Width: %d-bit\n",
+ deviceProp.memoryBusWidth);
+
+ if (deviceProp.l2CacheSize) {
+ printf(" L2 Cache Size: %d bytes\n",
+ deviceProp.l2CacheSize);
+ }
+
+#else
+ // This only available in CUDA 4.0-4.2 (but these were only exposed in the
+ // CUDA Driver API)
+ int memoryClock;
+ getCudaAttribute(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
+ dev);
+ printf(" Memory Clock rate: %.0f Mhz\n",
+ memoryClock * 1e-3f);
+ int memBusWidth;
+ getCudaAttribute(&memBusWidth,
+ CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
+ printf(" Memory Bus Width: %d-bit\n",
+ memBusWidth);
+ int L2CacheSize;
+ getCudaAttribute(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
+
+ if (L2CacheSize) {
+ printf(" L2 Cache Size: %d bytes\n",
+ L2CacheSize);
+ }
+
+#endif
+
+ printf(
+ " Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, "
+ "%d), 3D=(%d, %d, %d)\n",
+ deviceProp.maxTexture1D, deviceProp.maxTexture2D[0],
+ deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0],
+ deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
+ printf(
+ " Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
+ deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
+ printf(
+ " Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d "
+ "layers\n",
+ deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1],
+ deviceProp.maxTexture2DLayered[2]);
+
+ printf(" Total amount of constant memory: %zu bytes\n",
+ deviceProp.totalConstMem);
+ printf(" Total amount of shared memory per block: %zu bytes\n",
+ deviceProp.sharedMemPerBlock);
+ printf(" Total number of registers available per block: %d\n",
+ deviceProp.regsPerBlock);
+ printf(" Warp size: %d\n",
+ deviceProp.warpSize);
+ printf(" Maximum number of threads per multiprocessor: %d\n",
+ deviceProp.maxThreadsPerMultiProcessor);
+ printf(" Maximum number of threads per block: %d\n",
+ deviceProp.maxThreadsPerBlock);
+ printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
+ deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
+ deviceProp.maxThreadsDim[2]);
+ printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
+ deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
+ deviceProp.maxGridSize[2]);
+ printf(" Maximum memory pitch: %zu bytes\n",
+ deviceProp.memPitch);
+ printf(" Texture alignment: %zu bytes\n",
+ deviceProp.textureAlignment);
+ printf(
+ " Concurrent copy and kernel execution: %s with %d copy "
+ "engine(s)\n",
+ (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
+ printf(" Run time limit on kernels: %s\n",
+ deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
+ printf(" Integrated GPU sharing Host Memory: %s\n",
+ deviceProp.integrated ? "Yes" : "No");
+ printf(" Support host page-locked memory mapping: %s\n",
+ deviceProp.canMapHostMemory ? "Yes" : "No");
+ printf(" Alignment requirement for Surfaces: %s\n",
+ deviceProp.surfaceAlignment ? "Yes" : "No");
+ printf(" Device has ECC support: %s\n",
+ deviceProp.ECCEnabled ? "Enabled" : "Disabled");
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+ printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n",
+ deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)"
+ : "WDDM (Windows Display Driver Model)");
+#endif
+ printf(" Device supports Unified Addressing (UVA): %s\n",
+ deviceProp.unifiedAddressing ? "Yes" : "No");
+ printf(" Device supports Compute Preemption: %s\n",
+ deviceProp.computePreemptionSupported ? "Yes" : "No");
+ printf(" Supports Cooperative Kernel Launch: %s\n",
+ deviceProp.cooperativeLaunch ? "Yes" : "No");
+ printf(" Supports MultiDevice Co-op Kernel Launch: %s\n",
+ deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
+ printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n",
+ deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
+
+ const char *sComputeMode[] = {
+ "Default (multiple host threads can use ::cudaSetDevice() with device "
+ "simultaneously)",
+ "Exclusive (only one host thread in one process is able to use "
+ "::cudaSetDevice() with this device)",
+ "Prohibited (no host thread can use ::cudaSetDevice() with this "
+ "device)",
+ "Exclusive Process (many threads in one process is able to use "
+ "::cudaSetDevice() with this device)",
+ "Unknown",
+ NULL};
+ printf(" Compute Mode:\n");
+ printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
+ }
+
+ // If there are 2 or more GPUs, query to determine whether RDMA is supported
+ if (deviceCount >= 2) {
+ cudaDeviceProp prop[64];
+ int gpuid[64]; // we want to find the first two GPUs that can support P2P
+ int gpu_p2p_count = 0;
+
+ for (int i = 0; i < deviceCount; i++) {
+ checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
+
+ // Only boards based on Fermi or later can support P2P
+ if ((prop[i].major >= 2)
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+ // on Windows (64-bit), the Tesla Compute Cluster driver for windows
+ // must be enabled to support this
+ && prop[i].tccDriver
+#endif
+ ) {
+ // This is an array of P2P capable GPUs
+ gpuid[gpu_p2p_count++] = i;
+ }
+ }
+
+ // Show all the combinations of support P2P GPUs
+ int can_access_peer;
+
+ if (gpu_p2p_count >= 2) {
+ for (int i = 0; i < gpu_p2p_count; i++) {
+ for (int j = 0; j < gpu_p2p_count; j++) {
+ if (gpuid[i] == gpuid[j]) {
+ continue;
+ }
+ checkCudaErrors(
+ cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
+ printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
+ prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j],
+ can_access_peer ? "Yes" : "No");
+ }
+ }
+ }
+ }
+
+ // csv masterlog info
+ // *****************************
+ // exe and CUDA driver name
+ printf("\n");
+ std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
+ char cTemp[16];
+
+ // driver version
+ sProfileString += ", CUDA Driver Version = ";
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+ sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
+#else
+ snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000,
+ (driverVersion % 100) / 10);
+#endif
+ sProfileString += cTemp;
+
+ // Runtime version
+ sProfileString += ", CUDA Runtime Version = ";
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+ sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
+#else
+ snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000,
+ (runtimeVersion % 100) / 10);
+#endif
+ sProfileString += cTemp;
+
+ // Device count
+ sProfileString += ", NumDevs = ";
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+ sprintf_s(cTemp, 10, "%d", deviceCount);
+#else
+ snprintf(cTemp, sizeof(cTemp), "%d", deviceCount);
+#endif
+ sProfileString += cTemp;
+ sProfileString += "\n";
+ printf("%s", sProfileString.c_str());
+
+ printf("Result = PASS\n");
+
+ // finish
+ exit(EXIT_SUCCESS);
+}
diff --git a/deviceQueryDrv.cpp b/deviceQueryDrv.cpp
new file mode 100644
index 0000000..5d4f662
--- /dev/null
+++ b/deviceQueryDrv.cpp
@@ -0,0 +1,281 @@
+//
+// Created by kwoodle on 6/9/20.
+//
+
+/*
+ * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/* This sample queries the properties of the CUDA devices present in the system. */
+
+// includes, system
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cuda.h>
+#include <helper_cuda_drvapi.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int
+main(int argc, char **argv) {
+ CUdevice dev;
+ int major = 0, minor = 0;
+ int deviceCount = 0;
+ char deviceName[256];
+
+ printf("%s Starting...\n\n", argv[0]);
+
+ // note your project will need to link with cuda.lib files on windows
+ printf("CUDA Device Query (Driver API) statically linked version \n");
+
+ checkCudaErrors(cuInit(0));
+
+ checkCudaErrors(cuDeviceGetCount(&deviceCount));
+
+ // This function call returns 0 if there are no CUDA capable devices.
+ if (deviceCount == 0) {
+ printf("There are no available device(s) that support CUDA\n");
+ } else {
+ printf("Detected %d CUDA Capable device(s)\n", deviceCount);
+ }
+
+ for (dev = 0; dev < deviceCount; ++dev) {
+ checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
+ checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
+
+ checkCudaErrors(cuDeviceGetName(deviceName, 256, dev));
+
+ printf("\nDevice %d: \"%s\"\n", dev, deviceName);
+
+ int driverVersion = 0;
+ checkCudaErrors(cuDriverGetVersion(&driverVersion));
+ printf(" CUDA Driver Version: %d.%d\n", driverVersion / 1000,
+ (driverVersion % 100) / 10);
+ printf(" CUDA Capability Major/Minor version number: %d.%d\n", major, minor);
+
+ size_t totalGlobalMem;
+ checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, dev));
+
+ char msg[256];
+ SPRINTF(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n",
+ (float) totalGlobalMem / 1048576.0f, (unsigned long long) totalGlobalMem);
+ printf("%s", msg);
+
+ int multiProcessorCount;
+ getCudaAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
+
+ printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
+ multiProcessorCount, _ConvertSMVer2CoresDRV(major, minor),
+ _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);
+
+ int clockRate;
+ getCudaAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
+ printf(" GPU Max Clock rate: %.0f MHz (%0.2f GHz)\n", clockRate * 1e-3f,
+ clockRate * 1e-6f);
+ int memoryClock;
+ getCudaAttribute(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
+ printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
+ int memBusWidth;
+ getCudaAttribute(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
+ printf(" Memory Bus Width: %d-bit\n", memBusWidth);
+ int L2CacheSize;
+ getCudaAttribute(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
+
+ if (L2CacheSize) {
+ printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
+ }
+
+ int maxTex1D, maxTex2D[2], maxTex3D[3];
+ getCudaAttribute(&maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
+ getCudaAttribute(&maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
+ getCudaAttribute(&maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
+ getCudaAttribute(&maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
+ getCudaAttribute(&maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
+ getCudaAttribute(&maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
+ printf(" Max Texture Dimension Sizes 1D=(%d) 2D=(%d, %d) 3D=(%d, %d, %d)\n",
+ maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1], maxTex3D[2]);
+
+ int maxTex1DLayered[2];
+ getCudaAttribute(&maxTex1DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, dev);
+ getCudaAttribute(&maxTex1DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, dev);
+ printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
+ maxTex1DLayered[0], maxTex1DLayered[1]);
+
+ int maxTex2DLayered[3];
+ getCudaAttribute(&maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev);
+ getCudaAttribute(&maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev);
+ getCudaAttribute(&maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev);
+ printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d layers\n",
+ maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]);
+
+ int totalConstantMemory;
+ getCudaAttribute(&totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
+ printf(" Total amount of constant memory: %u bytes\n", totalConstantMemory);
+ int sharedMemPerBlock;
+ getCudaAttribute(&sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
+ printf(" Total amount of shared memory per block: %u bytes\n", sharedMemPerBlock);
+ int regsPerBlock;
+        getCudaAttribute(&regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
+ printf(" Total number of registers available per block: %d\n", regsPerBlock);
+ int warpSize;
+ getCudaAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
+ printf(" Warp size: %d\n", warpSize);
+ int maxThreadsPerMultiProcessor;
+ getCudaAttribute(&maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
+ printf(" Maximum number of threads per multiprocessor: %d\n", maxThreadsPerMultiProcessor);
+ int maxThreadsPerBlock;
+ getCudaAttribute(&maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
+ printf(" Maximum number of threads per block: %d\n", maxThreadsPerBlock);
+
+ int blockDim[3];
+ getCudaAttribute(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);
+ getCudaAttribute(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);
+ getCudaAttribute(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);
+ printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", blockDim[0], blockDim[1], blockDim[2]);
+ int gridDim[3];
+ getCudaAttribute(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
+ getCudaAttribute(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
+ getCudaAttribute(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
+ printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", gridDim[0], gridDim[1], gridDim[2]);
+
+ int textureAlign;
+ getCudaAttribute(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev);
+ printf(" Texture alignment: %u bytes\n", textureAlign);
+
+ int memPitch;
+ getCudaAttribute(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);
+ printf(" Maximum memory pitch: %u bytes\n", memPitch);
+
+ int gpuOverlap;
+ getCudaAttribute(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
+
+ int asyncEngineCount;
+ getCudaAttribute(&asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
+ printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n",
+ (gpuOverlap ? "Yes" : "No"), asyncEngineCount);
+
+ int kernelExecTimeoutEnabled;
+ getCudaAttribute(&kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
+ printf(" Run time limit on kernels: %s\n", kernelExecTimeoutEnabled ? "Yes" : "No");
+ int integrated;
+ getCudaAttribute(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
+ printf(" Integrated GPU sharing Host Memory: %s\n", integrated ? "Yes" : "No");
+ int canMapHostMemory;
+ getCudaAttribute(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
+ printf(" Support host page-locked memory mapping: %s\n", canMapHostMemory ? "Yes" : "No");
+
+ int concurrentKernels;
+ getCudaAttribute(&concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
+ printf(" Concurrent kernel execution: %s\n", concurrentKernels ? "Yes" : "No");
+
+ int surfaceAlignment;
+ getCudaAttribute(&surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);
+ printf(" Alignment requirement for Surfaces: %s\n", surfaceAlignment ? "Yes" : "No");
+
+ int eccEnabled;
+ getCudaAttribute(&eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev);
+ printf(" Device has ECC support: %s\n", eccEnabled ? "Enabled" : "Disabled");
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+ int tccDriver ;
+ getCudaAttribute(&tccDriver , CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev);
+ printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
+#endif
+
+ int unifiedAddressing;
+ getCudaAttribute(&unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
+ printf(" Device supports Unified Addressing (UVA): %s\n", unifiedAddressing ? "Yes" : "No");
+
+ int computePreemption;
+ getCudaAttribute(&computePreemption, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, dev);
+ printf(" Device supports Compute Preemption: %s\n", computePreemption ? "Yes" : "No");
+
+ int cooperativeLaunch;
+ getCudaAttribute(&cooperativeLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
+ printf(" Supports Cooperative Kernel Launch: %s\n", cooperativeLaunch ? "Yes" : "No");
+
+ int cooperativeMultiDevLaunch;
+ getCudaAttribute(&cooperativeMultiDevLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH, dev);
+ printf(" Supports MultiDevice Co-op Kernel Launch: %s\n", cooperativeMultiDevLaunch ? "Yes" : "No");
+
+ int pciDomainID, pciBusID, pciDeviceID;
+ getCudaAttribute(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev);
+ getCudaAttribute(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
+ getCudaAttribute(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
+ printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", pciDomainID, pciBusID, pciDeviceID);
+
+ const char *sComputeMode[] =
+ {
+ "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
+ "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
+ "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
+ "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
+ "Unknown",
+ NULL
+ };
+
+ int computeMode;
+ getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
+ printf(" Compute Mode:\n");
+ printf(" < %s >\n", sComputeMode[computeMode]);
+ }
+
+
+ // If there are 2 or more GPUs, query to determine whether RDMA is supported
+ if (deviceCount >= 2) {
+ int gpuid[64]; // we want to find the first two GPUs that can support P2P
+ int gpu_p2p_count = 0;
+ int tccDriver = 0;
+
+ for (int i = 0; i < deviceCount; i++) {
+ checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, i));
+ checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, i));
+ getCudaAttribute(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i);
+
+ // Only boards based on Fermi or later can support P2P
+ if ((major >= 2)
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+ // on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to support this
+ && tccDriver
+#endif
+ ) {
+ // This is an array of P2P capable GPUs
+ gpuid[gpu_p2p_count++] = i;
+ }
+ }
+
+ // Show all the combinations of support P2P GPUs
+ int can_access_peer;
+ char deviceName0[256], deviceName1[256];
+
+ if (gpu_p2p_count >= 2) {
+ for (int i = 0; i < gpu_p2p_count; i++) {
+ for (int j = 0; j < gpu_p2p_count; j++) {
+ if (gpuid[i] == gpuid[j]) {
+ continue;
+ }
+ checkCudaErrors(cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
+ checkCudaErrors(cuDeviceGetName(deviceName0, 256, gpuid[i]));
+ checkCudaErrors(cuDeviceGetName(deviceName1, 256, gpuid[j]));
+ printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : %s\n", deviceName0, gpuid[i],
+ deviceName1, gpuid[j],
+ can_access_peer ? "Yes" : "No");
+ }
+ }
+ }
+ }
+
+ printf("Result = PASS\n");
+
+ exit(EXIT_SUCCESS);
+}
\ No newline at end of file
diff --git a/main.cu b/main.cu
deleted file mode 100644
index bc8f460..0000000
--- a/main.cu
+++ /dev/null
@@ -1,6 +0,0 @@
-#include <iostream>
-
-int main() {
- std::cout << "Hello, World!" << std::endl;
- return 0;
-}