From f4244303536cb84cb6a68b989508b720f0ecdcc5 Mon Sep 17 00:00:00 2001 From: kwoodle Date: Tue, 12 May 2020 10:39:20 -0400 Subject: [PATCH] Added simpleGL and vulkanCUDASinewave --- CMakeLists.txt | 14 +- linmath.h | 505 ++++++++++++ shader_sine.frag | 11 + shader_sine.vert | 23 + simpleGL.cu | 590 ++++++++++++++ vulkanCUDASinewave.cu | 1711 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 2853 insertions(+), 1 deletion(-) create mode 100644 linmath.h create mode 100644 shader_sine.frag create mode 100644 shader_sine.vert create mode 100644 simpleGL.cu create mode 100644 vulkanCUDASinewave.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index c97a31a..eacaef3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,10 +23,22 @@ add_executable(asyncAPI asyncAPI.cu) find_package(MPI REQUIRED) include_directories(${MPI_CXX_INCLUDE_DIRS}) -add_executable(simpleMPI simpleMPI.cpp simpleMPI.h simpleMPI.cu) +add_executable(simpleMPI simpleMPI.cpp simpleMPI.cu) target_link_libraries(simpleMPI ${MPI_CXX_LIBRARIES}) add_executable(cudaTensorCoreGemm cudaTensorCoreGemm.cu) target_link_libraries(cudaTensorCoreGemm) add_executable(bandwidthTest bandwidthTest.cu) + +find_package(OpenGL REQUIRED) +add_executable(simpleGL simpleGL.cu) +target_link_libraries(simpleGL GL GLU glut) + +set(VULKANSDK = /kws_space/vulkansdk_1.2.135.0/x86_64) +add_executable(simpleVulkan vulkanCUDASinewave.cu) +target_include_directories(simpleVulkan PRIVATE "${VULKANSDK}/include" ) +target_link_directories(simpleVulkan PRIVATE "${VULKANSDK}/lib") +target_link_libraries(simpleVulkan vulkan glfw) + +add_executable(interval ) \ No newline at end of file diff --git a/linmath.h b/linmath.h new file mode 100644 index 0000000..b4f4d16 --- /dev/null +++ b/linmath.h @@ -0,0 +1,505 @@ +// +// Created by kwoodle on 5/12/20. +// + +/* + * Copyright (c) 2015-2016 The Khronos Group Inc. + * Copyright (c) 2015-2016 Valve Corporation + * Copyright (c) 2015-2016 LunarG, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Relicensed from the WTFPL (http://www.wtfpl.net/faq/). + */ + +#ifndef LINMATH_H +#define LINMATH_H + +#include + +// Converts degrees to radians. +#define degreesToRadians(angleDegrees) (angleDegrees * M_PI / 180.0) + +// Converts radians to degrees. +#define radiansToDegrees(angleRadians) (angleRadians * 180.0 / M_PI) + +typedef float vec3[3]; +static inline void vec3_add(vec3 r, vec3 const a, vec3 const b) { + int i; + for (i = 0; i < 3; ++i) r[i] = a[i] + b[i]; +} +static inline void vec3_sub(vec3 r, vec3 const a, vec3 const b) { + int i; + for (i = 0; i < 3; ++i) r[i] = a[i] - b[i]; +} +static inline void vec3_scale(vec3 r, vec3 const v, float const s) { + int i; + for (i = 0; i < 3; ++i) r[i] = v[i] * s; +} +static inline float vec3_mul_inner(vec3 const a, vec3 const b) { + float p = 0.f; + int i; + for (i = 0; i < 3; ++i) p += b[i] * a[i]; + return p; +} +static inline void vec3_mul_cross(vec3 r, vec3 const a, vec3 const b) { + r[0] = a[1] * b[2] - a[2] * b[1]; + r[1] = a[2] * b[0] - a[0] * b[2]; + r[2] = a[0] * b[1] - a[1] * b[0]; +} +static inline float vec3_len(vec3 const v) { return sqrtf(vec3_mul_inner(v, v)); } +static inline void vec3_norm(vec3 r, vec3 const v) { + float k = 1.f / vec3_len(v); + vec3_scale(r, v, k); +} +static inline void vec3_reflect(vec3 r, vec3 const v, vec3 const n) { + float p = 2.f * vec3_mul_inner(v, n); + int i; + for (i = 0; i < 3; ++i) r[i] = v[i] - p * n[i]; +} + +typedef float vec4[4]; +static inline void vec4_add(vec4 r, vec4 const a, vec4 const b) { + int i; + for (i = 0; i < 4; ++i) r[i] = a[i] + b[i]; +} +static inline void vec4_sub(vec4 r, vec4 const a, vec4 const b) { + int i; + for (i = 0; i < 4; ++i) r[i] = a[i] - b[i]; +} +static inline void vec4_scale(vec4 r, vec4 v, float s) { + int i; + for (i = 0; i < 4; ++i) r[i] = v[i] * s; +} +static inline float vec4_mul_inner(vec4 a, vec4 b) { + float p = 0.f; + int i; + for (i = 0; i < 4; ++i) p += b[i] * a[i]; + return p; +} +static inline void vec4_mul_cross(vec4 r, vec4 a, vec4 b) { + r[0] = a[1] * b[2] - a[2] * b[1]; + r[1] = a[2] * b[0] - a[0] * b[2]; + r[2] = a[0] * b[1] - a[1] * b[0]; + r[3] = 1.f; +} +static inline float vec4_len(vec4 v) { return sqrtf(vec4_mul_inner(v, v)); } +static inline void vec4_norm(vec4 r, vec4 v) { + float k = 1.f / vec4_len(v); + vec4_scale(r, v, k); +} +static inline void vec4_reflect(vec4 r, vec4 v, vec4 n) { + float p = 2.f * vec4_mul_inner(v, n); + int i; + for (i = 0; i < 4; ++i) r[i] = v[i] - p * n[i]; +} + +typedef vec4 mat4x4[4]; +static inline void mat4x4_identity(mat4x4 M) { + int i, j; + for (i = 0; i < 4; ++i) + for (j = 0; j < 4; ++j) M[i][j] = i == j ? 1.f : 0.f; +} +static inline void mat4x4_dup(mat4x4 M, mat4x4 N) { + int i, j; + for (i = 0; i < 4; ++i) + for (j = 0; j < 4; ++j) M[i][j] = N[i][j]; +} +static inline void mat4x4_row(vec4 r, mat4x4 M, int i) { + int k; + for (k = 0; k < 4; ++k) r[k] = M[k][i]; +} +static inline void mat4x4_col(vec4 r, mat4x4 M, int i) { + int k; + for (k = 0; k < 4; ++k) r[k] = M[i][k]; +} +static inline void mat4x4_transpose(mat4x4 M, mat4x4 N) { + int i, j; + for (j = 0; j < 4; ++j) + for (i = 0; i < 4; ++i) M[i][j] = N[j][i]; +} +static inline void mat4x4_add(mat4x4 M, mat4x4 a, mat4x4 b) { + int i; + for (i = 0; i < 4; ++i) vec4_add(M[i], a[i], b[i]); +} +static inline void mat4x4_sub(mat4x4 M, mat4x4 a, mat4x4 b) { + int i; + for (i = 0; i < 4; ++i) vec4_sub(M[i], a[i], b[i]); +} +static inline void mat4x4_scale(mat4x4 M, mat4x4 a, float k) { + int i; + for (i = 0; i < 4; ++i) vec4_scale(M[i], a[i], k); +} +static inline void mat4x4_scale_aniso(mat4x4 M, mat4x4 a, float x, float y, float z) { + int i; + vec4_scale(M[0], a[0], x); + vec4_scale(M[1], a[1], y); + vec4_scale(M[2], a[2], z); + for (i = 0; i < 4; ++i) { + M[3][i] = a[3][i]; + } +} +static inline void mat4x4_mul(mat4x4 M, mat4x4 a, mat4x4 b) { + int k, r, c; + for (c = 0; c < 4; ++c) + for (r = 0; r < 4; ++r) { + M[c][r] = 0.f; + for (k = 0; k < 4; ++k) M[c][r] += a[k][r] * b[c][k]; + } +} +static inline void mat4x4_mul_vec4(vec4 r, mat4x4 M, vec4 v) { + int i, j; + for (j = 0; j < 4; ++j) { + r[j] = 0.f; + for (i = 0; i < 4; ++i) r[j] += M[i][j] * v[i]; + } +} +static inline void mat4x4_translate(mat4x4 T, float x, float y, float z) { + mat4x4_identity(T); + T[3][0] = x; + T[3][1] = y; + T[3][2] = z; +} +static inline void mat4x4_translate_in_place(mat4x4 M, float x, float y, float z) { + vec4 t = {x, y, z, 0}; + vec4 r; + int i; + for (i = 0; i < 4; ++i) { + mat4x4_row(r, M, i); + M[3][i] += vec4_mul_inner(r, t); + } +} +static inline void mat4x4_from_vec3_mul_outer(mat4x4 M, vec3 a, vec3 b) { + int i, j; + for (i = 0; i < 4; ++i) + for (j = 0; j < 4; ++j) M[i][j] = i < 3 && j < 3 ? a[i] * b[j] : 0.f; +} +static inline void mat4x4_rotate(mat4x4 R, mat4x4 M, float x, float y, float z, float angle) { + float s = sinf(angle); + float c = cosf(angle); + vec3 u = {x, y, z}; + + if (vec3_len(u) > 1e-4) { + vec3_norm(u, u); + mat4x4 T; + mat4x4_from_vec3_mul_outer(T, u, u); + + mat4x4 S = {{0, u[2], -u[1], 0}, {-u[2], 0, u[0], 0}, {u[1], -u[0], 0, 0}, {0, 0, 0, 0}}; + mat4x4_scale(S, S, s); + + mat4x4 C; + mat4x4_identity(C); + mat4x4_sub(C, C, T); + + mat4x4_scale(C, C, c); + + mat4x4_add(T, T, C); + mat4x4_add(T, T, S); + + T[3][3] = 1.; + mat4x4_mul(R, M, T); + } else { + mat4x4_dup(R, M); + } +} +static inline void mat4x4_rotate_X(mat4x4 Q, mat4x4 M, float angle) { + float s = sinf(angle); + float c = cosf(angle); + mat4x4 R = {{1.f, 0.f, 0.f, 0.f}, {0.f, c, s, 0.f}, {0.f, -s, c, 0.f}, {0.f, 0.f, 0.f, 1.f}}; + mat4x4_mul(Q, M, R); +} +static inline void mat4x4_rotate_Y(mat4x4 Q, mat4x4 M, float angle) { + float s = sinf(angle); + float c = cosf(angle); + mat4x4 R = {{c, 0.f, s, 0.f}, {0.f, 1.f, 0.f, 0.f}, {-s, 0.f, c, 0.f}, {0.f, 0.f, 0.f, 1.f}}; + mat4x4_mul(Q, M, R); +} +static inline void mat4x4_rotate_Z(mat4x4 Q, mat4x4 M, float angle) { + float s = sinf(angle); + float c = cosf(angle); + mat4x4 R = {{c, s, 0.f, 0.f}, {-s, c, 0.f, 0.f}, {0.f, 0.f, 1.f, 0.f}, {0.f, 0.f, 0.f, 1.f}}; + mat4x4_mul(Q, M, R); +} +static inline void mat4x4_invert(mat4x4 T, mat4x4 M) { + float s[6]; + float c[6]; + s[0] = M[0][0] * M[1][1] - M[1][0] * M[0][1]; + s[1] = M[0][0] * M[1][2] - M[1][0] * M[0][2]; + s[2] = M[0][0] * M[1][3] - M[1][0] * M[0][3]; + s[3] = M[0][1] * M[1][2] - M[1][1] * M[0][2]; + s[4] = M[0][1] * M[1][3] - M[1][1] * M[0][3]; + s[5] = M[0][2] * M[1][3] - M[1][2] * M[0][3]; + + c[0] = M[2][0] * M[3][1] - M[3][0] * M[2][1]; + c[1] = M[2][0] * M[3][2] - M[3][0] * M[2][2]; + c[2] = M[2][0] * M[3][3] - M[3][0] * M[2][3]; + c[3] = M[2][1] * M[3][2] - M[3][1] * M[2][2]; + c[4] = M[2][1] * M[3][3] - M[3][1] * M[2][3]; + c[5] = M[2][2] * M[3][3] - M[3][2] * M[2][3]; + + /* Assumes it is invertible */ + float idet = 1.0f / (s[0] * c[5] - s[1] * c[4] + s[2] * c[3] + s[3] * c[2] - s[4] * c[1] + s[5] * c[0]); + + T[0][0] = (M[1][1] * c[5] - M[1][2] * c[4] + M[1][3] * c[3]) * idet; + T[0][1] = (-M[0][1] * c[5] + M[0][2] * c[4] - M[0][3] * c[3]) * idet; + T[0][2] = (M[3][1] * s[5] - M[3][2] * s[4] + M[3][3] * s[3]) * idet; + T[0][3] = (-M[2][1] * s[5] + M[2][2] * s[4] - M[2][3] * s[3]) * idet; + + T[1][0] = (-M[1][0] * c[5] + M[1][2] * c[2] - M[1][3] * c[1]) * idet; + T[1][1] = (M[0][0] * c[5] - M[0][2] * c[2] + M[0][3] * c[1]) * idet; + T[1][2] = (-M[3][0] * s[5] + M[3][2] * s[2] - M[3][3] * s[1]) * idet; + T[1][3] = (M[2][0] * s[5] - M[2][2] * s[2] + M[2][3] * s[1]) * idet; + + T[2][0] = (M[1][0] * c[4] - M[1][1] * c[2] + M[1][3] * c[0]) * idet; + T[2][1] = (-M[0][0] * c[4] + M[0][1] * c[2] - M[0][3] * c[0]) * idet; + T[2][2] = (M[3][0] * s[4] - M[3][1] * s[2] + M[3][3] * s[0]) * idet; + T[2][3] = (-M[2][0] * s[4] + M[2][1] * s[2] - M[2][3] * s[0]) * idet; + + T[3][0] = (-M[1][0] * c[3] + M[1][1] * c[1] - M[1][2] * c[0]) * idet; + T[3][1] = (M[0][0] * c[3] - M[0][1] * c[1] + M[0][2] * c[0]) * idet; + T[3][2] = (-M[3][0] * s[3] + M[3][1] * s[1] - M[3][2] * s[0]) * idet; + T[3][3] = (M[2][0] * s[3] - M[2][1] * s[1] + M[2][2] * s[0]) * idet; +} +static inline void mat4x4_orthonormalize(mat4x4 R, mat4x4 M) { + mat4x4_dup(R, M); + float s = 1.; + vec3 h; + + vec3_norm(R[2], R[2]); + + s = vec3_mul_inner(R[1], R[2]); + vec3_scale(h, R[2], s); + vec3_sub(R[1], R[1], h); + vec3_norm(R[2], R[2]); + + s = vec3_mul_inner(R[1], R[2]); + vec3_scale(h, R[2], s); + vec3_sub(R[1], R[1], h); + vec3_norm(R[1], R[1]); + + s = vec3_mul_inner(R[0], R[1]); + vec3_scale(h, R[1], s); + vec3_sub(R[0], R[0], h); + vec3_norm(R[0], R[0]); +} + +static inline void mat4x4_frustum(mat4x4 M, float l, float r, float b, float t, float n, float f) { + M[0][0] = 2.f * n / (r - l); + M[0][1] = M[0][2] = M[0][3] = 0.f; + + M[1][1] = 2.f * n / (t - b); + M[1][0] = M[1][2] = M[1][3] = 0.f; + + M[2][0] = (r + l) / (r - l); + M[2][1] = (t + b) / (t - b); + M[2][2] = -(f + n) / (f - n); + M[2][3] = -1.f; + + M[3][2] = -2.f * (f * n) / (f - n); + M[3][0] = M[3][1] = M[3][3] = 0.f; +} +static inline void mat4x4_ortho(mat4x4 M, float l, float r, float b, float t, float n, float f) { + M[0][0] = 2.f / (r - l); + M[0][1] = M[0][2] = M[0][3] = 0.f; + + M[1][1] = 2.f / (t - b); + M[1][0] = M[1][2] = M[1][3] = 0.f; + + M[2][2] = -2.f / (f - n); + M[2][0] = M[2][1] = M[2][3] = 0.f; + + M[3][0] = -(r + l) / (r - l); + M[3][1] = -(t + b) / (t - b); + M[3][2] = -(f + n) / (f - n); + M[3][3] = 1.f; +} +static inline void mat4x4_perspective(mat4x4 m, float y_fov, float aspect, float n, float f) { + /* NOTE: Degrees are an unhandy unit to work with. + * linmath.h uses radians for everything! */ + float const a = (float)(1.f / tan(y_fov / 2.f)); + + m[0][0] = a / aspect; + m[0][1] = 0.f; + m[0][2] = 0.f; + m[0][3] = 0.f; + + m[1][0] = 0.f; + m[1][1] = a; + m[1][2] = 0.f; + m[1][3] = 0.f; + + m[2][0] = 0.f; + m[2][1] = 0.f; + m[2][2] = -((f + n) / (f - n)); + m[2][3] = -1.f; + + m[3][0] = 0.f; + m[3][1] = 0.f; + m[3][2] = -((2.f * f * n) / (f - n)); + m[3][3] = 0.f; +} +static inline void mat4x4_look_at(mat4x4 m, vec3 eye, vec3 center, vec3 up) { + /* Adapted from Android's OpenGL Matrix.java. */ + /* See the OpenGL GLUT documentation for gluLookAt for a description */ + /* of the algorithm. We implement it in a straightforward way: */ + + /* TODO: The negation of of can be spared by swapping the order of + * operands in the following cross products in the right way. */ + vec3 f; + vec3_sub(f, center, eye); + vec3_norm(f, f); + + vec3 s; + vec3_mul_cross(s, f, up); + vec3_norm(s, s); + + vec3 t; + vec3_mul_cross(t, s, f); + + m[0][0] = s[0]; + m[0][1] = t[0]; + m[0][2] = -f[0]; + m[0][3] = 0.f; + + m[1][0] = s[1]; + m[1][1] = t[1]; + m[1][2] = -f[1]; + m[1][3] = 0.f; + + m[2][0] = s[2]; + m[2][1] = t[2]; + m[2][2] = -f[2]; + m[2][3] = 0.f; + + m[3][0] = 0.f; + m[3][1] = 0.f; + m[3][2] = 0.f; + m[3][3] = 1.f; + + mat4x4_translate_in_place(m, -eye[0], -eye[1], -eye[2]); +} + +typedef float quat[4]; +static inline void quat_identity(quat q) { + q[0] = q[1] = q[2] = 0.f; + q[3] = 1.f; +} +static inline void quat_add(quat r, quat a, quat b) { + int i; + for (i = 0; i < 4; ++i) r[i] = a[i] + b[i]; +} +static inline void quat_sub(quat r, quat a, quat b) { + int i; + for (i = 0; i < 4; ++i) r[i] = a[i] - b[i]; +} +static inline void quat_mul(quat r, quat p, quat q) { + vec3 w; + vec3_mul_cross(r, p, q); + vec3_scale(w, p, q[3]); + vec3_add(r, r, w); + vec3_scale(w, q, p[3]); + vec3_add(r, r, w); + r[3] = p[3] * q[3] - vec3_mul_inner(p, q); +} +static inline void quat_scale(quat r, quat v, float s) { + int i; + for (i = 0; i < 4; ++i) r[i] = v[i] * s; +} +static inline float quat_inner_product(quat a, quat b) { + float p = 0.f; + int i; + for (i = 0; i < 4; ++i) p += b[i] * a[i]; + return p; +} +static inline void quat_conj(quat r, quat q) { + int i; + for (i = 0; i < 3; ++i) r[i] = -q[i]; + r[3] = q[3]; +} +#define quat_norm vec4_norm +static inline void quat_mul_vec3(vec3 r, quat q, vec3 v) { + quat v_ = {v[0], v[1], v[2], 0.f}; + + quat_conj(r, q); + quat_norm(r, r); + quat_mul(r, v_, r); + quat_mul(r, q, r); +} +static inline void mat4x4_from_quat(mat4x4 M, quat q) { + float a = q[3]; + float b = q[0]; + float c = q[1]; + float d = q[2]; + float a2 = a * a; + float b2 = b * b; + float c2 = c * c; + float d2 = d * d; + + M[0][0] = a2 + b2 - c2 - d2; + M[0][1] = 2.f * (b * c + a * d); + M[0][2] = 2.f * (b * d - a * c); + M[0][3] = 0.f; + + M[1][0] = 2 * (b * c - a * d); + M[1][1] = a2 - b2 + c2 - d2; + M[1][2] = 2.f * (c * d + a * b); + M[1][3] = 0.f; + + M[2][0] = 2.f * (b * d + a * c); + M[2][1] = 2.f * (c * d - a * b); + M[2][2] = a2 - b2 - c2 + d2; + M[2][3] = 0.f; + + M[3][0] = M[3][1] = M[3][2] = 0.f; + M[3][3] = 1.f; +} + +static inline void mat4x4o_mul_quat(mat4x4 R, mat4x4 M, quat q) { + /* XXX: The way this is written only works for othogonal matrices. */ + /* TODO: Take care of non-orthogonal case. */ + quat_mul_vec3(R[0], q, M[0]); + quat_mul_vec3(R[1], q, M[1]); + quat_mul_vec3(R[2], q, M[2]); + + R[3][0] = R[3][1] = R[3][2] = 0.f; + R[3][3] = 1.f; +} +static inline void quat_from_mat4x4(quat q, mat4x4 M) { + float r = 0.f; + int i; + + int perm[] = {0, 1, 2, 0, 1}; + int *p = perm; + + for (i = 0; i < 3; i++) { + float m = M[i][i]; + if (m < r) continue; + m = r; + p = &perm[i]; + } + + r = sqrtf(1.f + M[p[0]][p[0]] - M[p[1]][p[1]] - M[p[2]][p[2]]); + + if (r < 1e-6) { + q[0] = 1.f; + q[1] = q[2] = q[3] = 0.f; + return; + } + + q[0] = r / 2.f; + q[1] = (M[p[0]][p[1]] - M[p[1]][p[0]]) / (2.f * r); + q[2] = (M[p[2]][p[0]] - M[p[0]][p[2]]) / (2.f * r); + q[3] = (M[p[2]][p[1]] - M[p[1]][p[2]]) / (2.f * r); +} + +#endif diff --git a/shader_sine.frag b/shader_sine.frag new file mode 100644 index 0000000..b096569 --- /dev/null +++ b/shader_sine.frag @@ -0,0 +1,11 @@ +#version 450 +#extension GL_ARB_separate_shader_objects : enable +#extension GL_NV_gpu_shader5 : enable + +layout(location = 0) in vec3 fragColor; + +layout(location = 0) out vec4 outColor; + +void main() { + outColor = vec4(fragColor, 1.0); +} \ No newline at end of file diff --git a/shader_sine.vert b/shader_sine.vert new file mode 100644 index 0000000..849558b --- /dev/null +++ b/shader_sine.vert @@ -0,0 +1,23 @@ +#version 450 +#extension GL_ARB_separate_shader_objects : enable +#extension GL_NV_gpu_shader5 : enable + +layout(binding = 0) uniform UniformBufferObject { + mat4 model; + mat4 view; + mat4 proj; +} ubo; + +layout(location = 0) in vec4 inPosition; +layout(location = 1) in vec3 inColor; + +layout(location = 0) out vec3 fragColor; + +out gl_PerVertex { + vec4 gl_Position; +}; + +void main() { + gl_Position = ubo.proj * ubo.view * ubo.model * inPosition; + fragColor = inColor; +} \ No newline at end of file diff --git a/simpleGL.cu b/simpleGL.cu new file mode 100644 index 0000000..e38e4fe --- /dev/null +++ b/simpleGL.cu @@ -0,0 +1,590 @@ +// +// Created by kwoodle on 5/12/20. +// +//////////////////////////////////////////////////////////////////////////// +// +// Copyright 1993-2015 NVIDIA Corporation. All rights reserved. +// +// Please refer to the NVIDIA end user license agreement (EULA) associated +// with this source code for terms and conditions that govern your use of +// this software. Any use, reproduction, disclosure, or distribution of +// this software and related documentation outside the terms of the EULA +// is strictly prohibited. +// +//////////////////////////////////////////////////////////////////////////// + +/* + This example demonstrates how to use the Cuda OpenGL bindings to + dynamically modify a vertex buffer using a Cuda kernel. + + The steps are: + 1. Create an empty vertex buffer object (VBO) + 2. Register the VBO with Cuda + 3. Map the VBO for writing from Cuda + 4. Run Cuda kernel to modify the vertex positions + 5. Unmap the VBO + 6. Render the results using OpenGL + + Host code +*/ + +// includes, system +#include +#include +#include +#include + +#ifdef _WIN32 +# define WINDOWS_LEAN_AND_MEAN +# define NOMINMAX +# include +#endif + +// OpenGL Graphics includes +#include +#if defined (__APPLE__) || defined(MACOSX) +#pragma clang diagnostic ignored "-Wdeprecated-declarations" + #include + #ifndef glutCloseFunc + #define glutCloseFunc glutWMCloseFunc + #endif +#else +#include +#endif + +// includes, cuda +#include +#include + +// Utilities and timing functions +#include // includes cuda.h and cuda_runtime_api.h + +// CUDA helper functions +#include // helper functions for CUDA error check + +#include + +#define MAX_EPSILON_ERROR 10.0f +#define THRESHOLD 0.30f +#define REFRESH_DELAY 10 //ms + +//////////////////////////////////////////////////////////////////////////////// +// constants +const unsigned int window_width = 512; +const unsigned int window_height = 512; + +const unsigned int mesh_width = 256; +const unsigned int mesh_height = 256; + +// vbo variables +GLuint vbo; +struct cudaGraphicsResource *cuda_vbo_resource; +void *d_vbo_buffer = NULL; + +float g_fAnim = 0.0; + +// mouse controls +int mouse_old_x, mouse_old_y; +int mouse_buttons = 0; +float rotate_x = 0.0, rotate_y = 0.0; +float translate_z = -3.0; + +StopWatchInterface *timer = NULL; + +// Auto-Verification Code +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 1; // FPS limit for sampling +int g_Index = 0; +float avgFPS = 0.0f; +unsigned int frameCount = 0; +unsigned int g_TotalErrors = 0; +bool g_bQAReadback = false; + +int *pArgc = NULL; +char **pArgv = NULL; + +#define MAX(a,b) ((a > b) ? a : b) + +//////////////////////////////////////////////////////////////////////////////// +// declaration, forward +bool runTest(int argc, char **argv, char *ref_file); +void cleanup(); + +// GL functionality +bool initGL(int *argc, char **argv); +void createVBO(GLuint *vbo, struct cudaGraphicsResource **vbo_res, + unsigned int vbo_res_flags); +void deleteVBO(GLuint *vbo, struct cudaGraphicsResource *vbo_res); + +// rendering callbacks +void display(); +void keyboard(unsigned char key, int x, int y); +void mouse(int button, int state, int x, int y); +void motion(int x, int y); +void timerEvent(int value); + +// Cuda functionality +void runCuda(struct cudaGraphicsResource **vbo_resource); +void runAutoTest(int devID, char **argv, char *ref_file); +void checkResultCuda(int argc, char **argv, const GLuint &vbo); + +const char *sSDKsample = "simpleGL (VBO)"; + +/////////////////////////////////////////////////////////////////////////////// +//! Simple kernel to modify vertex positions in sine wave pattern +//! @param data data in global memory +/////////////////////////////////////////////////////////////////////////////// +__global__ void simple_vbo_kernel(float4 *pos, unsigned int width, unsigned int height, float time) +{ + unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + // calculate uv coordinates + float u = x / (float) width; + float v = y / (float) height; + u = u*2.0f - 1.0f; + v = v*2.0f - 1.0f; + + // calculate simple sine wave pattern + float freq = 4.0f; + float w = sinf(u*freq + time) * cosf(v*freq + time) * 0.5f; + + // write output vertex + pos[y*width+x] = make_float4(u, w, v, 1.0f); +} + + +void launch_kernel(float4 *pos, unsigned int mesh_width, + unsigned int mesh_height, float time) +{ + // execute the kernel + dim3 block(8, 8, 1); + dim3 grid(mesh_width / block.x, mesh_height / block.y, 1); + simple_vbo_kernel<<< grid, block>>>(pos, mesh_width, mesh_height, time); +} + +bool checkHW(char *name, const char *gpuType, int dev) +{ + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, dev); + strcpy(name, deviceProp.name); + + if (!STRNCASECMP(deviceProp.name, gpuType, strlen(gpuType))) + { + return true; + } + else + { + return false; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) +{ + char *ref_file = NULL; + + pArgc = &argc; + pArgv = argv; + +#if defined(__linux__) + setenv ("DISPLAY", ":0", 0); +#endif + + printf("%s starting...\n", sSDKsample); + + if (argc > 1) + { + if (checkCmdLineFlag(argc, (const char **)argv, "file")) + { + // In this mode, we are running non-OpenGL and doing a compare of the VBO was generated correctly + getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file); + } + } + + printf("\n"); + + runTest(argc, argv, ref_file); + + printf("%s completed, returned %s\n", sSDKsample, (g_TotalErrors == 0) ? "OK" : "ERROR!"); + exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE); +} + +void computeFPS() +{ + frameCount++; + fpsCount++; + + if (fpsCount == fpsLimit) + { + avgFPS = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + fpsCount = 0; + fpsLimit = (int)MAX(avgFPS, 1.f); + + sdkResetTimer(&timer); + } + + char fps[256]; + sprintf(fps, "Cuda GL Interop (VBO): %3.1f fps (Max 100Hz)", avgFPS); + glutSetWindowTitle(fps); +} + +//////////////////////////////////////////////////////////////////////////////// +//! Initialize GL +//////////////////////////////////////////////////////////////////////////////// +bool initGL(int *argc, char **argv) +{ + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); + glutInitWindowSize(window_width, window_height); + glutCreateWindow("Cuda GL Interop (VBO)"); + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutMotionFunc(motion); + glutTimerFunc(REFRESH_DELAY, timerEvent,0); + + // initialize necessary OpenGL extensions + if (! isGLVersionSupported(2,0)) + { + fprintf(stderr, "ERROR: Support for necessary OpenGL extensions missing."); + fflush(stderr); + return false; + } + + // default initialization + glClearColor(0.0, 0.0, 0.0, 1.0); + glDisable(GL_DEPTH_TEST); + + // viewport + glViewport(0, 0, window_width, window_height); + + // projection + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + gluPerspective(60.0, (GLfloat)window_width / (GLfloat) window_height, 0.1, 10.0); + + SDK_CHECK_ERROR_GL(); + + return true; +} + + +//////////////////////////////////////////////////////////////////////////////// +//! Run a simple test for CUDA +//////////////////////////////////////////////////////////////////////////////// +bool runTest(int argc, char **argv, char *ref_file) +{ + // Create the CUTIL timer + sdkCreateTimer(&timer); + + // use command-line specified CUDA device, otherwise use device with highest Gflops/s + int devID = findCudaDevice(argc, (const char **)argv); + + // command line mode only + if (ref_file != NULL) + { + // create VBO + checkCudaErrors(cudaMalloc((void **)&d_vbo_buffer, mesh_width*mesh_height*4*sizeof(float))); + + // run the cuda part + runAutoTest(devID, argv, ref_file); + + // check result of Cuda step + checkResultCuda(argc, argv, vbo); + + cudaFree(d_vbo_buffer); + d_vbo_buffer = NULL; + } + else + { + // First initialize OpenGL context, so we can properly set the GL for CUDA. + // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop. + if (false == initGL(&argc, argv)) + { + return false; + } + + // register callbacks + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutMouseFunc(mouse); + glutMotionFunc(motion); +#if defined (__APPLE__) || defined(MACOSX) + atexit(cleanup); +#else + glutCloseFunc(cleanup); +#endif + + // create VBO + createVBO(&vbo, &cuda_vbo_resource, cudaGraphicsMapFlagsWriteDiscard); + + // run the cuda part + runCuda(&cuda_vbo_resource); + + // start rendering mainloop + glutMainLoop(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Run the Cuda part of the computation +//////////////////////////////////////////////////////////////////////////////// +void runCuda(struct cudaGraphicsResource **vbo_resource) +{ + // map OpenGL buffer object for writing from CUDA + float4 *dptr; + checkCudaErrors(cudaGraphicsMapResources(1, vbo_resource, 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes, + *vbo_resource)); + //printf("CUDA mapped VBO: May access %ld bytes\n", num_bytes); + + // execute the kernel + // dim3 block(8, 8, 1); + // dim3 grid(mesh_width / block.x, mesh_height / block.y, 1); + // kernel<<< grid, block>>>(dptr, mesh_width, mesh_height, g_fAnim); + + launch_kernel(dptr, mesh_width, mesh_height, g_fAnim); + + // unmap buffer object + checkCudaErrors(cudaGraphicsUnmapResources(1, vbo_resource, 0)); +} + +#ifdef _WIN32 +#ifndef FOPEN +#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode) +#endif +#else +#ifndef FOPEN +#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode)) +#endif +#endif + +void sdkDumpBin2(void *data, unsigned int bytes, const char *filename) +{ + printf("sdkDumpBin: <%s>\n", filename); + FILE *fp; + FOPEN(fp, filename, "wb"); + fwrite(data, bytes, 1, fp); + fflush(fp); + fclose(fp); +} + +//////////////////////////////////////////////////////////////////////////////// +//! Run the Cuda part of the computation +//////////////////////////////////////////////////////////////////////////////// +void runAutoTest(int devID, char **argv, char *ref_file) +{ + char *reference_file = NULL; + void *imageData = malloc(mesh_width*mesh_height*sizeof(float)); + + // execute the kernel + launch_kernel((float4 *)d_vbo_buffer, mesh_width, mesh_height, g_fAnim); + + cudaDeviceSynchronize(); + getLastCudaError("launch_kernel failed"); + + checkCudaErrors(cudaMemcpy(imageData, d_vbo_buffer, mesh_width*mesh_height*sizeof(float), cudaMemcpyDeviceToHost)); + + sdkDumpBin2(imageData, mesh_width*mesh_height*sizeof(float), "simpleGL.bin"); + reference_file = sdkFindFilePath(ref_file, argv[0]); + + if (reference_file && + !sdkCompareBin2BinFloat("simpleGL.bin", reference_file, + mesh_width*mesh_height*sizeof(float), + MAX_EPSILON_ERROR, THRESHOLD, pArgv[0])) + { + g_TotalErrors++; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Create VBO +//////////////////////////////////////////////////////////////////////////////// +void createVBO(GLuint *vbo, struct cudaGraphicsResource **vbo_res, + unsigned int vbo_res_flags) +{ + assert(vbo); + + // create buffer object + glGenBuffers(1, vbo); + glBindBuffer(GL_ARRAY_BUFFER, *vbo); + + // initialize buffer object + unsigned int size = mesh_width * mesh_height * 4 * sizeof(float); + glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW); + + glBindBuffer(GL_ARRAY_BUFFER, 0); + + // register this buffer object with CUDA + checkCudaErrors(cudaGraphicsGLRegisterBuffer(vbo_res, *vbo, vbo_res_flags)); + + SDK_CHECK_ERROR_GL(); +} + +//////////////////////////////////////////////////////////////////////////////// +//! Delete VBO +//////////////////////////////////////////////////////////////////////////////// +void deleteVBO(GLuint *vbo, struct cudaGraphicsResource *vbo_res) +{ + + // unregister this buffer object with CUDA + checkCudaErrors(cudaGraphicsUnregisterResource(vbo_res)); + + glBindBuffer(1, *vbo); + glDeleteBuffers(1, vbo); + + *vbo = 0; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Display callback +//////////////////////////////////////////////////////////////////////////////// +void display() +{ + sdkStartTimer(&timer); + + // run CUDA kernel to generate vertex positions + runCuda(&cuda_vbo_resource); + + glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); + + // set view matrix + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); + glTranslatef(0.0, 0.0, translate_z); + glRotatef(rotate_x, 1.0, 0.0, 0.0); + glRotatef(rotate_y, 0.0, 1.0, 0.0); + + // render from the vbo + glBindBuffer(GL_ARRAY_BUFFER, vbo); + glVertexPointer(4, GL_FLOAT, 0, 0); + + glEnableClientState(GL_VERTEX_ARRAY); + glColor3f(1.0, 0.0, 0.0); + glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height); + glDisableClientState(GL_VERTEX_ARRAY); + + glutSwapBuffers(); + + g_fAnim += 0.01f; + + sdkStopTimer(&timer); + computeFPS(); +} + +void timerEvent(int value) +{ + if (glutGetWindow()) + { + glutPostRedisplay(); + glutTimerFunc(REFRESH_DELAY, timerEvent,0); + } +} + +void cleanup() +{ + sdkDeleteTimer(&timer); + + if (vbo) + { + deleteVBO(&vbo, cuda_vbo_resource); + } +} + + +//////////////////////////////////////////////////////////////////////////////// +//! Keyboard events handler +//////////////////////////////////////////////////////////////////////////////// +void keyboard(unsigned char key, int /*x*/, int /*y*/) +{ + switch (key) + { + case (27) : +#if defined(__APPLE__) || defined(MACOSX) + exit(EXIT_SUCCESS); +#else + glutDestroyWindow(glutGetWindow()); + return; +#endif + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Mouse event handlers +//////////////////////////////////////////////////////////////////////////////// +void mouse(int button, int state, int x, int y) +{ + if (state == GLUT_DOWN) + { + mouse_buttons |= 1<("./data/regression.dat", + data, mesh_width * mesh_height * 3, 0.0, false); + } + + // unmap GL buffer object + if (!glUnmapBuffer(GL_ARRAY_BUFFER)) + { + fprintf(stderr, "Unmap buffer failed.\n"); + fflush(stderr); + } + + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, + cudaGraphicsMapFlagsWriteDiscard)); + + SDK_CHECK_ERROR_GL(); + } +} + diff --git a/vulkanCUDASinewave.cu b/vulkanCUDASinewave.cu new file mode 100644 index 0000000..bcbd9bf --- /dev/null +++ b/vulkanCUDASinewave.cu @@ -0,0 +1,1711 @@ +// +// Created by kwoodle on 5/12/20. +// + +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +#define GLFW_INCLUDE_VULKAN +#ifdef _WIN64 +#include +#include +#include +#include +#define _USE_MATH_DEFINES +#endif + +#include +#include +#ifdef _WIN64 +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "linmath.h" + +#define WIDTH 800 +#define HEIGHT 600 + +#define VULKAN_VALIDATION 0 + + +const std::vector validationLayers = { + "VK_LAYER_LUNARG_standard_validation" +}; + +#if VULKAN_VALIDATION +const bool enableValidationLayers = true; +#else +const bool enableValidationLayers = false; +#endif + +struct QueueFamilyIndices { + int graphicsFamily = -1; + int presentFamily = -1; + + bool isComplete() { + return graphicsFamily >= 0 && presentFamily >= 0; + } +}; + +const std::vector deviceExtensions = { + VK_KHR_SWAPCHAIN_EXTENSION_NAME, + VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME, + VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME, +#ifdef _WIN64 + VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME, + VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME, +#else + VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME, + VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME, +#endif +}; + +#ifdef _WIN64 +class WindowsSecurityAttributes { +protected: + SECURITY_ATTRIBUTES m_winSecurityAttributes; + PSECURITY_DESCRIPTOR m_winPSecurityDescriptor; + +public: + WindowsSecurityAttributes(); + SECURITY_ATTRIBUTES * operator&(); + ~WindowsSecurityAttributes(); +}; + +WindowsSecurityAttributes::WindowsSecurityAttributes() +{ + m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc(1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void**)); + //CHECK_NEQ(m_winPSecurityDescriptor, (PSECURITY_DESCRIPTOR)NULL); + + PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *)); + + InitializeSecurityDescriptor(m_winPSecurityDescriptor, SECURITY_DESCRIPTOR_REVISION); + + SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = SECURITY_WORLD_SID_AUTHORITY; + AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, 0, 0, 0, 0, 0, ppSID); + + EXPLICIT_ACCESS explicitAccess; + ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS)); + explicitAccess.grfAccessPermissions = STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL; + explicitAccess.grfAccessMode = SET_ACCESS; + explicitAccess.grfInheritance = INHERIT_ONLY; + explicitAccess.Trustee.TrusteeForm = TRUSTEE_IS_SID; + explicitAccess.Trustee.TrusteeType = TRUSTEE_IS_WELL_KNOWN_GROUP; + explicitAccess.Trustee.ptstrName = (LPTSTR)*ppSID; + + SetEntriesInAcl(1, &explicitAccess, NULL, ppACL); + + SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE); + + m_winSecurityAttributes.nLength = sizeof(m_winSecurityAttributes); + m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor; + m_winSecurityAttributes.bInheritHandle = TRUE; +} + +SECURITY_ATTRIBUTES * +WindowsSecurityAttributes::operator&() +{ + return &m_winSecurityAttributes; +} + +WindowsSecurityAttributes::~WindowsSecurityAttributes() +{ + PSID* ppSID = (PSID*)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH); + PACL* ppACL = (PACL*)((PBYTE)ppSID + sizeof(PSID*)); + + if (*ppSID) { + FreeSid(*ppSID); + } + if (*ppACL) { + LocalFree(*ppACL); + } + free(m_winPSecurityDescriptor); +} +#endif + +struct UniformBufferObject { + mat4x4 model; + mat4x4 view; + mat4x4 proj; +}; + +struct SwapChainSupportDetails { + VkSurfaceCapabilitiesKHR capabilities; + std::vector formats; + std::vector presentModes; +}; + +struct Vertex { + vec4 pos; + vec3 color; + + static VkVertexInputBindingDescription getBindingDescription() { + VkVertexInputBindingDescription bindingDescription = {}; + + bindingDescription.binding = 0; + bindingDescription.stride = sizeof(Vertex); + bindingDescription.inputRate = VK_VERTEX_INPUT_RATE_VERTEX; + + return bindingDescription; + } + + static std::array getAttributeDescriptions() { + std::array attributeDescriptions = {}; + attributeDescriptions[0].binding = 0; + attributeDescriptions[0].location = 0; + attributeDescriptions[0].format = VK_FORMAT_R32G32B32A32_SFLOAT; + attributeDescriptions[0].offset = offsetof(Vertex, pos); + + attributeDescriptions[1].binding = 0; + attributeDescriptions[1].location = 1; + attributeDescriptions[1].format = VK_FORMAT_R32G32B32_SFLOAT; + attributeDescriptions[1].offset = offsetof(Vertex, color); + return attributeDescriptions; + } + +}; + +size_t mesh_width = 0, mesh_height = 0; +std::string execution_path; + +__global__ void sinewave_gen_kernel(Vertex *vertices, unsigned int width, unsigned int height, float time) +{ + unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; + unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; + + // calculate uv coordinates + float u = x / (float) width; + float v = y / (float) height; + u = u*2.0f - 1.0f; + v = v*2.0f - 1.0f; + + // calculate simple sine wave pattern + float freq = 4.0f; + float w = sinf(u*freq + time) * cosf(v*freq + time) * 0.5f; + + if (y < height && x < width) + { + // write output vertex + vertices[y*width+x].pos[0] = u; + vertices[y*width+x].pos[1] = w; + vertices[y*width+x].pos[2] = v; + vertices[y*width+x].pos[3] = 1.0f; + vertices[y*width+x].color[0] = 1.0f; + vertices[y*width+x].color[1] = 0.0f; + vertices[y*width+x].color[2] = 0.0f; + } +} + + +class vulkanCudaApp { +public: + void run() { + initWindow(); + initVulkan(); + initCuda(); + mainLoop(); + cleanup(); + } + +private: + GLFWwindow* window; + VkInstance instance; + VkPhysicalDevice physicalDevice = VK_NULL_HANDLE; + uint8_t vkDeviceUUID[VK_UUID_SIZE]; + VkDevice device; + VkQueue graphicsQueue; + VkQueue presentQueue; + VkSurfaceKHR surface; + VkSwapchainKHR swapChain; + std::vector swapChainImages; + VkFormat swapChainImageFormat; + VkExtent2D swapChainExtent; + std::vector swapChainImageViews; + VkDescriptorSetLayout descriptorSetLayout; + VkDescriptorPool descriptorPool; + VkDescriptorSet descriptorSet; + VkPipelineLayout pipelineLayout; + VkRenderPass renderPass; + VkPipeline graphicsPipeline; + std::vector swapChainFramebuffers; + VkCommandPool commandPool; + VkBuffer vertexBuffer; + VkDeviceMemory vertexBufferMemory; + VkBuffer uniformBuffer; + VkDeviceMemory uniformBufferMemory; + std::vector commandBuffers; + VkSemaphore imageAvailableSemaphore; + VkSemaphore renderFinishedSemaphore; + VkSemaphore cudaUpdateVkVertexBufSemaphore; + VkSemaphore vkUpdateCudaVertexBufSemaphore; + + size_t vertexBufSize = 0; + bool startSubmit = 0; + double AnimTime=1.0f; + + VkDebugReportCallbackEXT callback; + +#ifdef _WIN64 + PFN_vkGetMemoryWin32HandleKHR fpGetMemoryWin32HandleKHR; + PFN_vkGetSemaphoreWin32HandleKHR fpGetSemaphoreWin32HandleKHR; +#else + PFN_vkGetMemoryFdKHR fpGetMemoryFdKHR; + PFN_vkGetSemaphoreFdKHR fpGetSemaphoreFdKHR; +#endif + + PFN_vkGetPhysicalDeviceProperties2 fpGetPhysicalDeviceProperties2; + + // CUDA stuff + cudaExternalMemory_t cudaExtMemVertexBuffer; + cudaExternalSemaphore_t cudaExtCudaUpdateVkVertexBufSemaphore; + cudaExternalSemaphore_t cudaExtVkUpdateCudaVertexBufSemaphore; + void *cudaDevVertptr = NULL; + cudaStream_t streamToRun; + + bool checkValidationLayerSupport() { + uint32_t layerCount; + vkEnumerateInstanceLayerProperties(&layerCount, nullptr); + + std::vector availableLayers(layerCount); + vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data()); + + for (const char* layerName : validationLayers) { + bool layerFound = false; + + for (const auto& layerProperties : availableLayers) { + if (strcmp(layerName, layerProperties.layerName) == 0) { + layerFound = true; + break; + } + } + + if (!layerFound) { + return false; + } + } + + return true; + } + + static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback( + VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objType, + uint64_t obj, + size_t location, + int32_t code, + const char* layerPrefix, + const char* msg, + void* userData) { + + std::cerr << "validation layer: " << msg << std::endl; + + return VK_FALSE; + } + + VkResult CreateDebugReportCallbackEXT(VkInstance instance, const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDebugReportCallbackEXT* pCallback) { + auto func = (PFN_vkCreateDebugReportCallbackEXT)vkGetInstanceProcAddr(instance, "vkCreateDebugReportCallbackEXT"); + if (func != nullptr) { + return func(instance, pCreateInfo, pAllocator, pCallback); + } + else { + return VK_ERROR_EXTENSION_NOT_PRESENT; + } + } + + void DestroyDebugReportCallbackEXT(VkInstance instance, VkDebugReportCallbackEXT callback, const VkAllocationCallbacks* pAllocator) { + auto func = (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr(instance, "vkDestroyDebugReportCallbackEXT"); + if (func != nullptr) { + func(instance, callback, pAllocator); + } + } + + void setupDebugCallback() { + if (!enableValidationLayers) return; + + VkDebugReportCallbackCreateInfoEXT createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT; + createInfo.flags = VK_DEBUG_REPORT_ERROR_BIT_EXT | VK_DEBUG_REPORT_WARNING_BIT_EXT; + createInfo.pfnCallback = debugCallback; + + if (CreateDebugReportCallbackEXT(instance, &createInfo, nullptr, &callback) != VK_SUCCESS) { + throw std::runtime_error("failed to set up debug callback!"); + } + } + void initWindow() { + glfwInit(); + glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API); + glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE); + window = glfwCreateWindow(WIDTH, HEIGHT, "Vulkan-CUDA Interop Sinewave", nullptr, nullptr); + } + + void createInstance() { + + if (enableValidationLayers && !checkValidationLayerSupport()) { + throw std::runtime_error("validation layers requested, but not available!"); + } + + VkApplicationInfo appInfo = {}; + appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; + appInfo.pApplicationName = "Vulkan CUDA Sinewave"; + appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0); + appInfo.pEngineName = "No Engine"; + appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0); + appInfo.apiVersion = VK_API_VERSION_1_0; + + VkInstanceCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; + createInfo.pApplicationInfo = &appInfo; + + uint32_t glfwExtensionCount = 0; + const char** glfwExtensions; + + glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount); + + std::vector enabledExtensionNameList; + enabledExtensionNameList.push_back(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); + enabledExtensionNameList.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME); + enabledExtensionNameList.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME); + + for (int i = 0; i < glfwExtensionCount; i++) + { + enabledExtensionNameList.push_back(glfwExtensions[i]); + } + if (enableValidationLayers) { + enabledExtensionNameList.push_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME); + createInfo.enabledLayerCount = static_cast(validationLayers.size()); + createInfo.ppEnabledLayerNames = validationLayers.data(); + } + else { + createInfo.enabledLayerCount = 0; + } + + createInfo.enabledExtensionCount = enabledExtensionNameList.size(); + createInfo.ppEnabledExtensionNames = enabledExtensionNameList.data(); + + if (vkCreateInstance(&createInfo, nullptr, &instance) != VK_SUCCESS) { + throw std::runtime_error("failed to create instance!"); + } + else { + std::cout << "Instance created successfully!!\n"; + } + + fpGetPhysicalDeviceProperties2 = (PFN_vkGetPhysicalDeviceProperties2)vkGetInstanceProcAddr(instance, "vkGetPhysicalDeviceProperties2"); + if (fpGetPhysicalDeviceProperties2 == NULL) { + throw std::runtime_error("Vulkan: Proc address for \"vkGetPhysicalDeviceProperties2KHR\" not found.\n"); + } + +#ifdef _WIN64 + fpGetMemoryWin32HandleKHR = (PFN_vkGetMemoryWin32HandleKHR)vkGetInstanceProcAddr(instance, "vkGetMemoryWin32HandleKHR"); + if (fpGetMemoryWin32HandleKHR == NULL) { + throw std::runtime_error("Vulkan: Proc address for \"vkGetMemoryWin32HandleKHR\" not found.\n"); + } +#else + fpGetMemoryFdKHR = (PFN_vkGetMemoryFdKHR)vkGetInstanceProcAddr(instance, "vkGetMemoryFdKHR"); + if (fpGetMemoryFdKHR == NULL) { + throw std::runtime_error("Vulkan: Proc address for \"vkGetMemoryFdKHR\" not found.\n"); + } +#endif + } + + void initVulkan() { + createInstance(); + setupDebugCallback(); + createSurface(); + pickPhysicalDevice(); + createLogicalDevice(); + getKhrExtensionsFn(); + createSwapChain(); + createImageViews(); + createRenderPass(); + createDescriptorSetLayout(); + createGraphicsPipeline(); + createFramebuffers(); + createCommandPool(); + createVertexBuffer(); + createUniformBuffer(); + createDescriptorPool(); + createDescriptorSet(); + createCommandBuffers(); + createSyncObjects(); + createSyncObjectsExt(); + } + + void initCuda() + { + setCudaVkDevice(); + cudaVkImportVertexMem(); + cudaInitVertexMem(); + cudaVkImportSemaphore(); + } + + void createSurface() { + if (glfwCreateWindowSurface(instance, window, nullptr, &surface) != VK_SUCCESS) { + throw std::runtime_error("failed to create window surface!"); + } + } + + void pickPhysicalDevice() { + uint32_t deviceCount = 0; + + vkEnumeratePhysicalDevices(instance, &deviceCount, nullptr); + + if (deviceCount == 0) { + throw std::runtime_error("failed to find GPUs with Vulkan support!"); + } + + std::vector devices(deviceCount); + vkEnumeratePhysicalDevices(instance, &deviceCount, devices.data()); + + for (const auto& device : devices) { + if (isDeviceSuitable(device)) { + physicalDevice = device; + break; + } + } + if (physicalDevice == VK_NULL_HANDLE) { + throw std::runtime_error("failed to find a suitable GPU!"); + } + + std::cout << "Selected physical device = " << physicalDevice << std::endl; + + VkPhysicalDeviceIDProperties vkPhysicalDeviceIDProperties = {}; + vkPhysicalDeviceIDProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; + vkPhysicalDeviceIDProperties.pNext = NULL; + + VkPhysicalDeviceProperties2 vkPhysicalDeviceProperties2 = {}; + vkPhysicalDeviceProperties2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + vkPhysicalDeviceProperties2.pNext = &vkPhysicalDeviceIDProperties; + + fpGetPhysicalDeviceProperties2(physicalDevice, &vkPhysicalDeviceProperties2); + + memcpy(vkDeviceUUID, vkPhysicalDeviceIDProperties.deviceUUID, sizeof(vkDeviceUUID)); + } + + int setCudaVkDevice() + { + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the GPU which is selected by Vulkan + while (current_device < device_count) { + cudaGetDeviceProperties(&deviceProp, current_device); + + if ((deviceProp.computeMode != cudaComputeModeProhibited)) { + // Compare the cuda device UUID with vulkan UUID + int ret = memcmp(&deviceProp.uuid, &vkDeviceUUID, VK_UUID_SIZE); + if (ret == 0) + { + checkCudaErrors(cudaSetDevice(current_device)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, deviceProp.name, deviceProp.major, + deviceProp.minor); + + return current_device; + } + + } else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "CUDA error:" + " No Vulkan-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; + } + + bool isDeviceSuitable(VkPhysicalDevice device) { + QueueFamilyIndices indices = findQueueFamilies(device); + bool extensionsSupported = checkDeviceExtensionSupport(device); + + bool swapChainAdequate = false; + if (extensionsSupported) { + SwapChainSupportDetails swapChainSupport = querySwapChainSupport(device); + swapChainAdequate = !swapChainSupport.formats.empty() && !swapChainSupport.presentModes.empty(); + } + + return indices.isComplete() && extensionsSupported && swapChainAdequate; + } + + bool checkDeviceExtensionSupport(VkPhysicalDevice device) { + uint32_t extensionCount; + vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, nullptr); + + std::vector availableExtensions(extensionCount); + vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, availableExtensions.data()); + + std::set requiredExtensions(deviceExtensions.begin(), deviceExtensions.end()); + + for (const auto& extension : availableExtensions) { + requiredExtensions.erase(extension.extensionName); + } + + return requiredExtensions.empty(); + } + + QueueFamilyIndices findQueueFamilies(VkPhysicalDevice device) { + QueueFamilyIndices indices; + uint32_t queueFamilyCount = 0; + vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, nullptr); + + std::vector queueFamilies(queueFamilyCount); + vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, queueFamilies.data()); + + + int i = 0; + for (const auto& queueFamily : queueFamilies) { + if (queueFamily.queueCount > 0 && queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT) { + indices.graphicsFamily = i; + } + + VkBool32 presentSupport = false; + vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport); + + if (queueFamily.queueCount > 0 && presentSupport) { + indices.presentFamily = i; + } + + if (indices.isComplete()) { + break; + } + i++; + } + return indices; + } + + SwapChainSupportDetails querySwapChainSupport(VkPhysicalDevice device) { + SwapChainSupportDetails details; + vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, &details.capabilities); + + uint32_t formatCount; + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, nullptr); + + if (formatCount != 0) { + details.formats.resize(formatCount); + vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, details.formats.data()); + } + + uint32_t presentModeCount; + vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, nullptr); + + if (presentModeCount != 0) { + details.presentModes.resize(presentModeCount); + vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, details.presentModes.data()); + } + + return details; + } + + VkSurfaceFormatKHR chooseSwapSurfaceFormat(const std::vector& availableFormats) { + if (availableFormats.size() == 1 && availableFormats[0].format == VK_FORMAT_UNDEFINED) { + return {VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR}; + } + + for (const auto& availableFormat : availableFormats) { + if (availableFormat.format == VK_FORMAT_B8G8R8A8_UNORM && availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) { + return availableFormat; + } + } + + return availableFormats[0]; + } + + VkPresentModeKHR chooseSwapPresentMode(const std::vector availablePresentModes) { + VkPresentModeKHR bestMode = VK_PRESENT_MODE_FIFO_KHR; + + for (const auto& availablePresentMode : availablePresentModes) { + if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) { + return availablePresentMode; + } else if (availablePresentMode == VK_PRESENT_MODE_IMMEDIATE_KHR) { + bestMode = availablePresentMode; + } + } + + return bestMode; + } + + VkExtent2D chooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities) { + if (capabilities.currentExtent.width != std::numeric_limits::max()) { + return capabilities.currentExtent; + } else { + VkExtent2D actualExtent = {WIDTH, HEIGHT}; + + actualExtent.width = std::max(capabilities.minImageExtent.width, std::min(capabilities.maxImageExtent.width, actualExtent.width)); + actualExtent.height = std::max(capabilities.minImageExtent.height, std::min(capabilities.maxImageExtent.height, actualExtent.height)); + + return actualExtent; + } + } + + void createLogicalDevice() { + + QueueFamilyIndices indices = findQueueFamilies(physicalDevice); + + std::vector queueCreateInfos; + std::set uniqueQueueFamilies = {indices.graphicsFamily, indices.presentFamily}; + + float queuePriority = 1.0f; + for (int queueFamily : uniqueQueueFamilies) { + VkDeviceQueueCreateInfo queueCreateInfo = {}; + queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + queueCreateInfo.queueFamilyIndex = queueFamily; + queueCreateInfo.queueCount = 1; + queueCreateInfo.pQueuePriorities = &queuePriority; + queueCreateInfos.push_back(queueCreateInfo); + } + + VkPhysicalDeviceFeatures deviceFeatures = {}; + + VkDeviceCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; + + createInfo.pQueueCreateInfos = queueCreateInfos.data(); + createInfo.queueCreateInfoCount = queueCreateInfos.size(); + + createInfo.pEnabledFeatures = &deviceFeatures; + std::vector enabledExtensionNameList; + + for (int i = 0; i < deviceExtensions.size(); i++) + { + enabledExtensionNameList.push_back(deviceExtensions[i]); + } + if (enableValidationLayers) { + createInfo.enabledLayerCount = static_cast(validationLayers.size()); + createInfo.ppEnabledLayerNames = validationLayers.data(); + } + else { + createInfo.enabledLayerCount = 0; + } + createInfo.enabledExtensionCount = static_cast(enabledExtensionNameList.size()); + createInfo.ppEnabledExtensionNames = enabledExtensionNameList.data(); + + if (vkCreateDevice(physicalDevice, &createInfo, nullptr, &device) != VK_SUCCESS) { + throw std::runtime_error("failed to create logical device!"); + } + vkGetDeviceQueue(device, indices.graphicsFamily, 0, &graphicsQueue); + vkGetDeviceQueue(device, indices.presentFamily, 0, &presentQueue); + } + + void createSwapChain() { + SwapChainSupportDetails swapChainSupport = querySwapChainSupport(physicalDevice); + + VkSurfaceFormatKHR surfaceFormat = chooseSwapSurfaceFormat(swapChainSupport.formats); + VkPresentModeKHR presentMode = chooseSwapPresentMode(swapChainSupport.presentModes); + VkExtent2D extent = chooseSwapExtent(swapChainSupport.capabilities); + + uint32_t imageCount = swapChainSupport.capabilities.minImageCount + 1; + if (swapChainSupport.capabilities.maxImageCount > 0 && imageCount > swapChainSupport.capabilities.maxImageCount) { + imageCount = swapChainSupport.capabilities.maxImageCount; + } + + VkSwapchainCreateInfoKHR createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR; + createInfo.surface = surface; + createInfo.minImageCount = imageCount; + createInfo.imageFormat = surfaceFormat.format; + createInfo.imageColorSpace = surfaceFormat.colorSpace; + createInfo.imageExtent = extent; + createInfo.imageArrayLayers = 1; + createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + + QueueFamilyIndices indices = findQueueFamilies(physicalDevice); + uint32_t queueFamilyIndices[] = {(uint32_t) indices.graphicsFamily, (uint32_t) indices.presentFamily}; + + if (indices.graphicsFamily != indices.presentFamily) { + createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT; + createInfo.queueFamilyIndexCount = 2; + createInfo.pQueueFamilyIndices = queueFamilyIndices; + } else { + createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE; + createInfo.queueFamilyIndexCount = 0; // Optional + createInfo.pQueueFamilyIndices = nullptr; // Optional + } + + createInfo.preTransform = swapChainSupport.capabilities.currentTransform; + createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; + createInfo.presentMode = presentMode; + createInfo.clipped = VK_TRUE; + createInfo.oldSwapchain = VK_NULL_HANDLE; + + if (vkCreateSwapchainKHR(device, &createInfo, nullptr, &swapChain) != VK_SUCCESS) { + throw std::runtime_error("failed to create swap chain!"); + } + else + { + std::cout<< "Swapchain created!!\n"; + } + + vkGetSwapchainImagesKHR(device, swapChain, &imageCount, nullptr); + swapChainImages.resize(imageCount); + vkGetSwapchainImagesKHR(device, swapChain, &imageCount, swapChainImages.data()); + + swapChainImageFormat = surfaceFormat.format; + swapChainExtent = extent; + } + + void createImageViews() { + swapChainImageViews.resize(swapChainImages.size()); + + for (size_t i = 0; i < swapChainImages.size(); i++) { + VkImageViewCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + createInfo.image = swapChainImages[i]; + createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D; + createInfo.format = swapChainImageFormat; + + createInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; + createInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; + + createInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + createInfo.subresourceRange.baseMipLevel = 0; + createInfo.subresourceRange.levelCount = 1; + createInfo.subresourceRange.baseArrayLayer = 0; + createInfo.subresourceRange.layerCount = 1; + + if (vkCreateImageView(device, &createInfo, nullptr, &swapChainImageViews[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to create image views!"); + } + } + } + + void createDescriptorSetLayout() { + VkDescriptorSetLayoutBinding uboLayoutBinding = {}; + uboLayoutBinding.binding = 0; + uboLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + uboLayoutBinding.descriptorCount = 1; + uboLayoutBinding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + uboLayoutBinding.pImmutableSamplers = nullptr; // Optional + + VkDescriptorSetLayoutCreateInfo layoutInfo = {}; + layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + layoutInfo.bindingCount = 1; + layoutInfo.pBindings = &uboLayoutBinding; + + if (vkCreateDescriptorSetLayout(device, &layoutInfo, nullptr, &descriptorSetLayout) != VK_SUCCESS) { + throw std::runtime_error("failed to create descriptor set layout!"); + } + + } + + void createGraphicsPipeline() { + auto vertShaderCode = readFile("shader_sine.vert"); + auto fragShaderCode = readFile("shader_sine.frag"); + + VkShaderModule vertShaderModule; + VkShaderModule fragShaderModule; + + vertShaderModule = createShaderModule(vertShaderCode); + fragShaderModule = createShaderModule(fragShaderCode); + + VkPipelineShaderStageCreateInfo vertShaderStageInfo = {}; + vertShaderStageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + vertShaderStageInfo.stage = VK_SHADER_STAGE_VERTEX_BIT; + vertShaderStageInfo.module = vertShaderModule; + vertShaderStageInfo.pName = "main"; + + VkPipelineShaderStageCreateInfo fragShaderStageInfo = {}; + fragShaderStageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + fragShaderStageInfo.stage = VK_SHADER_STAGE_FRAGMENT_BIT; + fragShaderStageInfo.module = fragShaderModule; + fragShaderStageInfo.pName = "main"; + + VkPipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, fragShaderStageInfo}; + + VkPipelineVertexInputStateCreateInfo vertexInputInfo = {}; + vertexInputInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + auto bindingDescription = Vertex::getBindingDescription(); + auto attributeDescriptions = Vertex::getAttributeDescriptions(); + vertexInputInfo.vertexBindingDescriptionCount = 1; + vertexInputInfo.pVertexBindingDescriptions = &bindingDescription; + vertexInputInfo.vertexAttributeDescriptionCount = static_cast(attributeDescriptions.size()); + vertexInputInfo.pVertexAttributeDescriptions = attributeDescriptions.data(); + + VkPipelineInputAssemblyStateCreateInfo inputAssembly = {}; + inputAssembly.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + inputAssembly.topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST; + inputAssembly.primitiveRestartEnable = VK_FALSE; + + VkViewport viewport = {}; + viewport.x = 0.0f; + viewport.y = 0.0f; + viewport.width = (float) swapChainExtent.width; + viewport.height = (float) swapChainExtent.height; + viewport.minDepth = 0.0f; + viewport.maxDepth = 1.0f; + + VkRect2D scissor = {}; + scissor.offset = {0, 0}; + scissor.extent = swapChainExtent; + + VkPipelineViewportStateCreateInfo viewportState = {}; + viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewportState.viewportCount = 1; + viewportState.pViewports = &viewport; + viewportState.scissorCount = 1; + viewportState.pScissors = &scissor; + + VkPipelineRasterizationStateCreateInfo rasterizer = {}; + rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + rasterizer.depthClampEnable = VK_FALSE; + rasterizer.rasterizerDiscardEnable = VK_FALSE; + rasterizer.polygonMode = VK_POLYGON_MODE_FILL; + rasterizer.lineWidth = 1.0f; + rasterizer.cullMode = VK_CULL_MODE_BACK_BIT; + rasterizer.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE; + rasterizer.depthBiasEnable = VK_FALSE; + rasterizer.depthBiasConstantFactor = 0.0f; // Optional + rasterizer.depthBiasClamp = 0.0f; // Optional + rasterizer.depthBiasSlopeFactor = 0.0f; // Optional + + VkPipelineMultisampleStateCreateInfo multisampling = {}; + multisampling.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + multisampling.sampleShadingEnable = VK_FALSE; + multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + multisampling.minSampleShading = 1.0f; // Optional + multisampling.pSampleMask = nullptr; // Optional + multisampling.alphaToCoverageEnable = VK_FALSE; // Optional + multisampling.alphaToOneEnable = VK_FALSE; // Optional + + VkPipelineColorBlendAttachmentState colorBlendAttachment = {}; + colorBlendAttachment.colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + colorBlendAttachment.blendEnable = VK_FALSE; + colorBlendAttachment.srcColorBlendFactor = VK_BLEND_FACTOR_ONE; // Optional + colorBlendAttachment.dstColorBlendFactor = VK_BLEND_FACTOR_ZERO; // Optional + colorBlendAttachment.colorBlendOp = VK_BLEND_OP_ADD; // Optional + colorBlendAttachment.srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE; // Optional + colorBlendAttachment.dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO; // Optional + colorBlendAttachment.alphaBlendOp = VK_BLEND_OP_ADD; // Optional + + VkPipelineColorBlendStateCreateInfo colorBlending = {}; + colorBlending.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + colorBlending.logicOpEnable = VK_FALSE; + colorBlending.logicOp = VK_LOGIC_OP_COPY; // Optional + colorBlending.attachmentCount = 1; + colorBlending.pAttachments = &colorBlendAttachment; + colorBlending.blendConstants[0] = 0.0f; // Optional + colorBlending.blendConstants[1] = 0.0f; // Optional + colorBlending.blendConstants[2] = 0.0f; // Optional + colorBlending.blendConstants[3] = 0.0f; // Optional + +#if 0 + VkDynamicState dynamicStates[] = { + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_LINE_WIDTH + }; + + VkPipelineDynamicStateCreateInfo dynamicState = {}; + dynamicState.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; + dynamicState.dynamicStateCount = 2; + dynamicState.pDynamicStates = dynamicStates; +#endif + VkPipelineLayoutCreateInfo pipelineLayoutInfo = {}; + pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + pipelineLayoutInfo.setLayoutCount = 1; // Optional + pipelineLayoutInfo.pSetLayouts = &descriptorSetLayout; // Optional + pipelineLayoutInfo.pushConstantRangeCount = 0; // Optional + pipelineLayoutInfo.pPushConstantRanges = nullptr; // Optional + + if (vkCreatePipelineLayout(device, &pipelineLayoutInfo, nullptr, &pipelineLayout) != VK_SUCCESS) { + throw std::runtime_error("failed to create pipeline layout!"); + } + + VkGraphicsPipelineCreateInfo pipelineInfo = {}; + pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + pipelineInfo.stageCount = 2; + pipelineInfo.pStages = shaderStages; + pipelineInfo.pVertexInputState = &vertexInputInfo; + pipelineInfo.pInputAssemblyState = &inputAssembly; + pipelineInfo.pViewportState = &viewportState; + pipelineInfo.pRasterizationState = &rasterizer; + pipelineInfo.pMultisampleState = &multisampling; + pipelineInfo.pDepthStencilState = nullptr; // Optional + pipelineInfo.pColorBlendState = &colorBlending; + pipelineInfo.pDynamicState = nullptr; // Optional + pipelineInfo.layout = pipelineLayout; + pipelineInfo.renderPass = renderPass; + pipelineInfo.subpass = 0; + pipelineInfo.basePipelineHandle = VK_NULL_HANDLE; // Optional + pipelineInfo.basePipelineIndex = -1; // Optional + + if (vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, &pipelineInfo, nullptr, &graphicsPipeline) != VK_SUCCESS) { + throw std::runtime_error("failed to create graphics pipeline!"); + } + else + { + std::cout << "Pipeline created successfully!!\n"; + } + vkDestroyShaderModule(device, fragShaderModule, nullptr); + vkDestroyShaderModule(device, vertShaderModule, nullptr); + } + + void createRenderPass() { + VkAttachmentDescription colorAttachment = {}; + colorAttachment.format = swapChainImageFormat; + colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT; + + colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; + colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; + + colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + + colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; + + VkAttachmentReference colorAttachmentRef = {}; + colorAttachmentRef.attachment = 0; + colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + + VkSubpassDescription subpass = {}; + subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + + subpass.colorAttachmentCount = 1; + subpass.pColorAttachments = &colorAttachmentRef; + + VkRenderPassCreateInfo renderPassInfo = {}; + renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; + renderPassInfo.attachmentCount = 1; + renderPassInfo.pAttachments = &colorAttachment; + renderPassInfo.subpassCount = 1; + renderPassInfo.pSubpasses = &subpass; + + VkSubpassDependency dependency = {}; + dependency.srcSubpass = VK_SUBPASS_EXTERNAL; + dependency.dstSubpass = 0; + dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.srcAccessMask = 0; + dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + renderPassInfo.dependencyCount = 1; + renderPassInfo.pDependencies = &dependency; + + if (vkCreateRenderPass(device, &renderPassInfo, nullptr, &renderPass) != VK_SUCCESS) { + throw std::runtime_error("failed to create render pass!"); + } + + } + + void createFramebuffers() { + swapChainFramebuffers.resize(swapChainImageViews.size()); + + for (size_t i = 0; i < swapChainImageViews.size(); i++) { + VkImageView attachments[] = { + swapChainImageViews[i] + }; + + VkFramebufferCreateInfo framebufferInfo = {}; + framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + framebufferInfo.renderPass = renderPass; + framebufferInfo.attachmentCount = 1; + framebufferInfo.pAttachments = attachments; + framebufferInfo.width = swapChainExtent.width; + framebufferInfo.height = swapChainExtent.height; + framebufferInfo.layers = 1; + + if (vkCreateFramebuffer(device, &framebufferInfo, nullptr, &swapChainFramebuffers[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to create framebuffer!"); + } + } + } + + void createCommandPool() { + QueueFamilyIndices queueFamilyIndices = findQueueFamilies(physicalDevice); + + VkCommandPoolCreateInfo poolInfo = {}; + poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + poolInfo.queueFamilyIndex = queueFamilyIndices.graphicsFamily; + poolInfo.flags = 0; // Optional + + if (vkCreateCommandPool(device, &poolInfo, nullptr, &commandPool) != VK_SUCCESS) { + throw std::runtime_error("failed to create command pool!"); + } + } + + void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, + VkMemoryPropertyFlags properties, VkBuffer& buffer, + VkDeviceMemory& bufferMemory) { + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateBuffer(device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } + + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(device, buffer, &memRequirements); + + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate buffer memory!"); + } + + vkBindBufferMemory(device, buffer, bufferMemory, 0); + } + + void createBufferExtMem(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, + VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, + VkBuffer& buffer, VkDeviceMemory& bufferMemory) + { + VkBufferCreateInfo bufferInfo = {}; + bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferInfo.size = size; + bufferInfo.usage = usage; + bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + if (vkCreateBuffer(device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) { + throw std::runtime_error("failed to create buffer!"); + } + + VkMemoryRequirements memRequirements; + vkGetBufferMemoryRequirements(device, buffer, &memRequirements); + +#ifdef _WIN64 + WindowsSecurityAttributes winSecurityAttributes; + + VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR = {}; + vulkanExportMemoryWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR; + vulkanExportMemoryWin32HandleInfoKHR.pNext = NULL; + vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + vulkanExportMemoryWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + vulkanExportMemoryWin32HandleInfoKHR.name = (LPCWSTR)NULL; +#endif + VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR = {}; + vulkanExportMemoryAllocateInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR; +#ifdef _WIN64 + vulkanExportMemoryAllocateInfoKHR.pNext = extMemHandleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR ? &vulkanExportMemoryWin32HandleInfoKHR : NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = extMemHandleType; +#else + vulkanExportMemoryAllocateInfoKHR.pNext = NULL; + vulkanExportMemoryAllocateInfoKHR.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif + VkMemoryAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + allocInfo.pNext = &vulkanExportMemoryAllocateInfoKHR; + allocInfo.allocationSize = memRequirements.size; + allocInfo.memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits, properties); + + if (vkAllocateMemory(device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate external buffer memory!"); + } + + vkBindBufferMemory(device, buffer, bufferMemory, 0); + } + + void createVertexBuffer() { + mesh_width = swapChainExtent.width / 2; + mesh_height = swapChainExtent.height / 2; + vertexBufSize = mesh_height*mesh_width; + + VkDeviceSize bufferSize = sizeof(Vertex) * vertexBufSize; +#ifdef _WIN64 + if (IsWindows8OrGreater()) + { + createBufferExtMem(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT, + vertexBuffer, vertexBufferMemory); + } + else + { + createBufferExtMem(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT, + vertexBuffer, vertexBufferMemory); + } +#else + createBufferExtMem(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, + vertexBuffer, vertexBufferMemory); +#endif + } + + void cudaInitVertexMem() + { + checkCudaErrors(cudaStreamCreate(&streamToRun)); + + dim3 block(16, 16, 1); + dim3 grid(mesh_width/16, mesh_height/16, 1); + Vertex *vertices = (Vertex*) cudaDevVertptr; + sinewave_gen_kernel<<>>(vertices, mesh_width, mesh_height, 1.0); + checkCudaErrors(cudaStreamSynchronize(streamToRun)); + } + + void createUniformBuffer() { + VkDeviceSize bufferSize = sizeof(UniformBufferObject); + createBuffer(bufferSize, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, uniformBuffer, uniformBufferMemory); + } + + uint32_t findMemoryType(uint32_t typeFilter, VkMemoryPropertyFlags properties) { + VkPhysicalDeviceMemoryProperties memProperties; + vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties); + + for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { + if (typeFilter & (1 << i) && (memProperties.memoryTypes[i].propertyFlags & properties) == properties) { + return i; + } + } + + throw std::runtime_error("failed to find suitable memory type!"); + } + + void getKhrExtensionsFn() + { +#ifdef _WIN64 + + fpGetSemaphoreWin32HandleKHR = (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr(device, "vkGetSemaphoreWin32HandleKHR"); + if (fpGetSemaphoreWin32HandleKHR == NULL) { + throw std::runtime_error("Vulkan: Proc address for \"vkGetSemaphoreWin32HandleKHR\" not found.\n"); + } +#else + fpGetSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr(device, "vkGetSemaphoreFdKHR"); + if (fpGetSemaphoreFdKHR == NULL) { + throw std::runtime_error("Vulkan: Proc address for \"vkGetSemaphoreFdKHR\" not found.\n"); + } +#endif + } + + void createCommandBuffers() { + commandBuffers.resize(swapChainFramebuffers.size()); + + VkCommandBufferAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + allocInfo.commandPool = commandPool; + allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + allocInfo.commandBufferCount = (uint32_t) commandBuffers.size(); + + if (vkAllocateCommandBuffers(device, &allocInfo, commandBuffers.data()) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate command buffers!"); + } + + for (size_t i = 0; i < commandBuffers.size(); i++) { + VkCommandBufferBeginInfo beginInfo = {}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT; + beginInfo.pInheritanceInfo = nullptr; // Optional + + if (vkBeginCommandBuffer(commandBuffers[i], &beginInfo) != VK_SUCCESS) { + throw std::runtime_error("failed to begin recording command buffer!"); + } + + VkRenderPassBeginInfo renderPassInfo = {}; + renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; + renderPassInfo.renderPass = renderPass; + renderPassInfo.framebuffer = swapChainFramebuffers[i]; + renderPassInfo.renderArea.offset = {0, 0}; + renderPassInfo.renderArea.extent = swapChainExtent; + + VkClearValue clearColor = {0.0f, 0.0f, 0.0f, 1.0f}; + renderPassInfo.clearValueCount = 1; + renderPassInfo.pClearValues = &clearColor; + + vkCmdBeginRenderPass(commandBuffers[i], &renderPassInfo, VK_SUBPASS_CONTENTS_INLINE); + vkCmdBindPipeline(commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, graphicsPipeline); + VkBuffer vertexBuffers[] = {vertexBuffer}; + VkDeviceSize offsets[] = {0}; + vkCmdBindVertexBuffers(commandBuffers[i], 0, 1, vertexBuffers, offsets); + vkCmdBindDescriptorSets(commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &descriptorSet, 0, nullptr); + vkCmdDraw(commandBuffers[i], static_cast(vertexBufSize), 1, 0, 0); + vkCmdEndRenderPass(commandBuffers[i]); + if (vkEndCommandBuffer(commandBuffers[i]) != VK_SUCCESS) { + throw std::runtime_error("failed to record command buffer!"); + } + } + + } + + VkShaderModule createShaderModule(const std::vector& code) { + VkShaderModuleCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + createInfo.codeSize = code.size(); + createInfo.pCode = reinterpret_cast(code.data()); + + VkShaderModule shaderModule; + if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != VK_SUCCESS) { + throw std::runtime_error("failed to create shader module!"); + } + + return shaderModule; + } + + static std::vector readFile(const std::string& filename) { + char* file_path = sdkFindFilePath(filename.c_str(), execution_path.c_str()); + + std::ifstream file(file_path, std::ios::ate | std::ios::binary); + + if (!file.is_open()) { + throw std::runtime_error("failed to open shader spv file!\n"); + } + size_t fileSize = (size_t) file.tellg(); + std::vector buffer(fileSize); + file.seekg(0); + file.read(buffer.data(), fileSize); + file.close(); + + return buffer; + } + + void mainLoop() { + + updateUniformBuffer(); + + while (!glfwWindowShouldClose(window)) { + glfwPollEvents(); + drawFrame(); + } + + vkDeviceWaitIdle(device); + } + + void updateUniformBuffer() { + + UniformBufferObject ubo = {}; + + mat4x4_identity(ubo.model); + mat4x4 Model; + mat4x4_dup(Model, ubo.model); + mat4x4_rotate(ubo.model, Model, 1.0f, 0.0f, 1.0f, degreesToRadians(45.0f)); + + vec3 eye = {2.0f, 2.0f, 2.0f}; + vec3 center = {0.0f, 0.0f, 0.0f}; + vec3 up = {0.0f, 0.0f, 1.0f}; + mat4x4_look_at(ubo.view, eye, center, up); + mat4x4_perspective(ubo.proj, degreesToRadians(45.0f), swapChainExtent.width / (float) swapChainExtent.height, 0.1f, 10.0f); + ubo.proj[1][1] *= -1; + void* data; + vkMapMemory(device, uniformBufferMemory, 0, sizeof(ubo), 0, &data); + memcpy(data, &ubo, sizeof(ubo)); + vkUnmapMemory(device, uniformBufferMemory); + } + + void createDescriptorPool() { + VkDescriptorPoolSize poolSize = {}; + poolSize.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + poolSize.descriptorCount = 1; + + VkDescriptorPoolCreateInfo poolInfo = {}; + poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + poolInfo.poolSizeCount = 1; + poolInfo.pPoolSizes = &poolSize; + poolInfo.maxSets = 1; + + if (vkCreateDescriptorPool(device, &poolInfo, nullptr, &descriptorPool) != VK_SUCCESS) { + throw std::runtime_error("failed to create descriptor pool!"); + } + } + + void createDescriptorSet() { + VkDescriptorSetLayout layouts[] = {descriptorSetLayout}; + VkDescriptorSetAllocateInfo allocInfo = {}; + allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + allocInfo.descriptorPool = descriptorPool; + allocInfo.descriptorSetCount = 1; + allocInfo.pSetLayouts = layouts; + + if (vkAllocateDescriptorSets(device, &allocInfo, &descriptorSet) != VK_SUCCESS) { + throw std::runtime_error("failed to allocate descriptor set!"); + } + + VkDescriptorBufferInfo bufferInfo = {}; + bufferInfo.buffer = uniformBuffer; + bufferInfo.offset = 0; + bufferInfo.range = sizeof(UniformBufferObject); + + VkWriteDescriptorSet descriptorWrite = {}; + descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptorWrite.dstSet = descriptorSet; + descriptorWrite.dstBinding = 0; + descriptorWrite.dstArrayElement = 0; + descriptorWrite.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; + descriptorWrite.descriptorCount = 1; + descriptorWrite.pBufferInfo = &bufferInfo; + descriptorWrite.pImageInfo = nullptr; // Optional + descriptorWrite.pTexelBufferView = nullptr; // Optional + + vkUpdateDescriptorSets(device, 1, &descriptorWrite, 0, nullptr); + } + + void drawFrame() { + + uint32_t imageIndex; + vkAcquireNextImageKHR(device, swapChain, std::numeric_limits::max(), imageAvailableSemaphore, VK_NULL_HANDLE, &imageIndex); + + if (!startSubmit) + { + submitVulkan(imageIndex); + startSubmit = 1; + } + else + { + submitVulkanCuda(imageIndex); + } + + VkPresentInfoKHR presentInfo = {}; + presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; + + VkSemaphore signalSemaphores[] = {renderFinishedSemaphore}; + + presentInfo.waitSemaphoreCount = 1; + presentInfo.pWaitSemaphores = signalSemaphores; + + VkSwapchainKHR swapChains[] = {swapChain}; + presentInfo.swapchainCount = 1; + presentInfo.pSwapchains = swapChains; + presentInfo.pImageIndices = &imageIndex; + presentInfo.pResults = nullptr; // Optional + + vkQueuePresentKHR(presentQueue, &presentInfo); + + cudaUpdateVertexBuffer(); + // Added sleep of 5 millisecs so that CPU does not submit too much work to GPU + std::this_thread::sleep_for(std::chrono::microseconds(5000)); + } + + void submitVulkan(uint32_t imageIndex) + { + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + + VkSemaphore waitSemaphores[] = {imageAvailableSemaphore}; + VkPipelineStageFlags waitStages[] = { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT }; + submitInfo.waitSemaphoreCount = 1; + submitInfo.pWaitSemaphores = waitSemaphores; + submitInfo.pWaitDstStageMask = waitStages; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &commandBuffers[imageIndex]; + + VkSemaphore signalSemaphores[] = {renderFinishedSemaphore, vkUpdateCudaVertexBufSemaphore}; + + submitInfo.signalSemaphoreCount = 2; + submitInfo.pSignalSemaphores = signalSemaphores; + + if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE) != VK_SUCCESS) { + throw std::runtime_error("failed to submit draw command buffer!"); + } + } + + void submitVulkanCuda(uint32_t imageIndex) + { + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + + VkSemaphore waitSemaphores[] = {imageAvailableSemaphore, cudaUpdateVkVertexBufSemaphore}; + VkPipelineStageFlags waitStages[] = { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT}; + submitInfo.waitSemaphoreCount = 2; + submitInfo.pWaitSemaphores = waitSemaphores; + submitInfo.pWaitDstStageMask = waitStages; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &commandBuffers[imageIndex]; + + VkSemaphore signalSemaphores[] = {renderFinishedSemaphore, vkUpdateCudaVertexBufSemaphore}; + + submitInfo.signalSemaphoreCount = 2; + submitInfo.pSignalSemaphores = signalSemaphores; + + if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE) != VK_SUCCESS) { + throw std::runtime_error("failed to submit draw command buffer!"); + } + } + + void createSyncObjects() { + + VkSemaphoreCreateInfo semaphoreInfo = {}; + semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + + if (vkCreateSemaphore(device, &semaphoreInfo, nullptr, &imageAvailableSemaphore) != VK_SUCCESS || + vkCreateSemaphore(device, &semaphoreInfo, nullptr, &renderFinishedSemaphore) != VK_SUCCESS) { + + throw std::runtime_error("failed to create synchronization objects for a frame!"); + } + } + + void createSyncObjectsExt() { + + VkSemaphoreCreateInfo semaphoreInfo = {}; + semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + + memset(&semaphoreInfo, 0, sizeof(semaphoreInfo)); + semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + +#ifdef _WIN64 + WindowsSecurityAttributes winSecurityAttributes; + + VkExportSemaphoreWin32HandleInfoKHR vulkanExportSemaphoreWin32HandleInfoKHR = {}; + vulkanExportSemaphoreWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR; + vulkanExportSemaphoreWin32HandleInfoKHR.pNext = NULL; + vulkanExportSemaphoreWin32HandleInfoKHR.pAttributes = &winSecurityAttributes; + vulkanExportSemaphoreWin32HandleInfoKHR.dwAccess = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE; + vulkanExportSemaphoreWin32HandleInfoKHR.name = (LPCWSTR)NULL; +#endif + VkExportSemaphoreCreateInfoKHR vulkanExportSemaphoreCreateInfo = {}; + vulkanExportSemaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR; +#ifdef _WIN64 + vulkanExportSemaphoreCreateInfo.pNext = IsWindows8OrGreater() ? &vulkanExportSemaphoreWin32HandleInfoKHR : NULL; + vulkanExportSemaphoreCreateInfo.handleTypes = IsWindows8OrGreater() ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT; +#else + vulkanExportSemaphoreCreateInfo.pNext = NULL; + vulkanExportSemaphoreCreateInfo.handleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; +#endif + semaphoreInfo.pNext = &vulkanExportSemaphoreCreateInfo; + + if (vkCreateSemaphore(device, &semaphoreInfo, nullptr, &cudaUpdateVkVertexBufSemaphore) != VK_SUCCESS || + vkCreateSemaphore(device, &semaphoreInfo, nullptr, &vkUpdateCudaVertexBufSemaphore) != VK_SUCCESS ) + { + throw std::runtime_error("failed to create synchronization objects for a CUDA-Vulkan!"); + } + } + + void cudaVkImportVertexMem() + { + cudaExternalMemoryHandleDesc cudaExtMemHandleDesc; + memset(&cudaExtMemHandleDesc, 0, sizeof(cudaExtMemHandleDesc)); +#ifdef _WIN64 + cudaExtMemHandleDesc.type = IsWindows8OrGreater() ? cudaExternalMemoryHandleTypeOpaqueWin32 : cudaExternalMemoryHandleTypeOpaqueWin32Kmt; + cudaExtMemHandleDesc.handle.win32.handle = getVkMemHandle(IsWindows8OrGreater() ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT); +#else + cudaExtMemHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueFd; + cudaExtMemHandleDesc.handle.fd = getVkMemHandle(VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT); +#endif + cudaExtMemHandleDesc.size = sizeof(Vertex) * vertexBufSize; + + checkCudaErrors(cudaImportExternalMemory(&cudaExtMemVertexBuffer, &cudaExtMemHandleDesc)); + + cudaExternalMemoryBufferDesc cudaExtBufferDesc; + cudaExtBufferDesc.offset = 0; + cudaExtBufferDesc.size = sizeof(Vertex) * vertexBufSize; + cudaExtBufferDesc.flags = 0; + + checkCudaErrors(cudaExternalMemoryGetMappedBuffer(&cudaDevVertptr, cudaExtMemVertexBuffer, &cudaExtBufferDesc)); + printf("CUDA Imported Vulkan vertex buffer\n"); + } + + void cudaVkImportSemaphore() + { + cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc; + memset(&externalSemaphoreHandleDesc, 0, sizeof(externalSemaphoreHandleDesc)); +#ifdef _WIN64 + externalSemaphoreHandleDesc.type = IsWindows8OrGreater() ? cudaExternalSemaphoreHandleTypeOpaqueWin32 : cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; + externalSemaphoreHandleDesc.handle.win32.handle = getVkSemaphoreHandle(IsWindows8OrGreater() ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT, + cudaUpdateVkVertexBufSemaphore); +#else + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; + externalSemaphoreHandleDesc.handle.fd = getVkSemaphoreHandle(VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT, cudaUpdateVkVertexBufSemaphore); +#endif + externalSemaphoreHandleDesc.flags = 0; + + checkCudaErrors(cudaImportExternalSemaphore(&cudaExtCudaUpdateVkVertexBufSemaphore, &externalSemaphoreHandleDesc)); + + memset(&externalSemaphoreHandleDesc, 0, sizeof(externalSemaphoreHandleDesc)); +#ifdef _WIN64 + externalSemaphoreHandleDesc.type = IsWindows8OrGreater() ? cudaExternalSemaphoreHandleTypeOpaqueWin32 : cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt;; + externalSemaphoreHandleDesc.handle.win32.handle = getVkSemaphoreHandle(IsWindows8OrGreater() ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT, + vkUpdateCudaVertexBufSemaphore); +#else + externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; + externalSemaphoreHandleDesc.handle.fd = getVkSemaphoreHandle(VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT, vkUpdateCudaVertexBufSemaphore); +#endif + externalSemaphoreHandleDesc.flags = 0; + checkCudaErrors(cudaImportExternalSemaphore(&cudaExtVkUpdateCudaVertexBufSemaphore, &externalSemaphoreHandleDesc)); + printf("CUDA Imported Vulkan semaphore\n"); + } + +#ifdef _WIN64 // For windows + HANDLE getVkMemHandle(VkExternalMemoryHandleTypeFlagsKHR externalMemoryHandleType) + { + HANDLE handle; + + VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {}; + vkMemoryGetWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR; + vkMemoryGetWin32HandleInfoKHR.pNext = NULL; + vkMemoryGetWin32HandleInfoKHR.memory = vertexBufferMemory; + vkMemoryGetWin32HandleInfoKHR.handleType = (VkExternalMemoryHandleTypeFlagBitsKHR)externalMemoryHandleType; + + fpGetMemoryWin32HandleKHR(device, &vkMemoryGetWin32HandleInfoKHR, &handle); + return handle; + } +#else + int getVkMemHandle(VkExternalMemoryHandleTypeFlagsKHR externalMemoryHandleType) + { + if (externalMemoryHandleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT) { + int fd; + + VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {}; + vkMemoryGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR; + vkMemoryGetFdInfoKHR.pNext = NULL; + vkMemoryGetFdInfoKHR.memory = vertexBufferMemory; + vkMemoryGetFdInfoKHR.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; + + fpGetMemoryFdKHR(device, &vkMemoryGetFdInfoKHR, &fd); + + return fd; + } + return -1; + } +#endif + + +#ifdef _WIN64 + HANDLE getVkSemaphoreHandle(VkExternalSemaphoreHandleTypeFlagBitsKHR externalSemaphoreHandleType, VkSemaphore &semVkCuda) + { + HANDLE handle; + + VkSemaphoreGetWin32HandleInfoKHR vulkanSemaphoreGetWin32HandleInfoKHR = {}; + vulkanSemaphoreGetWin32HandleInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR; + vulkanSemaphoreGetWin32HandleInfoKHR.pNext = NULL; + vulkanSemaphoreGetWin32HandleInfoKHR.semaphore = semVkCuda; + vulkanSemaphoreGetWin32HandleInfoKHR.handleType = externalSemaphoreHandleType; + + fpGetSemaphoreWin32HandleKHR(device, &vulkanSemaphoreGetWin32HandleInfoKHR, &handle); + + return handle; + } +#else + int getVkSemaphoreHandle(VkExternalSemaphoreHandleTypeFlagBitsKHR externalSemaphoreHandleType, VkSemaphore &semVkCuda) + { + if (externalSemaphoreHandleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { + int fd; + + VkSemaphoreGetFdInfoKHR vulkanSemaphoreGetFdInfoKHR = {}; + vulkanSemaphoreGetFdInfoKHR.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR; + vulkanSemaphoreGetFdInfoKHR.pNext = NULL; + vulkanSemaphoreGetFdInfoKHR.semaphore = semVkCuda; + vulkanSemaphoreGetFdInfoKHR.handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; + + fpGetSemaphoreFdKHR(device, &vulkanSemaphoreGetFdInfoKHR, &fd); + + return fd; + } + return -1; + } +#endif + + void cudaVkSemaphoreSignal(cudaExternalSemaphore_t &extSemaphore) + { + cudaExternalSemaphoreSignalParams extSemaphoreSignalParams; + memset(&extSemaphoreSignalParams, 0, sizeof(extSemaphoreSignalParams)); + + extSemaphoreSignalParams.params.fence.value = 0; + extSemaphoreSignalParams.flags = 0; + checkCudaErrors(cudaSignalExternalSemaphoresAsync(&extSemaphore, &extSemaphoreSignalParams, 1, streamToRun)); + } + + void cudaVkSemaphoreWait(cudaExternalSemaphore_t &extSemaphore) + { + cudaExternalSemaphoreWaitParams extSemaphoreWaitParams; + + memset(&extSemaphoreWaitParams, 0, sizeof(extSemaphoreWaitParams)); + + extSemaphoreWaitParams.params.fence.value = 0; + extSemaphoreWaitParams.flags = 0; + + checkCudaErrors(cudaWaitExternalSemaphoresAsync(&extSemaphore, &extSemaphoreWaitParams, 1, streamToRun)); + } + + void cudaUpdateVertexBuffer() + { + cudaVkSemaphoreWait(cudaExtVkUpdateCudaVertexBufSemaphore); + + dim3 block(16, 16, 1); + dim3 grid(mesh_width/block.x, mesh_height/block.y, 1); + Vertex *pos = (Vertex*) cudaDevVertptr; + AnimTime += 0.01f; + sinewave_gen_kernel<<>>(pos, mesh_width, mesh_height, AnimTime); + cudaVkSemaphoreSignal(cudaExtCudaUpdateVkVertexBufSemaphore); + } + + void cleanup() { + if (enableValidationLayers) { + DestroyDebugReportCallbackEXT(instance, callback, nullptr); + } + + vkDestroySemaphore(device, renderFinishedSemaphore, nullptr); + vkDestroySemaphore(device, imageAvailableSemaphore, nullptr); + checkCudaErrors(cudaDestroyExternalSemaphore(cudaExtCudaUpdateVkVertexBufSemaphore)); + vkDestroySemaphore(device, cudaUpdateVkVertexBufSemaphore, nullptr); + checkCudaErrors(cudaDestroyExternalSemaphore(cudaExtVkUpdateCudaVertexBufSemaphore)); + vkDestroySemaphore(device, vkUpdateCudaVertexBufSemaphore, nullptr); + + vkDestroyCommandPool(device, commandPool, nullptr); + for (auto framebuffer : swapChainFramebuffers) { + vkDestroyFramebuffer(device, framebuffer, nullptr); + } + for (auto imageView : swapChainImageViews) { + vkDestroyImageView(device, imageView, nullptr); + } + vkDestroyPipeline(device, graphicsPipeline, nullptr); + vkDestroyPipelineLayout(device, pipelineLayout, nullptr); + vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr); + vkDestroyBuffer(device, uniformBuffer, nullptr); + vkFreeMemory(device, uniformBufferMemory, nullptr); + vkDestroyRenderPass(device, renderPass, nullptr); + vkDestroySwapchainKHR(device, swapChain, nullptr); + checkCudaErrors(cudaDestroyExternalMemory(cudaExtMemVertexBuffer)); + vkDestroyBuffer(device, vertexBuffer, nullptr); + vkFreeMemory(device, vertexBufferMemory, nullptr); + vkDestroyDescriptorPool(device, descriptorPool, nullptr); + vkDestroyDevice(device, nullptr); + vkDestroySurfaceKHR(instance, surface, nullptr); + vkDestroyInstance(instance, nullptr); + glfwDestroyWindow(window); + glfwTerminate(); + } +}; + +int main(int argc, char* argv[]) { + execution_path = argv[0]; + vulkanCudaApp app; + + try { + app.run(); + } catch (const std::runtime_error& e) { + std::cerr << e.what() << std::endl; + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} \ No newline at end of file