From f4244303536cb84cb6a68b989508b720f0ecdcc5 Mon Sep 17 00:00:00 2001
From: kwoodle <kwoodle@aya.yale.edu>
Date: Tue, 12 May 2020 10:39:20 -0400
Subject: [PATCH] Added simpleGL and vulkanCUDASinewave

---
 CMakeLists.txt        |   14 +-
 linmath.h             |  505 ++++++++++++
 shader_sine.frag      |   11 +
 shader_sine.vert      |   23 +
 simpleGL.cu           |  590 ++++++++++++++
 vulkanCUDASinewave.cu | 1711 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 2853 insertions(+), 1 deletion(-)
 create mode 100644 linmath.h
 create mode 100644 shader_sine.frag
 create mode 100644 shader_sine.vert
 create mode 100644 simpleGL.cu
 create mode 100644 vulkanCUDASinewave.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c97a31a..eacaef3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,10 +23,22 @@ add_executable(asyncAPI asyncAPI.cu)
 find_package(MPI REQUIRED)
 include_directories(${MPI_CXX_INCLUDE_DIRS})
 
-add_executable(simpleMPI simpleMPI.cpp simpleMPI.h simpleMPI.cu)
+add_executable(simpleMPI simpleMPI.cpp simpleMPI.cu)
 target_link_libraries(simpleMPI ${MPI_CXX_LIBRARIES})
 
 add_executable(cudaTensorCoreGemm cudaTensorCoreGemm.cu)
 target_link_libraries(cudaTensorCoreGemm)
 
 add_executable(bandwidthTest bandwidthTest.cu)
+
+find_package(OpenGL REQUIRED)
+add_executable(simpleGL simpleGL.cu)
+target_link_libraries(simpleGL GL GLU glut)
+
+set(VULKANSDK = /kws_space/vulkansdk_1.2.135.0/x86_64)
+add_executable(simpleVulkan vulkanCUDASinewave.cu)
+target_include_directories(simpleVulkan PRIVATE "${VULKANSDK}/include" )
+target_link_directories(simpleVulkan PRIVATE "${VULKANSDK}/lib")
+target_link_libraries(simpleVulkan vulkan glfw)
+
+add_executable(interval )
\ No newline at end of file
diff --git a/linmath.h b/linmath.h
new file mode 100644
index 0000000..b4f4d16
--- /dev/null
+++ b/linmath.h
@@ -0,0 +1,505 @@
+//
+// Created by kwoodle on 5/12/20.
+//
+
+/*
+ * Copyright (c) 2015-2016 The Khronos Group Inc.
+ * Copyright (c) 2015-2016 Valve Corporation
+ * Copyright (c) 2015-2016 LunarG, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Relicensed from the WTFPL (http://www.wtfpl.net/faq/).
+ */
+
+#ifndef LINMATH_H
+#define LINMATH_H
+
+#include <math.h>
+
+// Converts degrees to radians.
+#define degreesToRadians(angleDegrees) (angleDegrees * M_PI / 180.0)
+
+// Converts radians to degrees.
+#define radiansToDegrees(angleRadians) (angleRadians * 180.0 / M_PI)
+
+typedef float vec3[3];
+static inline void vec3_add(vec3 r, vec3 const a, vec3 const b) {
+    int i;
+    for (i = 0; i < 3; ++i) r[i] = a[i] + b[i];
+}
+static inline void vec3_sub(vec3 r, vec3 const a, vec3 const b) {
+    int i;
+    for (i = 0; i < 3; ++i) r[i] = a[i] - b[i];
+}
+static inline void vec3_scale(vec3 r, vec3 const v, float const s) {
+    int i;
+    for (i = 0; i < 3; ++i) r[i] = v[i] * s;
+}
+static inline float vec3_mul_inner(vec3 const a, vec3 const b) {
+    float p = 0.f;
+    int i;
+    for (i = 0; i < 3; ++i) p += b[i] * a[i];
+    return p;
+}
+static inline void vec3_mul_cross(vec3 r, vec3 const a, vec3 const b) {
+    r[0] = a[1] * b[2] - a[2] * b[1];
+    r[1] = a[2] * b[0] - a[0] * b[2];
+    r[2] = a[0] * b[1] - a[1] * b[0];
+}
+static inline float vec3_len(vec3 const v) { return sqrtf(vec3_mul_inner(v, v)); }
+static inline void vec3_norm(vec3 r, vec3 const v) {
+    float k = 1.f / vec3_len(v);
+    vec3_scale(r, v, k);
+}
+static inline void vec3_reflect(vec3 r, vec3 const v, vec3 const n) {
+    float p = 2.f * vec3_mul_inner(v, n);
+    int i;
+    for (i = 0; i < 3; ++i) r[i] = v[i] - p * n[i];
+}
+
+typedef float vec4[4];
+static inline void vec4_add(vec4 r, vec4 const a, vec4 const b) {
+    int i;
+    for (i = 0; i < 4; ++i) r[i] = a[i] + b[i];
+}
+static inline void vec4_sub(vec4 r, vec4 const a, vec4 const b) {
+    int i;
+    for (i = 0; i < 4; ++i) r[i] = a[i] - b[i];
+}
+static inline void vec4_scale(vec4 r, vec4 v, float s) {
+    int i;
+    for (i = 0; i < 4; ++i) r[i] = v[i] * s;
+}
+static inline float vec4_mul_inner(vec4 a, vec4 b) {
+    float p = 0.f;
+    int i;
+    for (i = 0; i < 4; ++i) p += b[i] * a[i];
+    return p;
+}
+static inline void vec4_mul_cross(vec4 r, vec4 a, vec4 b) {
+    r[0] = a[1] * b[2] - a[2] * b[1];
+    r[1] = a[2] * b[0] - a[0] * b[2];
+    r[2] = a[0] * b[1] - a[1] * b[0];
+    r[3] = 1.f;
+}
+static inline float vec4_len(vec4 v) { return sqrtf(vec4_mul_inner(v, v)); }
+static inline void vec4_norm(vec4 r, vec4 v) {
+    float k = 1.f / vec4_len(v);
+    vec4_scale(r, v, k);
+}
+static inline void vec4_reflect(vec4 r, vec4 v, vec4 n) {
+    float p = 2.f * vec4_mul_inner(v, n);
+    int i;
+    for (i = 0; i < 4; ++i) r[i] = v[i] - p * n[i];
+}
+
+typedef vec4 mat4x4[4];
+static inline void mat4x4_identity(mat4x4 M) {
+    int i, j;
+    for (i = 0; i < 4; ++i)
+        for (j = 0; j < 4; ++j) M[i][j] = i == j ? 1.f : 0.f;
+}
+static inline void mat4x4_dup(mat4x4 M, mat4x4 N) {
+    int i, j;
+    for (i = 0; i < 4; ++i)
+        for (j = 0; j < 4; ++j) M[i][j] = N[i][j];
+}
+static inline void mat4x4_row(vec4 r, mat4x4 M, int i) {
+    int k;
+    for (k = 0; k < 4; ++k) r[k] = M[k][i];
+}
+static inline void mat4x4_col(vec4 r, mat4x4 M, int i) {
+    int k;
+    for (k = 0; k < 4; ++k) r[k] = M[i][k];
+}
+static inline void mat4x4_transpose(mat4x4 M, mat4x4 N) {
+    int i, j;
+    for (j = 0; j < 4; ++j)
+        for (i = 0; i < 4; ++i) M[i][j] = N[j][i];
+}
+static inline void mat4x4_add(mat4x4 M, mat4x4 a, mat4x4 b) {
+    int i;
+    for (i = 0; i < 4; ++i) vec4_add(M[i], a[i], b[i]);
+}
+static inline void mat4x4_sub(mat4x4 M, mat4x4 a, mat4x4 b) {
+    int i;
+    for (i = 0; i < 4; ++i) vec4_sub(M[i], a[i], b[i]);
+}
+static inline void mat4x4_scale(mat4x4 M, mat4x4 a, float k) {
+    int i;
+    for (i = 0; i < 4; ++i) vec4_scale(M[i], a[i], k);
+}
+static inline void mat4x4_scale_aniso(mat4x4 M, mat4x4 a, float x, float y, float z) {
+    int i;
+    vec4_scale(M[0], a[0], x);
+    vec4_scale(M[1], a[1], y);
+    vec4_scale(M[2], a[2], z);
+    for (i = 0; i < 4; ++i) {
+        M[3][i] = a[3][i];
+    }
+}
+static inline void mat4x4_mul(mat4x4 M, mat4x4 a, mat4x4 b) {
+    int k, r, c;
+    for (c = 0; c < 4; ++c)
+        for (r = 0; r < 4; ++r) {
+            M[c][r] = 0.f;
+            for (k = 0; k < 4; ++k) M[c][r] += a[k][r] * b[c][k];
+        }
+}
+static inline void mat4x4_mul_vec4(vec4 r, mat4x4 M, vec4 v) {
+    int i, j;
+    for (j = 0; j < 4; ++j) {
+        r[j] = 0.f;
+        for (i = 0; i < 4; ++i) r[j] += M[i][j] * v[i];
+    }
+}
+static inline void mat4x4_translate(mat4x4 T, float x, float y, float z) {
+    mat4x4_identity(T);
+    T[3][0] = x;
+    T[3][1] = y;
+    T[3][2] = z;
+}
+static inline void mat4x4_translate_in_place(mat4x4 M, float x, float y, float z) {
+    vec4 t = {x, y, z, 0};
+    vec4 r;
+    int i;
+    for (i = 0; i < 4; ++i) {
+        mat4x4_row(r, M, i);
+        M[3][i] += vec4_mul_inner(r, t);
+    }
+}
+static inline void mat4x4_from_vec3_mul_outer(mat4x4 M, vec3 a, vec3 b) {
+    int i, j;
+    for (i = 0; i < 4; ++i)
+        for (j = 0; j < 4; ++j) M[i][j] = i < 3 && j < 3 ? a[i] * b[j] : 0.f;
+}
+static inline void mat4x4_rotate(mat4x4 R, mat4x4 M, float x, float y, float z, float angle) {
+    float s = sinf(angle);
+    float c = cosf(angle);
+    vec3 u = {x, y, z};
+
+    if (vec3_len(u) > 1e-4) {
+        vec3_norm(u, u);
+        mat4x4 T;
+        mat4x4_from_vec3_mul_outer(T, u, u);
+
+        mat4x4 S = {{0, u[2], -u[1], 0}, {-u[2], 0, u[0], 0}, {u[1], -u[0], 0, 0}, {0, 0, 0, 0}};
+        mat4x4_scale(S, S, s);
+
+        mat4x4 C;
+        mat4x4_identity(C);
+        mat4x4_sub(C, C, T);
+
+        mat4x4_scale(C, C, c);
+
+        mat4x4_add(T, T, C);
+        mat4x4_add(T, T, S);
+
+        T[3][3] = 1.;
+        mat4x4_mul(R, M, T);
+    } else {
+        mat4x4_dup(R, M);
+    }
+}
+static inline void mat4x4_rotate_X(mat4x4 Q, mat4x4 M, float angle) {
+    float s = sinf(angle);
+    float c = cosf(angle);
+    mat4x4 R = {{1.f, 0.f, 0.f, 0.f}, {0.f, c, s, 0.f}, {0.f, -s, c, 0.f}, {0.f, 0.f, 0.f, 1.f}};
+    mat4x4_mul(Q, M, R);
+}
+static inline void mat4x4_rotate_Y(mat4x4 Q, mat4x4 M, float angle) {
+    float s = sinf(angle);
+    float c = cosf(angle);
+    mat4x4 R = {{c, 0.f, s, 0.f}, {0.f, 1.f, 0.f, 0.f}, {-s, 0.f, c, 0.f}, {0.f, 0.f, 0.f, 1.f}};
+    mat4x4_mul(Q, M, R);
+}
+static inline void mat4x4_rotate_Z(mat4x4 Q, mat4x4 M, float angle) {
+    float s = sinf(angle);
+    float c = cosf(angle);
+    mat4x4 R = {{c, s, 0.f, 0.f}, {-s, c, 0.f, 0.f}, {0.f, 0.f, 1.f, 0.f}, {0.f, 0.f, 0.f, 1.f}};
+    mat4x4_mul(Q, M, R);
+}
+static inline void mat4x4_invert(mat4x4 T, mat4x4 M) {
+    float s[6];
+    float c[6];
+    s[0] = M[0][0] * M[1][1] - M[1][0] * M[0][1];
+    s[1] = M[0][0] * M[1][2] - M[1][0] * M[0][2];
+    s[2] = M[0][0] * M[1][3] - M[1][0] * M[0][3];
+    s[3] = M[0][1] * M[1][2] - M[1][1] * M[0][2];
+    s[4] = M[0][1] * M[1][3] - M[1][1] * M[0][3];
+    s[5] = M[0][2] * M[1][3] - M[1][2] * M[0][3];
+
+    c[0] = M[2][0] * M[3][1] - M[3][0] * M[2][1];
+    c[1] = M[2][0] * M[3][2] - M[3][0] * M[2][2];
+    c[2] = M[2][0] * M[3][3] - M[3][0] * M[2][3];
+    c[3] = M[2][1] * M[3][2] - M[3][1] * M[2][2];
+    c[4] = M[2][1] * M[3][3] - M[3][1] * M[2][3];
+    c[5] = M[2][2] * M[3][3] - M[3][2] * M[2][3];
+
+    /* Assumes it is invertible */
+    float idet = 1.0f / (s[0] * c[5] - s[1] * c[4] + s[2] * c[3] + s[3] * c[2] - s[4] * c[1] + s[5] * c[0]);
+
+    T[0][0] = (M[1][1] * c[5] - M[1][2] * c[4] + M[1][3] * c[3]) * idet;
+    T[0][1] = (-M[0][1] * c[5] + M[0][2] * c[4] - M[0][3] * c[3]) * idet;
+    T[0][2] = (M[3][1] * s[5] - M[3][2] * s[4] + M[3][3] * s[3]) * idet;
+    T[0][3] = (-M[2][1] * s[5] + M[2][2] * s[4] - M[2][3] * s[3]) * idet;
+
+    T[1][0] = (-M[1][0] * c[5] + M[1][2] * c[2] - M[1][3] * c[1]) * idet;
+    T[1][1] = (M[0][0] * c[5] - M[0][2] * c[2] + M[0][3] * c[1]) * idet;
+    T[1][2] = (-M[3][0] * s[5] + M[3][2] * s[2] - M[3][3] * s[1]) * idet;
+    T[1][3] = (M[2][0] * s[5] - M[2][2] * s[2] + M[2][3] * s[1]) * idet;
+
+    T[2][0] = (M[1][0] * c[4] - M[1][1] * c[2] + M[1][3] * c[0]) * idet;
+    T[2][1] = (-M[0][0] * c[4] + M[0][1] * c[2] - M[0][3] * c[0]) * idet;
+    T[2][2] = (M[3][0] * s[4] - M[3][1] * s[2] + M[3][3] * s[0]) * idet;
+    T[2][3] = (-M[2][0] * s[4] + M[2][1] * s[2] - M[2][3] * s[0]) * idet;
+
+    T[3][0] = (-M[1][0] * c[3] + M[1][1] * c[1] - M[1][2] * c[0]) * idet;
+    T[3][1] = (M[0][0] * c[3] - M[0][1] * c[1] + M[0][2] * c[0]) * idet;
+    T[3][2] = (-M[3][0] * s[3] + M[3][1] * s[1] - M[3][2] * s[0]) * idet;
+    T[3][3] = (M[2][0] * s[3] - M[2][1] * s[1] + M[2][2] * s[0]) * idet;
+}
+static inline void mat4x4_orthonormalize(mat4x4 R, mat4x4 M) {
+    mat4x4_dup(R, M);
+    float s = 1.;
+    vec3 h;
+
+    vec3_norm(R[2], R[2]);
+
+    s = vec3_mul_inner(R[1], R[2]);
+    vec3_scale(h, R[2], s);
+    vec3_sub(R[1], R[1], h);
+    vec3_norm(R[2], R[2]);
+
+    s = vec3_mul_inner(R[1], R[2]);
+    vec3_scale(h, R[2], s);
+    vec3_sub(R[1], R[1], h);
+    vec3_norm(R[1], R[1]);
+
+    s = vec3_mul_inner(R[0], R[1]);
+    vec3_scale(h, R[1], s);
+    vec3_sub(R[0], R[0], h);
+    vec3_norm(R[0], R[0]);
+}
+
+static inline void mat4x4_frustum(mat4x4 M, float l, float r, float b, float t, float n, float f) {
+    M[0][0] = 2.f * n / (r - l);
+    M[0][1] = M[0][2] = M[0][3] = 0.f;
+
+    M[1][1] = 2.f * n / (t - b);
+    M[1][0] = M[1][2] = M[1][3] = 0.f;
+
+    M[2][0] = (r + l) / (r - l);
+    M[2][1] = (t + b) / (t - b);
+    M[2][2] = -(f + n) / (f - n);
+    M[2][3] = -1.f;
+
+    M[3][2] = -2.f * (f * n) / (f - n);
+    M[3][0] = M[3][1] = M[3][3] = 0.f;
+}
+static inline void mat4x4_ortho(mat4x4 M, float l, float r, float b, float t, float n, float f) {
+    M[0][0] = 2.f / (r - l);
+    M[0][1] = M[0][2] = M[0][3] = 0.f;
+
+    M[1][1] = 2.f / (t - b);
+    M[1][0] = M[1][2] = M[1][3] = 0.f;
+
+    M[2][2] = -2.f / (f - n);
+    M[2][0] = M[2][1] = M[2][3] = 0.f;
+
+    M[3][0] = -(r + l) / (r - l);
+    M[3][1] = -(t + b) / (t - b);
+    M[3][2] = -(f + n) / (f - n);
+    M[3][3] = 1.f;
+}
+static inline void mat4x4_perspective(mat4x4 m, float y_fov, float aspect, float n, float f) {
+    /* NOTE: Degrees are an unhandy unit to work with.
+     * linmath.h uses radians for everything! */
+    float const a = (float)(1.f / tan(y_fov / 2.f));
+
+    m[0][0] = a / aspect;
+    m[0][1] = 0.f;
+    m[0][2] = 0.f;
+    m[0][3] = 0.f;
+
+    m[1][0] = 0.f;
+    m[1][1] = a;
+    m[1][2] = 0.f;
+    m[1][3] = 0.f;
+
+    m[2][0] = 0.f;
+    m[2][1] = 0.f;
+    m[2][2] = -((f + n) / (f - n));
+    m[2][3] = -1.f;
+
+    m[3][0] = 0.f;
+    m[3][1] = 0.f;
+    m[3][2] = -((2.f * f * n) / (f - n));
+    m[3][3] = 0.f;
+}
+static inline void mat4x4_look_at(mat4x4 m, vec3 eye, vec3 center, vec3 up) {
+    /* Adapted from Android's OpenGL Matrix.java.                        */
+    /* See the OpenGL GLUT documentation for gluLookAt for a description */
+    /* of the algorithm. We implement it in a straightforward way:       */
+
+    /* TODO: The negation of of can be spared by swapping the order of
+     *       operands in the following cross products in the right way. */
+    vec3 f;
+    vec3_sub(f, center, eye);
+    vec3_norm(f, f);
+
+    vec3 s;
+    vec3_mul_cross(s, f, up);
+    vec3_norm(s, s);
+
+    vec3 t;
+    vec3_mul_cross(t, s, f);
+
+    m[0][0] = s[0];
+    m[0][1] = t[0];
+    m[0][2] = -f[0];
+    m[0][3] = 0.f;
+
+    m[1][0] = s[1];
+    m[1][1] = t[1];
+    m[1][2] = -f[1];
+    m[1][3] = 0.f;
+
+    m[2][0] = s[2];
+    m[2][1] = t[2];
+    m[2][2] = -f[2];
+    m[2][3] = 0.f;
+
+    m[3][0] = 0.f;
+    m[3][1] = 0.f;
+    m[3][2] = 0.f;
+    m[3][3] = 1.f;
+
+    mat4x4_translate_in_place(m, -eye[0], -eye[1], -eye[2]);
+}
+
+typedef float quat[4];
+static inline void quat_identity(quat q) {
+    q[0] = q[1] = q[2] = 0.f;
+    q[3] = 1.f;
+}
+static inline void quat_add(quat r, quat a, quat b) {
+    int i;
+    for (i = 0; i < 4; ++i) r[i] = a[i] + b[i];
+}
+static inline void quat_sub(quat r, quat a, quat b) {
+    int i;
+    for (i = 0; i < 4; ++i) r[i] = a[i] - b[i];
+}
+static inline void quat_mul(quat r, quat p, quat q) {
+    vec3 w;
+    vec3_mul_cross(r, p, q);
+    vec3_scale(w, p, q[3]);
+    vec3_add(r, r, w);
+    vec3_scale(w, q, p[3]);
+    vec3_add(r, r, w);
+    r[3] = p[3] * q[3] - vec3_mul_inner(p, q);
+}
+static inline void quat_scale(quat r, quat v, float s) {
+    int i;
+    for (i = 0; i < 4; ++i) r[i] = v[i] * s;
+}
+static inline float quat_inner_product(quat a, quat b) {
+    float p = 0.f;
+    int i;
+    for (i = 0; i < 4; ++i) p += b[i] * a[i];
+    return p;
+}
+static inline void quat_conj(quat r, quat q) {
+    int i;
+    for (i = 0; i < 3; ++i) r[i] = -q[i];
+    r[3] = q[3];
+}
+#define quat_norm vec4_norm
+static inline void quat_mul_vec3(vec3 r, quat q, vec3 v) {
+    quat v_ = {v[0], v[1], v[2], 0.f};
+
+    quat_conj(r, q);
+    quat_norm(r, r);
+    quat_mul(r, v_, r);
+    quat_mul(r, q, r);
+}
+static inline void mat4x4_from_quat(mat4x4 M, quat q) {
+    float a = q[3];
+    float b = q[0];
+    float c = q[1];
+    float d = q[2];
+    float a2 = a * a;
+    float b2 = b * b;
+    float c2 = c * c;
+    float d2 = d * d;
+
+    M[0][0] = a2 + b2 - c2 - d2;
+    M[0][1] = 2.f * (b * c + a * d);
+    M[0][2] = 2.f * (b * d - a * c);
+    M[0][3] = 0.f;
+
+    M[1][0] = 2 * (b * c - a * d);
+    M[1][1] = a2 - b2 + c2 - d2;
+    M[1][2] = 2.f * (c * d + a * b);
+    M[1][3] = 0.f;
+
+    M[2][0] = 2.f * (b * d + a * c);
+    M[2][1] = 2.f * (c * d - a * b);
+    M[2][2] = a2 - b2 - c2 + d2;
+    M[2][3] = 0.f;
+
+    M[3][0] = M[3][1] = M[3][2] = 0.f;
+    M[3][3] = 1.f;
+}
+
+static inline void mat4x4o_mul_quat(mat4x4 R, mat4x4 M, quat q) {
+    /*  XXX: The way this is written only works for othogonal matrices. */
+    /* TODO: Take care of non-orthogonal case. */
+    quat_mul_vec3(R[0], q, M[0]);
+    quat_mul_vec3(R[1], q, M[1]);
+    quat_mul_vec3(R[2], q, M[2]);
+
+    R[3][0] = R[3][1] = R[3][2] = 0.f;
+    R[3][3] = 1.f;
+}
+static inline void quat_from_mat4x4(quat q, mat4x4 M) {
+    float r = 0.f;
+    int i;
+
+    int perm[] = {0, 1, 2, 0, 1};
+    int *p = perm;
+
+    for (i = 0; i < 3; i++) {
+        float m = M[i][i];
+        if (m < r) continue;
+        m = r;
+        p = &perm[i];
+    }
+
+    r = sqrtf(1.f + M[p[0]][p[0]] - M[p[1]][p[1]] - M[p[2]][p[2]]);
+
+    if (r < 1e-6) {
+        q[0] = 1.f;
+        q[1] = q[2] = q[3] = 0.f;
+        return;
+    }
+
+    q[0] = r / 2.f;
+    q[1] = (M[p[0]][p[1]] - M[p[1]][p[0]]) / (2.f * r);
+    q[2] = (M[p[2]][p[0]] - M[p[0]][p[2]]) / (2.f * r);
+    q[3] = (M[p[2]][p[1]] - M[p[1]][p[2]]) / (2.f * r);
+}
+
+#endif
diff --git a/shader_sine.frag b/shader_sine.frag
new file mode 100644
index 0000000..b096569
--- /dev/null
+++ b/shader_sine.frag
@@ -0,0 +1,11 @@
+#version 450
+#extension GL_ARB_separate_shader_objects : enable
+#extension GL_NV_gpu_shader5 : enable
+
+layout(location = 0) in vec3 fragColor;
+
+layout(location = 0) out vec4 outColor;
+
+void main() {
+    outColor = vec4(fragColor, 1.0);
+}
\ No newline at end of file
diff --git a/shader_sine.vert b/shader_sine.vert
new file mode 100644
index 0000000..849558b
--- /dev/null
+++ b/shader_sine.vert
@@ -0,0 +1,23 @@
+#version 450
+#extension GL_ARB_separate_shader_objects : enable
+#extension GL_NV_gpu_shader5 : enable
+
+layout(binding = 0) uniform UniformBufferObject {
+    mat4 model;
+    mat4 view;
+    mat4 proj;
+} ubo;
+
+layout(location = 0) in vec4 inPosition;
+layout(location = 1) in vec3 inColor;
+
+layout(location = 0) out vec3 fragColor;
+
+out gl_PerVertex {
+    vec4 gl_Position;
+};
+
+void main() {
+    gl_Position = ubo.proj * ubo.view * ubo.model * inPosition;
+    fragColor = inColor;
+}
\ No newline at end of file
diff --git a/simpleGL.cu b/simpleGL.cu
new file mode 100644
index 0000000..e38e4fe
--- /dev/null
+++ b/simpleGL.cu
@@ -0,0 +1,590 @@
+//
+// Created by kwoodle on 5/12/20.
+//
+////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+//
+// Please refer to the NVIDIA end user license agreement (EULA) associated
+// with this source code for terms and conditions that govern your use of
+// this software. Any use, reproduction, disclosure, or distribution of
+// this software and related documentation outside the terms of the EULA
+// is strictly prohibited.
+//
+////////////////////////////////////////////////////////////////////////////
+
+/*
+    This example demonstrates how to use the Cuda OpenGL bindings to
+    dynamically modify a vertex buffer using a Cuda kernel.
+
+    The steps are:
+    1. Create an empty vertex buffer object (VBO)
+    2. Register the VBO with Cuda
+    3. Map the VBO for writing from Cuda
+    4. Run Cuda kernel to modify the vertex positions
+    5. Unmap the VBO
+    6. Render the results using OpenGL
+
+    Host code
+*/
+
+// includes, system
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#ifdef _WIN32
+#  define WINDOWS_LEAN_AND_MEAN
+#  define NOMINMAX
+#  include <windows.h>
+#endif
+
+// OpenGL Graphics includes
+#include <helper_gl.h>
+#if defined (__APPLE__) || defined(MACOSX)
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+  #include <GLUT/glut.h>
+  #ifndef glutCloseFunc
+  #define glutCloseFunc glutWMCloseFunc
+  #endif
+#else
+#include <GL/freeglut.h>
+#endif
+
+// includes, cuda
+#include <cuda_runtime.h>
+#include <cuda_gl_interop.h>
+
+// Utilities and timing functions
+#include <helper_functions.h>    // includes cuda.h and cuda_runtime_api.h
+
+// CUDA helper functions
+#include <helper_cuda.h>         // helper functions for CUDA error check
+
+#include <vector_types.h>
+
+#define MAX_EPSILON_ERROR 10.0f
+#define THRESHOLD          0.30f
+#define REFRESH_DELAY     10 //ms
+
+////////////////////////////////////////////////////////////////////////////////
+// constants
+const unsigned int window_width  = 512;
+const unsigned int window_height = 512;
+
+const unsigned int mesh_width    = 256;
+const unsigned int mesh_height   = 256;
+
+// vbo variables
+GLuint vbo;
+struct cudaGraphicsResource *cuda_vbo_resource;
+void *d_vbo_buffer = NULL;
+
+float g_fAnim = 0.0;
+
+// mouse controls
+int mouse_old_x, mouse_old_y;
+int mouse_buttons = 0;
+float rotate_x = 0.0, rotate_y = 0.0;
+float translate_z = -3.0;
+
+StopWatchInterface *timer = NULL;
+
+// Auto-Verification Code
+int fpsCount = 0;        // FPS count for averaging
+int fpsLimit = 1;        // FPS limit for sampling
+int g_Index = 0;
+float avgFPS = 0.0f;
+unsigned int frameCount = 0;
+unsigned int g_TotalErrors = 0;
+bool g_bQAReadback = false;
+
+int *pArgc = NULL;
+char **pArgv = NULL;
+
+#define MAX(a,b) ((a > b) ? a : b)
+
+////////////////////////////////////////////////////////////////////////////////
+// declaration, forward
+bool runTest(int argc, char **argv, char *ref_file);
+void cleanup();
+
+// GL functionality
+bool initGL(int *argc, char **argv);
+void createVBO(GLuint *vbo, struct cudaGraphicsResource **vbo_res,
+               unsigned int vbo_res_flags);
+void deleteVBO(GLuint *vbo, struct cudaGraphicsResource *vbo_res);
+
+// rendering callbacks
+void display();
+void keyboard(unsigned char key, int x, int y);
+void mouse(int button, int state, int x, int y);
+void motion(int x, int y);
+void timerEvent(int value);
+
+// Cuda functionality
+void runCuda(struct cudaGraphicsResource **vbo_resource);
+void runAutoTest(int devID, char **argv, char *ref_file);
+void checkResultCuda(int argc, char **argv, const GLuint &vbo);
+
+const char *sSDKsample = "simpleGL (VBO)";
+
+///////////////////////////////////////////////////////////////////////////////
+//! Simple kernel to modify vertex positions in sine wave pattern
+//! @param data  data in global memory
+///////////////////////////////////////////////////////////////////////////////
+__global__ void simple_vbo_kernel(float4 *pos, unsigned int width, unsigned int height, float time)
+{
+    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+    // calculate uv coordinates
+    float u = x / (float) width;
+    float v = y / (float) height;
+    u = u*2.0f - 1.0f;
+    v = v*2.0f - 1.0f;
+
+    // calculate simple sine wave pattern
+    float freq = 4.0f;
+    float w = sinf(u*freq + time) * cosf(v*freq + time) * 0.5f;
+
+    // write output vertex
+    pos[y*width+x] = make_float4(u, w, v, 1.0f);
+}
+
+
+void launch_kernel(float4 *pos, unsigned int mesh_width,
+                   unsigned int mesh_height, float time)
+{
+    // execute the kernel
+    dim3 block(8, 8, 1);
+    dim3 grid(mesh_width / block.x, mesh_height / block.y, 1);
+    simple_vbo_kernel<<< grid, block>>>(pos, mesh_width, mesh_height, time);
+}
+
+bool checkHW(char *name, const char *gpuType, int dev)
+{
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, dev);
+    strcpy(name, deviceProp.name);
+
+    if (!STRNCASECMP(deviceProp.name, gpuType, strlen(gpuType)))
+    {
+        return true;
+    }
+    else
+    {
+        return false;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv)
+{
+    char *ref_file = NULL;
+
+    pArgc = &argc;
+    pArgv = argv;
+
+#if defined(__linux__)
+    setenv ("DISPLAY", ":0", 0);
+#endif
+
+    printf("%s starting...\n", sSDKsample);
+
+    if (argc > 1)
+    {
+        if (checkCmdLineFlag(argc, (const char **)argv, "file"))
+        {
+            // In this mode, we are running non-OpenGL and doing a compare of the VBO was generated correctly
+            getCmdLineArgumentString(argc, (const char **)argv, "file", (char **)&ref_file);
+        }
+    }
+
+    printf("\n");
+
+    runTest(argc, argv, ref_file);
+
+    printf("%s completed, returned %s\n", sSDKsample, (g_TotalErrors == 0) ? "OK" : "ERROR!");
+    exit(g_TotalErrors == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+void computeFPS()
+{
+    frameCount++;
+    fpsCount++;
+
+    if (fpsCount == fpsLimit)
+    {
+        avgFPS = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
+        fpsCount = 0;
+        fpsLimit = (int)MAX(avgFPS, 1.f);
+
+        sdkResetTimer(&timer);
+    }
+
+    char fps[256];
+    sprintf(fps, "Cuda GL Interop (VBO): %3.1f fps (Max 100Hz)", avgFPS);
+    glutSetWindowTitle(fps);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Initialize GL
+////////////////////////////////////////////////////////////////////////////////
+bool initGL(int *argc, char **argv)
+{
+    glutInit(argc, argv);
+    glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);
+    glutInitWindowSize(window_width, window_height);
+    glutCreateWindow("Cuda GL Interop (VBO)");
+    glutDisplayFunc(display);
+    glutKeyboardFunc(keyboard);
+    glutMotionFunc(motion);
+    glutTimerFunc(REFRESH_DELAY, timerEvent,0);
+
+    // initialize necessary OpenGL extensions
+    if (! isGLVersionSupported(2,0))
+    {
+        fprintf(stderr, "ERROR: Support for necessary OpenGL extensions missing.");
+        fflush(stderr);
+        return false;
+    }
+
+    // default initialization
+    glClearColor(0.0, 0.0, 0.0, 1.0);
+    glDisable(GL_DEPTH_TEST);
+
+    // viewport
+    glViewport(0, 0, window_width, window_height);
+
+    // projection
+    glMatrixMode(GL_PROJECTION);
+    glLoadIdentity();
+    gluPerspective(60.0, (GLfloat)window_width / (GLfloat) window_height, 0.1, 10.0);
+
+    SDK_CHECK_ERROR_GL();
+
+    return true;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//! Run a simple test for CUDA
+////////////////////////////////////////////////////////////////////////////////
+bool runTest(int argc, char **argv, char *ref_file)
+{
+    // Create the CUTIL timer
+    sdkCreateTimer(&timer);
+
+    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
+    int devID = findCudaDevice(argc, (const char **)argv);
+
+    // command line mode only
+    if (ref_file != NULL)
+    {
+        // create VBO
+        checkCudaErrors(cudaMalloc((void **)&d_vbo_buffer, mesh_width*mesh_height*4*sizeof(float)));
+
+        // run the cuda part
+        runAutoTest(devID, argv, ref_file);
+
+        // check result of Cuda step
+        checkResultCuda(argc, argv, vbo);
+
+        cudaFree(d_vbo_buffer);
+        d_vbo_buffer = NULL;
+    }
+    else
+    {
+        // First initialize OpenGL context, so we can properly set the GL for CUDA.
+        // This is necessary in order to achieve optimal performance with OpenGL/CUDA interop.
+        if (false == initGL(&argc, argv))
+        {
+            return false;
+        }
+
+        // register callbacks
+        glutDisplayFunc(display);
+        glutKeyboardFunc(keyboard);
+        glutMouseFunc(mouse);
+        glutMotionFunc(motion);
+#if defined (__APPLE__) || defined(MACOSX)
+        atexit(cleanup);
+#else
+        glutCloseFunc(cleanup);
+#endif
+
+        // create VBO
+        createVBO(&vbo, &cuda_vbo_resource, cudaGraphicsMapFlagsWriteDiscard);
+
+        // run the cuda part
+        runCuda(&cuda_vbo_resource);
+
+        // start rendering mainloop
+        glutMainLoop();
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Run the Cuda part of the computation
+////////////////////////////////////////////////////////////////////////////////
+void runCuda(struct cudaGraphicsResource **vbo_resource)
+{
+    // map OpenGL buffer object for writing from CUDA
+    float4 *dptr;
+    checkCudaErrors(cudaGraphicsMapResources(1, vbo_resource, 0));
+    size_t num_bytes;
+    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes,
+                                                         *vbo_resource));
+    //printf("CUDA mapped VBO: May access %ld bytes\n", num_bytes);
+
+    // execute the kernel
+    //    dim3 block(8, 8, 1);
+    //    dim3 grid(mesh_width / block.x, mesh_height / block.y, 1);
+    //    kernel<<< grid, block>>>(dptr, mesh_width, mesh_height, g_fAnim);
+
+    launch_kernel(dptr, mesh_width, mesh_height, g_fAnim);
+
+    // unmap buffer object
+    checkCudaErrors(cudaGraphicsUnmapResources(1, vbo_resource, 0));
+}
+
+#ifdef _WIN32
+#ifndef FOPEN
+#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)
+#endif
+#else
+#ifndef FOPEN
+#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
+#endif
+#endif
+
+void sdkDumpBin2(void *data, unsigned int bytes, const char *filename)
+{
+    printf("sdkDumpBin: <%s>\n", filename);
+    FILE *fp;
+    FOPEN(fp, filename, "wb");
+    fwrite(data, bytes, 1, fp);
+    fflush(fp);
+    fclose(fp);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Run the Cuda part of the computation
+////////////////////////////////////////////////////////////////////////////////
+void runAutoTest(int devID, char **argv, char *ref_file)
+{
+    char *reference_file = NULL;
+    void *imageData = malloc(mesh_width*mesh_height*sizeof(float));
+
+    // execute the kernel
+    launch_kernel((float4 *)d_vbo_buffer, mesh_width, mesh_height, g_fAnim);
+
+    cudaDeviceSynchronize();
+    getLastCudaError("launch_kernel failed");
+
+    checkCudaErrors(cudaMemcpy(imageData, d_vbo_buffer, mesh_width*mesh_height*sizeof(float), cudaMemcpyDeviceToHost));
+
+    sdkDumpBin2(imageData, mesh_width*mesh_height*sizeof(float), "simpleGL.bin");
+    reference_file = sdkFindFilePath(ref_file, argv[0]);
+
+    if (reference_file &&
+        !sdkCompareBin2BinFloat("simpleGL.bin", reference_file,
+                                mesh_width*mesh_height*sizeof(float),
+                                MAX_EPSILON_ERROR, THRESHOLD, pArgv[0]))
+    {
+        g_TotalErrors++;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Create VBO
+////////////////////////////////////////////////////////////////////////////////
+void createVBO(GLuint *vbo, struct cudaGraphicsResource **vbo_res,
+               unsigned int vbo_res_flags)
+{
+    assert(vbo);
+
+    // create buffer object
+    glGenBuffers(1, vbo);
+    glBindBuffer(GL_ARRAY_BUFFER, *vbo);
+
+    // initialize buffer object
+    unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
+    glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
+
+    glBindBuffer(GL_ARRAY_BUFFER, 0);
+
+    // register this buffer object with CUDA
+    checkCudaErrors(cudaGraphicsGLRegisterBuffer(vbo_res, *vbo, vbo_res_flags));
+
+    SDK_CHECK_ERROR_GL();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Delete VBO
+////////////////////////////////////////////////////////////////////////////////
+void deleteVBO(GLuint *vbo, struct cudaGraphicsResource *vbo_res)
+{
+
+    // unregister this buffer object with CUDA
+    checkCudaErrors(cudaGraphicsUnregisterResource(vbo_res));
+
+    glBindBuffer(1, *vbo);
+    glDeleteBuffers(1, vbo);
+
+    *vbo = 0;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Display callback
+////////////////////////////////////////////////////////////////////////////////
+void display()
+{
+    sdkStartTimer(&timer);
+
+    // run CUDA kernel to generate vertex positions
+    runCuda(&cuda_vbo_resource);
+
+    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+
+    // set view matrix
+    glMatrixMode(GL_MODELVIEW);
+    glLoadIdentity();
+    glTranslatef(0.0, 0.0, translate_z);
+    glRotatef(rotate_x, 1.0, 0.0, 0.0);
+    glRotatef(rotate_y, 0.0, 1.0, 0.0);
+
+    // render from the vbo
+    glBindBuffer(GL_ARRAY_BUFFER, vbo);
+    glVertexPointer(4, GL_FLOAT, 0, 0);
+
+    glEnableClientState(GL_VERTEX_ARRAY);
+    glColor3f(1.0, 0.0, 0.0);
+    glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height);
+    glDisableClientState(GL_VERTEX_ARRAY);
+
+    glutSwapBuffers();
+
+    g_fAnim += 0.01f;
+
+    sdkStopTimer(&timer);
+    computeFPS();
+}
+
+void timerEvent(int value)
+{
+    if (glutGetWindow())
+    {
+        glutPostRedisplay();
+        glutTimerFunc(REFRESH_DELAY, timerEvent,0);
+    }
+}
+
+void cleanup()
+{
+    sdkDeleteTimer(&timer);
+
+    if (vbo)
+    {
+        deleteVBO(&vbo, cuda_vbo_resource);
+    }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//! Keyboard events handler
+////////////////////////////////////////////////////////////////////////////////
+void keyboard(unsigned char key, int /*x*/, int /*y*/)
+{
+    switch (key)
+    {
+        case (27) :
+#if defined(__APPLE__) || defined(MACOSX)
+            exit(EXIT_SUCCESS);
+#else
+            glutDestroyWindow(glutGetWindow());
+            return;
+#endif
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Mouse event handlers
+////////////////////////////////////////////////////////////////////////////////
+void mouse(int button, int state, int x, int y)
+{
+    if (state == GLUT_DOWN)
+    {
+        mouse_buttons |= 1<<button;
+    }
+    else if (state == GLUT_UP)
+    {
+        mouse_buttons = 0;
+    }
+
+    mouse_old_x = x;
+    mouse_old_y = y;
+}
+
+void motion(int x, int y)
+{
+    float dx, dy;
+    dx = (float)(x - mouse_old_x);
+    dy = (float)(y - mouse_old_y);
+
+    if (mouse_buttons & 1)
+    {
+        rotate_x += dy * 0.2f;
+        rotate_y += dx * 0.2f;
+    }
+    else if (mouse_buttons & 4)
+    {
+        translate_z += dy * 0.01f;
+    }
+
+    mouse_old_x = x;
+    mouse_old_y = y;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Check if the result is correct or write data to file for external
+//! regression testing
+////////////////////////////////////////////////////////////////////////////////
+void checkResultCuda(int argc, char **argv, const GLuint &vbo)
+{
+    if (!d_vbo_buffer)
+    {
+        checkCudaErrors(cudaGraphicsUnregisterResource(cuda_vbo_resource));
+
+        // map buffer object
+        glBindBuffer(GL_ARRAY_BUFFER, vbo);
+        float *data = (float *) glMapBuffer(GL_ARRAY_BUFFER, GL_READ_ONLY);
+
+        // check result
+        if (checkCmdLineFlag(argc, (const char **) argv, "regression"))
+        {
+            // write file for regression test
+            sdkWriteFile<float>("./data/regression.dat",
+                                data, mesh_width * mesh_height * 3, 0.0, false);
+        }
+
+        // unmap GL buffer object
+        if (!glUnmapBuffer(GL_ARRAY_BUFFER))
+        {
+            fprintf(stderr, "Unmap buffer failed.\n");
+            fflush(stderr);
+        }
+
+        checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo,
+                                                     cudaGraphicsMapFlagsWriteDiscard));
+
+        SDK_CHECK_ERROR_GL();
+    }
+}
+
diff --git a/vulkanCUDASinewave.cu b/vulkanCUDASinewave.cu
new file mode 100644
index 0000000..bcbd9bf
--- /dev/null
+++ b/vulkanCUDASinewave.cu
@@ -0,0 +1,1711 @@
+//
+// Created by kwoodle on 5/12/20.
+//
+
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+#define GLFW_INCLUDE_VULKAN
+#ifdef _WIN64
+#include <windows.h>
+#include <VersionHelpers.h>
+#include <dxgi1_2.h>
+#include <aclapi.h>
+#define _USE_MATH_DEFINES
+#endif
+
+#include <GLFW/glfw3.h>
+#include <vulkan/vulkan.h>
+#ifdef _WIN64
+#include <vulkan/vulkan_win32.h>
+#endif
+
+#include <iostream>
+#include <stdexcept>
+#include <functional>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+#include <set>
+#include <algorithm>
+#include <fstream>
+#include <array>
+#include <chrono>
+#include <thread>
+
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <helper_cuda.h>
+
+#include "linmath.h"
+
+#define WIDTH   800
+#define HEIGHT  600
+
+#define VULKAN_VALIDATION 0
+
+
+const std::vector<const char*> validationLayers = {
+        "VK_LAYER_LUNARG_standard_validation"
+};
+
+#if VULKAN_VALIDATION
+const bool enableValidationLayers = true;
+#else
+const bool enableValidationLayers = false;
+#endif
+
+struct QueueFamilyIndices {
+    int graphicsFamily = -1;
+    int presentFamily = -1;
+
+    bool isComplete() {
+        return graphicsFamily >= 0 && presentFamily >= 0;
+    }
+};
+
+const std::vector<const char*> deviceExtensions = {
+        VK_KHR_SWAPCHAIN_EXTENSION_NAME,
+        VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME,
+        VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME,
+#ifdef _WIN64
+        VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME,
+    VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME,
+#else
+        VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
+        VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME,
+#endif
+};
+
+#ifdef _WIN64
+class WindowsSecurityAttributes {
+protected:
+    SECURITY_ATTRIBUTES m_winSecurityAttributes;
+    PSECURITY_DESCRIPTOR m_winPSecurityDescriptor;
+
+public:
+    WindowsSecurityAttributes();
+    SECURITY_ATTRIBUTES * operator&();
+    ~WindowsSecurityAttributes();
+};
+
+WindowsSecurityAttributes::WindowsSecurityAttributes()
+{
+    m_winPSecurityDescriptor = (PSECURITY_DESCRIPTOR)calloc(1, SECURITY_DESCRIPTOR_MIN_LENGTH + 2 * sizeof(void**));
+    //CHECK_NEQ(m_winPSecurityDescriptor, (PSECURITY_DESCRIPTOR)NULL);
+
+    PSID *ppSID = (PSID *)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH);
+    PACL *ppACL = (PACL *)((PBYTE)ppSID + sizeof(PSID *));
+
+    InitializeSecurityDescriptor(m_winPSecurityDescriptor, SECURITY_DESCRIPTOR_REVISION);
+
+    SID_IDENTIFIER_AUTHORITY sidIdentifierAuthority = SECURITY_WORLD_SID_AUTHORITY;
+    AllocateAndInitializeSid(&sidIdentifierAuthority, 1, SECURITY_WORLD_RID, 0, 0, 0, 0, 0, 0, 0, ppSID);
+
+    EXPLICIT_ACCESS explicitAccess;
+    ZeroMemory(&explicitAccess, sizeof(EXPLICIT_ACCESS));
+    explicitAccess.grfAccessPermissions = STANDARD_RIGHTS_ALL | SPECIFIC_RIGHTS_ALL;
+    explicitAccess.grfAccessMode        = SET_ACCESS;
+    explicitAccess.grfInheritance       = INHERIT_ONLY;
+    explicitAccess.Trustee.TrusteeForm  = TRUSTEE_IS_SID;
+    explicitAccess.Trustee.TrusteeType  = TRUSTEE_IS_WELL_KNOWN_GROUP;
+    explicitAccess.Trustee.ptstrName    = (LPTSTR)*ppSID;
+
+    SetEntriesInAcl(1, &explicitAccess, NULL, ppACL);
+
+    SetSecurityDescriptorDacl(m_winPSecurityDescriptor, TRUE, *ppACL, FALSE);
+
+    m_winSecurityAttributes.nLength              = sizeof(m_winSecurityAttributes);
+    m_winSecurityAttributes.lpSecurityDescriptor = m_winPSecurityDescriptor;
+    m_winSecurityAttributes.bInheritHandle       = TRUE;
+}
+
+SECURITY_ATTRIBUTES *
+WindowsSecurityAttributes::operator&()
+{
+    return &m_winSecurityAttributes;
+}
+
+WindowsSecurityAttributes::~WindowsSecurityAttributes()
+{
+    PSID* ppSID = (PSID*)((PBYTE)m_winPSecurityDescriptor + SECURITY_DESCRIPTOR_MIN_LENGTH);
+    PACL* ppACL = (PACL*)((PBYTE)ppSID + sizeof(PSID*));
+
+    if (*ppSID) {
+        FreeSid(*ppSID);
+    }
+    if (*ppACL) {
+        LocalFree(*ppACL);
+    }
+    free(m_winPSecurityDescriptor);
+}
+#endif
+
+struct UniformBufferObject {
+    mat4x4 model;
+    mat4x4 view;
+    mat4x4 proj;
+};
+
+struct SwapChainSupportDetails {
+    VkSurfaceCapabilitiesKHR capabilities;
+    std::vector<VkSurfaceFormatKHR> formats;
+    std::vector<VkPresentModeKHR> presentModes;
+};
+
+struct Vertex {
+    vec4 pos;
+    vec3 color;
+
+    static VkVertexInputBindingDescription getBindingDescription() {
+        VkVertexInputBindingDescription bindingDescription = {};
+
+        bindingDescription.binding = 0;
+        bindingDescription.stride = sizeof(Vertex);
+        bindingDescription.inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
+
+        return bindingDescription;
+    }
+
+    static std::array<VkVertexInputAttributeDescription, 2> getAttributeDescriptions() {
+        std::array<VkVertexInputAttributeDescription, 2> attributeDescriptions = {};
+        attributeDescriptions[0].binding = 0;
+        attributeDescriptions[0].location = 0;
+        attributeDescriptions[0].format = VK_FORMAT_R32G32B32A32_SFLOAT;
+        attributeDescriptions[0].offset = offsetof(Vertex, pos);
+
+        attributeDescriptions[1].binding = 0;
+        attributeDescriptions[1].location = 1;
+        attributeDescriptions[1].format = VK_FORMAT_R32G32B32_SFLOAT;
+        attributeDescriptions[1].offset = offsetof(Vertex, color);
+        return attributeDescriptions;
+    }
+
+};
+
+size_t mesh_width = 0, mesh_height = 0;
+std::string execution_path;
+
+__global__ void sinewave_gen_kernel(Vertex *vertices, unsigned int width, unsigned int height, float time)
+{
+    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
+    unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
+
+    // calculate uv coordinates
+    float u = x / (float) width;
+    float v = y / (float) height;
+    u = u*2.0f - 1.0f;
+    v = v*2.0f - 1.0f;
+
+    // calculate simple sine wave pattern
+    float freq = 4.0f;
+    float w = sinf(u*freq + time) * cosf(v*freq + time) * 0.5f;
+
+    if (y < height && x < width)
+    {
+        // write output vertex
+        vertices[y*width+x].pos[0] = u;
+        vertices[y*width+x].pos[1] = w;
+        vertices[y*width+x].pos[2] = v;
+        vertices[y*width+x].pos[3] = 1.0f;
+        vertices[y*width+x].color[0] = 1.0f;
+        vertices[y*width+x].color[1] = 0.0f;
+        vertices[y*width+x].color[2] = 0.0f;
+    }
+}
+
+
+class vulkanCudaApp {
+public:
+    void run() {
+        initWindow();
+        initVulkan();
+        initCuda();
+        mainLoop();
+        cleanup();
+    }
+
+private:
+    GLFWwindow* window;
+    VkInstance instance;
+    VkPhysicalDevice physicalDevice = VK_NULL_HANDLE;
+    uint8_t  vkDeviceUUID[VK_UUID_SIZE];
+    VkDevice device;
+    VkQueue graphicsQueue;
+    VkQueue presentQueue;
+    VkSurfaceKHR surface;
+    VkSwapchainKHR swapChain;
+    std::vector<VkImage> swapChainImages;
+    VkFormat swapChainImageFormat;
+    VkExtent2D swapChainExtent;
+    std::vector<VkImageView> swapChainImageViews;
+    VkDescriptorSetLayout descriptorSetLayout;
+    VkDescriptorPool descriptorPool;
+    VkDescriptorSet descriptorSet;
+    VkPipelineLayout pipelineLayout;
+    VkRenderPass renderPass;
+    VkPipeline graphicsPipeline;
+    std::vector<VkFramebuffer> swapChainFramebuffers;
+    VkCommandPool commandPool;
+    VkBuffer vertexBuffer;
+    VkDeviceMemory vertexBufferMemory;
+    VkBuffer uniformBuffer;
+    VkDeviceMemory uniformBufferMemory;
+    std::vector<VkCommandBuffer> commandBuffers;
+    VkSemaphore imageAvailableSemaphore;
+    VkSemaphore renderFinishedSemaphore;
+    VkSemaphore cudaUpdateVkVertexBufSemaphore;
+    VkSemaphore vkUpdateCudaVertexBufSemaphore;
+
+    size_t vertexBufSize = 0;
+    bool startSubmit = 0;
+    double AnimTime=1.0f;
+
+    VkDebugReportCallbackEXT callback;
+
+#ifdef _WIN64
+    PFN_vkGetMemoryWin32HandleKHR  fpGetMemoryWin32HandleKHR;
+    PFN_vkGetSemaphoreWin32HandleKHR fpGetSemaphoreWin32HandleKHR;
+#else
+    PFN_vkGetMemoryFdKHR  fpGetMemoryFdKHR;
+    PFN_vkGetSemaphoreFdKHR fpGetSemaphoreFdKHR;
+#endif
+
+    PFN_vkGetPhysicalDeviceProperties2 fpGetPhysicalDeviceProperties2;
+
+    // CUDA stuff
+    cudaExternalMemory_t cudaExtMemVertexBuffer;
+    cudaExternalSemaphore_t cudaExtCudaUpdateVkVertexBufSemaphore;
+    cudaExternalSemaphore_t cudaExtVkUpdateCudaVertexBufSemaphore;
+    void *cudaDevVertptr = NULL;
+    cudaStream_t streamToRun;
+
+    bool checkValidationLayerSupport() {
+        uint32_t layerCount;
+        vkEnumerateInstanceLayerProperties(&layerCount, nullptr);
+
+        std::vector<VkLayerProperties> availableLayers(layerCount);
+        vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data());
+
+        for (const char* layerName : validationLayers) {
+            bool layerFound = false;
+
+            for (const auto& layerProperties : availableLayers) {
+                if (strcmp(layerName, layerProperties.layerName) == 0) {
+                    layerFound = true;
+                    break;
+                }
+            }
+
+            if (!layerFound) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(
+            VkDebugReportFlagsEXT flags,
+            VkDebugReportObjectTypeEXT objType,
+            uint64_t obj,
+            size_t location,
+            int32_t code,
+            const char* layerPrefix,
+            const char* msg,
+            void* userData) {
+
+        std::cerr << "validation layer: " << msg << std::endl;
+
+        return VK_FALSE;
+    }
+
+    VkResult CreateDebugReportCallbackEXT(VkInstance instance, const VkDebugReportCallbackCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDebugReportCallbackEXT* pCallback) {
+        auto func = (PFN_vkCreateDebugReportCallbackEXT)vkGetInstanceProcAddr(instance, "vkCreateDebugReportCallbackEXT");
+        if (func != nullptr) {
+            return func(instance, pCreateInfo, pAllocator, pCallback);
+        }
+        else {
+            return VK_ERROR_EXTENSION_NOT_PRESENT;
+        }
+    }
+
+    void DestroyDebugReportCallbackEXT(VkInstance instance, VkDebugReportCallbackEXT callback, const VkAllocationCallbacks* pAllocator) {
+        auto func = (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr(instance, "vkDestroyDebugReportCallbackEXT");
+        if (func != nullptr) {
+            func(instance, callback, pAllocator);
+        }
+    }
+
+    void setupDebugCallback() {
+        if (!enableValidationLayers) return;
+
+        VkDebugReportCallbackCreateInfoEXT createInfo = {};
+        createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT;
+        createInfo.flags = VK_DEBUG_REPORT_ERROR_BIT_EXT | VK_DEBUG_REPORT_WARNING_BIT_EXT;
+        createInfo.pfnCallback = debugCallback;
+
+        if (CreateDebugReportCallbackEXT(instance, &createInfo, nullptr, &callback) != VK_SUCCESS) {
+            throw std::runtime_error("failed to set up debug callback!");
+        }
+    }
+    void initWindow() {
+        glfwInit();
+        glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API);
+        glfwWindowHint(GLFW_RESIZABLE, GLFW_FALSE);
+        window = glfwCreateWindow(WIDTH, HEIGHT, "Vulkan-CUDA Interop Sinewave", nullptr, nullptr);
+    }
+
+    void createInstance() {
+
+        if (enableValidationLayers && !checkValidationLayerSupport()) {
+            throw std::runtime_error("validation layers requested, but not available!");
+        }
+
+        VkApplicationInfo appInfo = {};
+        appInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
+        appInfo.pApplicationName = "Vulkan CUDA Sinewave";
+        appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
+        appInfo.pEngineName = "No Engine";
+        appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
+        appInfo.apiVersion = VK_API_VERSION_1_0;
+
+        VkInstanceCreateInfo createInfo = {};
+        createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
+        createInfo.pApplicationInfo = &appInfo;
+
+        uint32_t glfwExtensionCount = 0;
+        const char** glfwExtensions;
+
+        glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount);
+
+        std::vector<const char*> enabledExtensionNameList;
+        enabledExtensionNameList.push_back(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME);
+        enabledExtensionNameList.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME);
+        enabledExtensionNameList.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME);
+
+        for (int i = 0; i < glfwExtensionCount; i++)
+        {
+            enabledExtensionNameList.push_back(glfwExtensions[i]);
+        }
+        if (enableValidationLayers) {
+            enabledExtensionNameList.push_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME);
+            createInfo.enabledLayerCount = static_cast<uint32_t>(validationLayers.size());
+            createInfo.ppEnabledLayerNames = validationLayers.data();
+        }
+        else {
+            createInfo.enabledLayerCount = 0;
+        }
+
+        createInfo.enabledExtensionCount = enabledExtensionNameList.size();
+        createInfo.ppEnabledExtensionNames = enabledExtensionNameList.data();
+
+        if (vkCreateInstance(&createInfo, nullptr, &instance) != VK_SUCCESS) {
+            throw std::runtime_error("failed to create instance!");
+        }
+        else {
+            std::cout << "Instance created successfully!!\n";
+        }
+
+        fpGetPhysicalDeviceProperties2 = (PFN_vkGetPhysicalDeviceProperties2)vkGetInstanceProcAddr(instance, "vkGetPhysicalDeviceProperties2");
+        if (fpGetPhysicalDeviceProperties2 == NULL) {
+            throw std::runtime_error("Vulkan: Proc address for \"vkGetPhysicalDeviceProperties2KHR\" not found.\n");
+        }
+
+#ifdef _WIN64
+        fpGetMemoryWin32HandleKHR = (PFN_vkGetMemoryWin32HandleKHR)vkGetInstanceProcAddr(instance, "vkGetMemoryWin32HandleKHR");
+        if (fpGetMemoryWin32HandleKHR == NULL) {
+            throw std::runtime_error("Vulkan: Proc address for \"vkGetMemoryWin32HandleKHR\" not found.\n");
+        }
+#else
+        fpGetMemoryFdKHR = (PFN_vkGetMemoryFdKHR)vkGetInstanceProcAddr(instance, "vkGetMemoryFdKHR");
+        if (fpGetMemoryFdKHR == NULL) {
+            throw std::runtime_error("Vulkan: Proc address for \"vkGetMemoryFdKHR\" not found.\n");
+        }
+#endif
+    }
+
+    void initVulkan() {
+        createInstance();
+        setupDebugCallback();
+        createSurface();
+        pickPhysicalDevice();
+        createLogicalDevice();
+        getKhrExtensionsFn();
+        createSwapChain();
+        createImageViews();
+        createRenderPass();
+        createDescriptorSetLayout();
+        createGraphicsPipeline();
+        createFramebuffers();
+        createCommandPool();
+        createVertexBuffer();
+        createUniformBuffer();
+        createDescriptorPool();
+        createDescriptorSet();
+        createCommandBuffers();
+        createSyncObjects();
+        createSyncObjectsExt();
+    }
+
+    void initCuda()
+    {
+        setCudaVkDevice();
+        cudaVkImportVertexMem();
+        cudaInitVertexMem();
+        cudaVkImportSemaphore();
+    }
+
+    void createSurface() {
+        if (glfwCreateWindowSurface(instance, window, nullptr, &surface) != VK_SUCCESS) {
+            throw std::runtime_error("failed to create window surface!");
+        }
+    }
+
+    void pickPhysicalDevice() {
+        uint32_t deviceCount = 0;
+
+        vkEnumeratePhysicalDevices(instance, &deviceCount, nullptr);
+
+        if (deviceCount == 0) {
+            throw std::runtime_error("failed to find GPUs with Vulkan support!");
+        }
+
+        std::vector<VkPhysicalDevice> devices(deviceCount);
+        vkEnumeratePhysicalDevices(instance, &deviceCount, devices.data());
+
+        for (const auto& device : devices) {
+            if (isDeviceSuitable(device)) {
+                physicalDevice = device;
+                break;
+            }
+        }
+        if (physicalDevice == VK_NULL_HANDLE) {
+            throw std::runtime_error("failed to find a suitable GPU!");
+        }
+
+        std::cout << "Selected physical device = " << physicalDevice << std::endl;
+
+        VkPhysicalDeviceIDProperties vkPhysicalDeviceIDProperties = {};
+        vkPhysicalDeviceIDProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES;
+        vkPhysicalDeviceIDProperties.pNext = NULL;
+
+        VkPhysicalDeviceProperties2 vkPhysicalDeviceProperties2 = {};
+        vkPhysicalDeviceProperties2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
+        vkPhysicalDeviceProperties2.pNext = &vkPhysicalDeviceIDProperties;
+
+        fpGetPhysicalDeviceProperties2(physicalDevice, &vkPhysicalDeviceProperties2);
+
+        memcpy(vkDeviceUUID, vkPhysicalDeviceIDProperties.deviceUUID, sizeof(vkDeviceUUID));
+    }
+
+    int setCudaVkDevice()
+    {
+        int current_device = 0;
+        int device_count = 0;
+        int devices_prohibited = 0;
+
+        cudaDeviceProp deviceProp;
+        checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+        if (device_count == 0) {
+            fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
+            exit(EXIT_FAILURE);
+        }
+
+        // Find the GPU which is selected by Vulkan
+        while (current_device < device_count) {
+            cudaGetDeviceProperties(&deviceProp, current_device);
+
+            if ((deviceProp.computeMode != cudaComputeModeProhibited)) {
+                // Compare the cuda device UUID with vulkan UUID
+                int ret = memcmp(&deviceProp.uuid, &vkDeviceUUID, VK_UUID_SIZE);
+                if (ret == 0)
+                {
+                    checkCudaErrors(cudaSetDevice(current_device));
+                    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device));
+                    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+                           current_device, deviceProp.name, deviceProp.major,
+                           deviceProp.minor);
+
+                    return current_device;
+                }
+
+            } else {
+                devices_prohibited++;
+            }
+
+            current_device++;
+        }
+
+        if (devices_prohibited == device_count) {
+            fprintf(stderr,
+                    "CUDA error:"
+                    " No Vulkan-CUDA Interop capable GPU found.\n");
+            exit(EXIT_FAILURE);
+        }
+
+        return -1;
+    }
+
+    bool isDeviceSuitable(VkPhysicalDevice device) {
+        QueueFamilyIndices indices = findQueueFamilies(device);
+        bool extensionsSupported = checkDeviceExtensionSupport(device);
+
+        bool swapChainAdequate = false;
+        if (extensionsSupported) {
+            SwapChainSupportDetails swapChainSupport = querySwapChainSupport(device);
+            swapChainAdequate = !swapChainSupport.formats.empty() && !swapChainSupport.presentModes.empty();
+        }
+
+        return indices.isComplete() && extensionsSupported && swapChainAdequate;
+    }
+
+    bool checkDeviceExtensionSupport(VkPhysicalDevice device) {
+        uint32_t extensionCount;
+        vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, nullptr);
+
+        std::vector<VkExtensionProperties> availableExtensions(extensionCount);
+        vkEnumerateDeviceExtensionProperties(device, nullptr, &extensionCount, availableExtensions.data());
+
+        std::set<std::string> requiredExtensions(deviceExtensions.begin(), deviceExtensions.end());
+
+        for (const auto& extension : availableExtensions) {
+            requiredExtensions.erase(extension.extensionName);
+        }
+
+        return requiredExtensions.empty();
+    }
+
+    QueueFamilyIndices findQueueFamilies(VkPhysicalDevice device) {
+        QueueFamilyIndices indices;
+        uint32_t queueFamilyCount = 0;
+        vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, nullptr);
+
+        std::vector<VkQueueFamilyProperties> queueFamilies(queueFamilyCount);
+        vkGetPhysicalDeviceQueueFamilyProperties(device, &queueFamilyCount, queueFamilies.data());
+
+
+        int i = 0;
+        for (const auto& queueFamily : queueFamilies) {
+            if (queueFamily.queueCount > 0 && queueFamily.queueFlags & VK_QUEUE_GRAPHICS_BIT) {
+                indices.graphicsFamily = i;
+            }
+
+            VkBool32 presentSupport = false;
+            vkGetPhysicalDeviceSurfaceSupportKHR(device, i, surface, &presentSupport);
+
+            if (queueFamily.queueCount > 0 && presentSupport) {
+                indices.presentFamily = i;
+            }
+
+            if (indices.isComplete()) {
+                break;
+            }
+            i++;
+        }
+        return indices;
+    }
+
+    SwapChainSupportDetails querySwapChainSupport(VkPhysicalDevice device) {
+        SwapChainSupportDetails details;
+        vkGetPhysicalDeviceSurfaceCapabilitiesKHR(device, surface, &details.capabilities);
+
+        uint32_t formatCount;
+        vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, nullptr);
+
+        if (formatCount != 0) {
+            details.formats.resize(formatCount);
+            vkGetPhysicalDeviceSurfaceFormatsKHR(device, surface, &formatCount, details.formats.data());
+        }
+
+        uint32_t presentModeCount;
+        vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, nullptr);
+
+        if (presentModeCount != 0) {
+            details.presentModes.resize(presentModeCount);
+            vkGetPhysicalDeviceSurfacePresentModesKHR(device, surface, &presentModeCount, details.presentModes.data());
+        }
+
+        return details;
+    }
+
+    VkSurfaceFormatKHR chooseSwapSurfaceFormat(const std::vector<VkSurfaceFormatKHR>& availableFormats) {
+        if (availableFormats.size() == 1 && availableFormats[0].format == VK_FORMAT_UNDEFINED) {
+            return {VK_FORMAT_B8G8R8A8_UNORM, VK_COLOR_SPACE_SRGB_NONLINEAR_KHR};
+        }
+
+        for (const auto& availableFormat : availableFormats) {
+            if (availableFormat.format == VK_FORMAT_B8G8R8A8_UNORM && availableFormat.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) {
+                return availableFormat;
+            }
+        }
+
+        return availableFormats[0];
+    }
+
+    VkPresentModeKHR chooseSwapPresentMode(const std::vector<VkPresentModeKHR> availablePresentModes) {
+        VkPresentModeKHR bestMode = VK_PRESENT_MODE_FIFO_KHR;
+
+        for (const auto& availablePresentMode : availablePresentModes) {
+            if (availablePresentMode == VK_PRESENT_MODE_MAILBOX_KHR) {
+                return availablePresentMode;
+            } else if (availablePresentMode == VK_PRESENT_MODE_IMMEDIATE_KHR) {
+                bestMode = availablePresentMode;
+            }
+        }
+
+        return bestMode;
+    }
+
+    VkExtent2D chooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities) {
+        if (capabilities.currentExtent.width != std::numeric_limits<uint32_t>::max()) {
+            return capabilities.currentExtent;
+        } else {
+            VkExtent2D actualExtent = {WIDTH, HEIGHT};
+
+            actualExtent.width  = std::max(capabilities.minImageExtent.width, std::min(capabilities.maxImageExtent.width, actualExtent.width));
+            actualExtent.height = std::max(capabilities.minImageExtent.height, std::min(capabilities.maxImageExtent.height, actualExtent.height));
+
+            return actualExtent;
+        }
+    }
+
+    void createLogicalDevice() {
+
+        QueueFamilyIndices indices = findQueueFamilies(physicalDevice);
+
+        std::vector<VkDeviceQueueCreateInfo> queueCreateInfos;
+        std::set<int> uniqueQueueFamilies = {indices.graphicsFamily, indices.presentFamily};
+
+        float queuePriority = 1.0f;
+        for (int queueFamily : uniqueQueueFamilies) {
+            VkDeviceQueueCreateInfo queueCreateInfo = {};
+            queueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
+            queueCreateInfo.queueFamilyIndex = queueFamily;
+            queueCreateInfo.queueCount = 1;
+            queueCreateInfo.pQueuePriorities = &queuePriority;
+            queueCreateInfos.push_back(queueCreateInfo);
+        }
+
+        VkPhysicalDeviceFeatures deviceFeatures = {};
+
+        VkDeviceCreateInfo createInfo = {};
+        createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
+
+        createInfo.pQueueCreateInfos = queueCreateInfos.data();
+        createInfo.queueCreateInfoCount = queueCreateInfos.size();
+
+        createInfo.pEnabledFeatures = &deviceFeatures;
+        std::vector<const char*> enabledExtensionNameList;
+
+        for (int i = 0; i < deviceExtensions.size(); i++)
+        {
+            enabledExtensionNameList.push_back(deviceExtensions[i]);
+        }
+        if (enableValidationLayers) {
+            createInfo.enabledLayerCount = static_cast<uint32_t>(validationLayers.size());
+            createInfo.ppEnabledLayerNames = validationLayers.data();
+        }
+        else {
+            createInfo.enabledLayerCount = 0;
+        }
+        createInfo.enabledExtensionCount = static_cast<uint32_t>(enabledExtensionNameList.size());
+        createInfo.ppEnabledExtensionNames = enabledExtensionNameList.data();
+
+        if (vkCreateDevice(physicalDevice, &createInfo, nullptr, &device) != VK_SUCCESS) {
+            throw std::runtime_error("failed to create logical device!");
+        }
+        vkGetDeviceQueue(device, indices.graphicsFamily, 0, &graphicsQueue);
+        vkGetDeviceQueue(device, indices.presentFamily, 0, &presentQueue);
+    }
+
+    void createSwapChain() {
+        SwapChainSupportDetails swapChainSupport = querySwapChainSupport(physicalDevice);
+
+        VkSurfaceFormatKHR surfaceFormat = chooseSwapSurfaceFormat(swapChainSupport.formats);
+        VkPresentModeKHR presentMode = chooseSwapPresentMode(swapChainSupport.presentModes);
+        VkExtent2D extent = chooseSwapExtent(swapChainSupport.capabilities);
+
+        uint32_t imageCount = swapChainSupport.capabilities.minImageCount + 1;
+        if (swapChainSupport.capabilities.maxImageCount > 0 && imageCount > swapChainSupport.capabilities.maxImageCount) {
+            imageCount = swapChainSupport.capabilities.maxImageCount;
+        }
+
+        VkSwapchainCreateInfoKHR createInfo = {};
+        createInfo.sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR;
+        createInfo.surface = surface;
+        createInfo.minImageCount = imageCount;
+        createInfo.imageFormat = surfaceFormat.format;
+        createInfo.imageColorSpace = surfaceFormat.colorSpace;
+        createInfo.imageExtent = extent;
+        createInfo.imageArrayLayers = 1;
+        createInfo.imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
+
+        QueueFamilyIndices indices = findQueueFamilies(physicalDevice);
+        uint32_t queueFamilyIndices[] = {(uint32_t) indices.graphicsFamily, (uint32_t) indices.presentFamily};
+
+        if (indices.graphicsFamily != indices.presentFamily) {
+            createInfo.imageSharingMode = VK_SHARING_MODE_CONCURRENT;
+            createInfo.queueFamilyIndexCount = 2;
+            createInfo.pQueueFamilyIndices = queueFamilyIndices;
+        } else {
+            createInfo.imageSharingMode = VK_SHARING_MODE_EXCLUSIVE;
+            createInfo.queueFamilyIndexCount = 0; // Optional
+            createInfo.pQueueFamilyIndices = nullptr; // Optional
+        }
+
+        createInfo.preTransform = swapChainSupport.capabilities.currentTransform;
+        createInfo.compositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR;
+        createInfo.presentMode = presentMode;
+        createInfo.clipped = VK_TRUE;
+        createInfo.oldSwapchain = VK_NULL_HANDLE;
+
+        if (vkCreateSwapchainKHR(device, &createInfo, nullptr, &swapChain) != VK_SUCCESS) {
+            throw std::runtime_error("failed to create swap chain!");
+        }
+        else
+        {
+            std::cout<< "Swapchain created!!\n";
+        }
+
+        vkGetSwapchainImagesKHR(device, swapChain, &imageCount, nullptr);
+        swapChainImages.resize(imageCount);
+        vkGetSwapchainImagesKHR(device, swapChain, &imageCount, swapChainImages.data());
+
+        swapChainImageFormat = surfaceFormat.format;
+        swapChainExtent = extent;
+    }
+
+    void createImageViews() {
+        swapChainImageViews.resize(swapChainImages.size());
+
+        for (size_t i = 0; i < swapChainImages.size(); i++) {
+            VkImageViewCreateInfo createInfo = {};
+            createInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
+            createInfo.image = swapChainImages[i];
+            createInfo.viewType = VK_IMAGE_VIEW_TYPE_2D;
+            createInfo.format = swapChainImageFormat;
+
+            createInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
+            createInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
+            createInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
+            createInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;
+
+            createInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+            createInfo.subresourceRange.baseMipLevel = 0;
+            createInfo.subresourceRange.levelCount = 1;
+            createInfo.subresourceRange.baseArrayLayer = 0;
+            createInfo.subresourceRange.layerCount = 1;
+
+            if (vkCreateImageView(device, &createInfo, nullptr, &swapChainImageViews[i]) != VK_SUCCESS) {
+                throw std::runtime_error("failed to create image views!");
+            }
+        }
+    }
+
+    void createDescriptorSetLayout() {
+        VkDescriptorSetLayoutBinding uboLayoutBinding = {};
+        uboLayoutBinding.binding = 0;
+        uboLayoutBinding.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
+        uboLayoutBinding.descriptorCount = 1;
+        uboLayoutBinding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT;
+        uboLayoutBinding.pImmutableSamplers = nullptr; // Optional
+
+        VkDescriptorSetLayoutCreateInfo layoutInfo = {};
+        layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
+        layoutInfo.bindingCount = 1;
+        layoutInfo.pBindings = &uboLayoutBinding;
+
+        if (vkCreateDescriptorSetLayout(device, &layoutInfo, nullptr, &descriptorSetLayout) != VK_SUCCESS) {
+            throw std::runtime_error("failed to create descriptor set layout!");
+        }
+
+    }
+
+    void createGraphicsPipeline() {
+        auto vertShaderCode = readFile("shader_sine.vert");
+        auto fragShaderCode = readFile("shader_sine.frag");
+
+        VkShaderModule vertShaderModule;
+        VkShaderModule fragShaderModule;
+
+        vertShaderModule = createShaderModule(vertShaderCode);
+        fragShaderModule = createShaderModule(fragShaderCode);
+
+        VkPipelineShaderStageCreateInfo vertShaderStageInfo = {};
+        vertShaderStageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+        vertShaderStageInfo.stage = VK_SHADER_STAGE_VERTEX_BIT;
+        vertShaderStageInfo.module = vertShaderModule;
+        vertShaderStageInfo.pName = "main";
+
+        VkPipelineShaderStageCreateInfo fragShaderStageInfo = {};
+        fragShaderStageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+        fragShaderStageInfo.stage = VK_SHADER_STAGE_FRAGMENT_BIT;
+        fragShaderStageInfo.module = fragShaderModule;
+        fragShaderStageInfo.pName = "main";
+
+        VkPipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, fragShaderStageInfo};
+
+        VkPipelineVertexInputStateCreateInfo vertexInputInfo = {};
+        vertexInputInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO;
+        auto bindingDescription = Vertex::getBindingDescription();
+        auto attributeDescriptions = Vertex::getAttributeDescriptions();
+        vertexInputInfo.vertexBindingDescriptionCount = 1;
+        vertexInputInfo.pVertexBindingDescriptions = &bindingDescription;
+        vertexInputInfo.vertexAttributeDescriptionCount = static_cast<uint32_t>(attributeDescriptions.size());
+        vertexInputInfo.pVertexAttributeDescriptions = attributeDescriptions.data();
+
+        VkPipelineInputAssemblyStateCreateInfo inputAssembly = {};
+        inputAssembly.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
+        inputAssembly.topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
+        inputAssembly.primitiveRestartEnable = VK_FALSE;
+
+        VkViewport viewport = {};
+        viewport.x = 0.0f;
+        viewport.y = 0.0f;
+        viewport.width = (float) swapChainExtent.width;
+        viewport.height = (float) swapChainExtent.height;
+        viewport.minDepth = 0.0f;
+        viewport.maxDepth = 1.0f;
+
+        VkRect2D scissor = {};
+        scissor.offset = {0, 0};
+        scissor.extent = swapChainExtent;
+
+        VkPipelineViewportStateCreateInfo viewportState = {};
+        viewportState.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO;
+        viewportState.viewportCount = 1;
+        viewportState.pViewports = &viewport;
+        viewportState.scissorCount = 1;
+        viewportState.pScissors = &scissor;
+
+        VkPipelineRasterizationStateCreateInfo rasterizer = {};
+        rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO;
+        rasterizer.depthClampEnable = VK_FALSE;
+        rasterizer.rasterizerDiscardEnable = VK_FALSE;
+        rasterizer.polygonMode = VK_POLYGON_MODE_FILL;
+        rasterizer.lineWidth = 1.0f;
+        rasterizer.cullMode = VK_CULL_MODE_BACK_BIT;
+        rasterizer.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE;
+        rasterizer.depthBiasEnable = VK_FALSE;
+        rasterizer.depthBiasConstantFactor = 0.0f; // Optional
+        rasterizer.depthBiasClamp = 0.0f; // Optional
+        rasterizer.depthBiasSlopeFactor = 0.0f; // Optional
+
+        VkPipelineMultisampleStateCreateInfo multisampling = {};
+        multisampling.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO;
+        multisampling.sampleShadingEnable = VK_FALSE;
+        multisampling.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT;
+        multisampling.minSampleShading = 1.0f; // Optional
+        multisampling.pSampleMask = nullptr; // Optional
+        multisampling.alphaToCoverageEnable = VK_FALSE; // Optional
+        multisampling.alphaToOneEnable = VK_FALSE; // Optional
+
+        VkPipelineColorBlendAttachmentState colorBlendAttachment = {};
+        colorBlendAttachment.colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT;
+        colorBlendAttachment.blendEnable = VK_FALSE;
+        colorBlendAttachment.srcColorBlendFactor = VK_BLEND_FACTOR_ONE; // Optional
+        colorBlendAttachment.dstColorBlendFactor = VK_BLEND_FACTOR_ZERO; // Optional
+        colorBlendAttachment.colorBlendOp = VK_BLEND_OP_ADD; // Optional
+        colorBlendAttachment.srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE; // Optional
+        colorBlendAttachment.dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO; // Optional
+        colorBlendAttachment.alphaBlendOp = VK_BLEND_OP_ADD; // Optional
+
+        VkPipelineColorBlendStateCreateInfo colorBlending = {};
+        colorBlending.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO;
+        colorBlending.logicOpEnable = VK_FALSE;
+        colorBlending.logicOp = VK_LOGIC_OP_COPY; // Optional
+        colorBlending.attachmentCount = 1;
+        colorBlending.pAttachments = &colorBlendAttachment;
+        colorBlending.blendConstants[0] = 0.0f; // Optional
+        colorBlending.blendConstants[1] = 0.0f; // Optional
+        colorBlending.blendConstants[2] = 0.0f; // Optional
+        colorBlending.blendConstants[3] = 0.0f; // Optional
+
+#if 0
+        VkDynamicState dynamicStates[] = {
+            VK_DYNAMIC_STATE_VIEWPORT,
+            VK_DYNAMIC_STATE_LINE_WIDTH
+        };
+
+        VkPipelineDynamicStateCreateInfo dynamicState = {};
+        dynamicState.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO;
+        dynamicState.dynamicStateCount = 2;
+        dynamicState.pDynamicStates = dynamicStates;
+#endif
+        VkPipelineLayoutCreateInfo pipelineLayoutInfo = {};
+        pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
+        pipelineLayoutInfo.setLayoutCount = 1; // Optional
+        pipelineLayoutInfo.pSetLayouts =  &descriptorSetLayout; // Optional
+        pipelineLayoutInfo.pushConstantRangeCount = 0; // Optional
+        pipelineLayoutInfo.pPushConstantRanges = nullptr; // Optional
+
+        if (vkCreatePipelineLayout(device, &pipelineLayoutInfo, nullptr, &pipelineLayout) != VK_SUCCESS) {
+            throw std::runtime_error("failed to create pipeline layout!");
+        }
+
+        VkGraphicsPipelineCreateInfo pipelineInfo = {};
+        pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO;
+        pipelineInfo.stageCount = 2;
+        pipelineInfo.pStages = shaderStages;
+        pipelineInfo.pVertexInputState = &vertexInputInfo;
+        pipelineInfo.pInputAssemblyState = &inputAssembly;
+        pipelineInfo.pViewportState = &viewportState;
+        pipelineInfo.pRasterizationState = &rasterizer;
+        pipelineInfo.pMultisampleState = &multisampling;
+        pipelineInfo.pDepthStencilState = nullptr; // Optional
+        pipelineInfo.pColorBlendState = &colorBlending;
+        pipelineInfo.pDynamicState = nullptr; // Optional
+        pipelineInfo.layout = pipelineLayout;
+        pipelineInfo.renderPass = renderPass;
+        pipelineInfo.subpass = 0;
+        pipelineInfo.basePipelineHandle = VK_NULL_HANDLE; // Optional
+        pipelineInfo.basePipelineIndex = -1; // Optional
+
+        if (vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, &pipelineInfo, nullptr, &graphicsPipeline) != VK_SUCCESS) {
+            throw std::runtime_error("failed to create graphics pipeline!");
+        }
+        else
+        {
+            std::cout << "Pipeline created successfully!!\n";
+        }
+        vkDestroyShaderModule(device, fragShaderModule, nullptr);
+        vkDestroyShaderModule(device, vertShaderModule, nullptr);
+    }
+
+    void createRenderPass() {
+        VkAttachmentDescription colorAttachment = {};
+        colorAttachment.format = swapChainImageFormat;
+        colorAttachment.samples = VK_SAMPLE_COUNT_1_BIT;
+
+        colorAttachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR;
+        colorAttachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE;
+
+        colorAttachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
+        colorAttachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE;
+
+        colorAttachment.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+        colorAttachment.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR;
+
+        VkAttachmentReference colorAttachmentRef = {};
+        colorAttachmentRef.attachment = 0;
+        colorAttachmentRef.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
+
+        VkSubpassDescription subpass = {};
+        subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS;
+
+        subpass.colorAttachmentCount = 1;
+        subpass.pColorAttachments = &colorAttachmentRef;
+
+        VkRenderPassCreateInfo renderPassInfo = {};
+        renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO;
+        renderPassInfo.attachmentCount = 1;
+        renderPassInfo.pAttachments = &colorAttachment;
+        renderPassInfo.subpassCount = 1;
+        renderPassInfo.pSubpasses = &subpass;
+
+        VkSubpassDependency dependency = {};
+        dependency.srcSubpass = VK_SUBPASS_EXTERNAL;
+        dependency.dstSubpass = 0;
+        dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
+        dependency.srcAccessMask = 0;
+        dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
+        dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
+        renderPassInfo.dependencyCount = 1;
+        renderPassInfo.pDependencies = &dependency;
+
+        if (vkCreateRenderPass(device, &renderPassInfo, nullptr, &renderPass) != VK_SUCCESS) {
+            throw std::runtime_error("failed to create render pass!");
+        }
+
+    }
+
+    void createFramebuffers() {
+        swapChainFramebuffers.resize(swapChainImageViews.size());
+
+        for (size_t i = 0; i < swapChainImageViews.size(); i++) {
+            VkImageView attachments[] = {
+                    swapChainImageViews[i]
+            };
+
+            VkFramebufferCreateInfo framebufferInfo = {};
+            framebufferInfo.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO;
+            framebufferInfo.renderPass = renderPass;
+            framebufferInfo.attachmentCount = 1;
+            framebufferInfo.pAttachments = attachments;
+            framebufferInfo.width = swapChainExtent.width;
+            framebufferInfo.height = swapChainExtent.height;
+            framebufferInfo.layers = 1;
+
+            if (vkCreateFramebuffer(device, &framebufferInfo, nullptr, &swapChainFramebuffers[i]) != VK_SUCCESS) {
+                throw std::runtime_error("failed to create framebuffer!");
+            }
+        }
+    }
+
+    void createCommandPool() {
+        QueueFamilyIndices queueFamilyIndices = findQueueFamilies(physicalDevice);
+
+        VkCommandPoolCreateInfo poolInfo = {};
+        poolInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
+        poolInfo.queueFamilyIndex = queueFamilyIndices.graphicsFamily;
+        poolInfo.flags = 0; // Optional
+
+        if (vkCreateCommandPool(device, &poolInfo, nullptr, &commandPool) != VK_SUCCESS) {
+            throw std::runtime_error("failed to create command pool!");
+        }
+    }
+
+    void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage,
+                      VkMemoryPropertyFlags properties, VkBuffer& buffer,
+                      VkDeviceMemory& bufferMemory) {
+        VkBufferCreateInfo bufferInfo = {};
+        bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+        bufferInfo.size = size;
+        bufferInfo.usage = usage;
+        bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+
+        if (vkCreateBuffer(device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) {
+            throw std::runtime_error("failed to create buffer!");
+        }
+
+        VkMemoryRequirements memRequirements;
+        vkGetBufferMemoryRequirements(device, buffer, &memRequirements);
+
+        VkMemoryAllocateInfo allocInfo = {};
+        allocInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+        allocInfo.allocationSize = memRequirements.size;
+        allocInfo.memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits, properties);
+
+        if (vkAllocateMemory(device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) {
+            throw std::runtime_error("failed to allocate buffer memory!");
+        }
+
+        vkBindBufferMemory(device, buffer, bufferMemory, 0);
+    }
+
+    void createBufferExtMem(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties,
+                            VkExternalMemoryHandleTypeFlagsKHR extMemHandleType,
+                            VkBuffer& buffer, VkDeviceMemory& bufferMemory)
+    {
+        VkBufferCreateInfo bufferInfo = {};
+        bufferInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+        bufferInfo.size = size;
+        bufferInfo.usage = usage;
+        bufferInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+
+        if (vkCreateBuffer(device, &bufferInfo, nullptr, &buffer) != VK_SUCCESS) {
+            throw std::runtime_error("failed to create buffer!");
+        }
+
+        VkMemoryRequirements memRequirements;
+        vkGetBufferMemoryRequirements(device, buffer, &memRequirements);
+
+#ifdef _WIN64
+        WindowsSecurityAttributes winSecurityAttributes;
+
+        VkExportMemoryWin32HandleInfoKHR vulkanExportMemoryWin32HandleInfoKHR = {};
+        vulkanExportMemoryWin32HandleInfoKHR.sType       = VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR;
+        vulkanExportMemoryWin32HandleInfoKHR.pNext       = NULL;
+        vulkanExportMemoryWin32HandleInfoKHR.pAttributes = &winSecurityAttributes;
+        vulkanExportMemoryWin32HandleInfoKHR.dwAccess    = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE;
+        vulkanExportMemoryWin32HandleInfoKHR.name        = (LPCWSTR)NULL;
+#endif
+        VkExportMemoryAllocateInfoKHR vulkanExportMemoryAllocateInfoKHR = {};
+        vulkanExportMemoryAllocateInfoKHR.sType       = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO_KHR;
+#ifdef _WIN64
+        vulkanExportMemoryAllocateInfoKHR.pNext = extMemHandleType & VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR ? &vulkanExportMemoryWin32HandleInfoKHR : NULL;
+        vulkanExportMemoryAllocateInfoKHR.handleTypes = extMemHandleType;
+#else
+        vulkanExportMemoryAllocateInfoKHR.pNext       = NULL;
+        vulkanExportMemoryAllocateInfoKHR.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT;
+#endif
+        VkMemoryAllocateInfo allocInfo = {};
+        allocInfo.sType           = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+        allocInfo.pNext           = &vulkanExportMemoryAllocateInfoKHR;
+        allocInfo.allocationSize  = memRequirements.size;
+        allocInfo.memoryTypeIndex = findMemoryType(memRequirements.memoryTypeBits, properties);
+
+        if (vkAllocateMemory(device, &allocInfo, nullptr, &bufferMemory) != VK_SUCCESS) {
+            throw std::runtime_error("failed to allocate external buffer memory!");
+        }
+
+        vkBindBufferMemory(device, buffer, bufferMemory, 0);
+    }
+
+    void createVertexBuffer() {
+        mesh_width = swapChainExtent.width / 2;
+        mesh_height = swapChainExtent.height / 2;
+        vertexBufSize = mesh_height*mesh_width;
+
+        VkDeviceSize bufferSize = sizeof(Vertex) * vertexBufSize;
+#ifdef _WIN64
+        if (IsWindows8OrGreater())
+        {
+            createBufferExtMem(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT,
+                vertexBuffer, vertexBufferMemory);
+        }
+        else
+        {
+            createBufferExtMem(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT,
+                vertexBuffer, vertexBufferMemory);
+        }
+#else
+        createBufferExtMem(bufferSize, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
+                           vertexBuffer, vertexBufferMemory);
+#endif
+    }
+
+    void cudaInitVertexMem()
+    {
+        checkCudaErrors(cudaStreamCreate(&streamToRun));
+
+        dim3 block(16, 16, 1);
+        dim3 grid(mesh_width/16, mesh_height/16, 1);
+        Vertex *vertices = (Vertex*) cudaDevVertptr;
+        sinewave_gen_kernel<<<grid, block, 0, streamToRun>>>(vertices, mesh_width, mesh_height, 1.0);
+        checkCudaErrors(cudaStreamSynchronize(streamToRun));
+    }
+
+    void createUniformBuffer() {
+        VkDeviceSize bufferSize = sizeof(UniformBufferObject);
+        createBuffer(bufferSize, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, uniformBuffer, uniformBufferMemory);
+    }
+
+    uint32_t findMemoryType(uint32_t typeFilter, VkMemoryPropertyFlags properties) {
+        VkPhysicalDeviceMemoryProperties memProperties;
+        vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memProperties);
+
+        for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) {
+            if (typeFilter & (1 << i) && (memProperties.memoryTypes[i].propertyFlags & properties) == properties) {
+                return i;
+            }
+        }
+
+        throw std::runtime_error("failed to find suitable memory type!");
+    }
+
+    void getKhrExtensionsFn()
+    {
+#ifdef _WIN64
+
+        fpGetSemaphoreWin32HandleKHR = (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr(device, "vkGetSemaphoreWin32HandleKHR");
+        if (fpGetSemaphoreWin32HandleKHR == NULL) {
+            throw std::runtime_error("Vulkan: Proc address for \"vkGetSemaphoreWin32HandleKHR\" not found.\n");
+        }
+#else
+        fpGetSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr(device, "vkGetSemaphoreFdKHR");
+        if (fpGetSemaphoreFdKHR == NULL) {
+            throw std::runtime_error("Vulkan: Proc address for \"vkGetSemaphoreFdKHR\" not found.\n");
+        }
+#endif
+    }
+
+    void createCommandBuffers() {
+        commandBuffers.resize(swapChainFramebuffers.size());
+
+        VkCommandBufferAllocateInfo allocInfo = {};
+        allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+        allocInfo.commandPool = commandPool;
+        allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+        allocInfo.commandBufferCount = (uint32_t) commandBuffers.size();
+
+        if (vkAllocateCommandBuffers(device, &allocInfo, commandBuffers.data()) != VK_SUCCESS) {
+            throw std::runtime_error("failed to allocate command buffers!");
+        }
+
+        for (size_t i = 0; i < commandBuffers.size(); i++) {
+            VkCommandBufferBeginInfo beginInfo = {};
+            beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+            beginInfo.flags = VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
+            beginInfo.pInheritanceInfo = nullptr; // Optional
+
+            if (vkBeginCommandBuffer(commandBuffers[i], &beginInfo) != VK_SUCCESS) {
+                throw std::runtime_error("failed to begin recording command buffer!");
+            }
+
+            VkRenderPassBeginInfo renderPassInfo = {};
+            renderPassInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
+            renderPassInfo.renderPass = renderPass;
+            renderPassInfo.framebuffer = swapChainFramebuffers[i];
+            renderPassInfo.renderArea.offset = {0, 0};
+            renderPassInfo.renderArea.extent = swapChainExtent;
+
+            VkClearValue clearColor = {0.0f, 0.0f, 0.0f, 1.0f};
+            renderPassInfo.clearValueCount = 1;
+            renderPassInfo.pClearValues = &clearColor;
+
+            vkCmdBeginRenderPass(commandBuffers[i], &renderPassInfo, VK_SUBPASS_CONTENTS_INLINE);
+            vkCmdBindPipeline(commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, graphicsPipeline);
+            VkBuffer vertexBuffers[] = {vertexBuffer};
+            VkDeviceSize offsets[] = {0};
+            vkCmdBindVertexBuffers(commandBuffers[i], 0, 1, vertexBuffers, offsets);
+            vkCmdBindDescriptorSets(commandBuffers[i], VK_PIPELINE_BIND_POINT_GRAPHICS, pipelineLayout, 0, 1, &descriptorSet, 0, nullptr);
+            vkCmdDraw(commandBuffers[i], static_cast<uint32_t>(vertexBufSize), 1, 0, 0);
+            vkCmdEndRenderPass(commandBuffers[i]);
+            if (vkEndCommandBuffer(commandBuffers[i]) != VK_SUCCESS) {
+                throw std::runtime_error("failed to record command buffer!");
+            }
+        }
+
+    }
+
+    VkShaderModule createShaderModule(const std::vector<char>& code) {
+        VkShaderModuleCreateInfo createInfo = {};
+        createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
+        createInfo.codeSize = code.size();
+        createInfo.pCode = reinterpret_cast<const uint32_t*>(code.data());
+
+        VkShaderModule shaderModule;
+        if (vkCreateShaderModule(device, &createInfo, nullptr, &shaderModule) != VK_SUCCESS) {
+            throw std::runtime_error("failed to create shader module!");
+        }
+
+        return shaderModule;
+    }
+
+    static std::vector<char> readFile(const std::string& filename) {
+        char* file_path = sdkFindFilePath(filename.c_str(), execution_path.c_str());
+
+        std::ifstream file(file_path, std::ios::ate | std::ios::binary);
+
+        if (!file.is_open()) {
+            throw std::runtime_error("failed to open shader spv file!\n");
+        }
+        size_t fileSize = (size_t) file.tellg();
+        std::vector<char> buffer(fileSize);
+        file.seekg(0);
+        file.read(buffer.data(), fileSize);
+        file.close();
+
+        return buffer;
+    }
+
+    void mainLoop() {
+
+        updateUniformBuffer();
+
+        while (!glfwWindowShouldClose(window)) {
+            glfwPollEvents();
+            drawFrame();
+        }
+
+        vkDeviceWaitIdle(device);
+    }
+
+    void updateUniformBuffer() {
+
+        UniformBufferObject ubo = {};
+
+        mat4x4_identity(ubo.model);
+        mat4x4 Model;
+        mat4x4_dup(Model, ubo.model);
+        mat4x4_rotate(ubo.model, Model, 1.0f, 0.0f, 1.0f, degreesToRadians(45.0f));
+
+        vec3 eye = {2.0f, 2.0f, 2.0f};
+        vec3 center = {0.0f, 0.0f, 0.0f};
+        vec3 up = {0.0f, 0.0f, 1.0f};
+        mat4x4_look_at(ubo.view, eye, center, up);
+        mat4x4_perspective(ubo.proj, degreesToRadians(45.0f), swapChainExtent.width / (float) swapChainExtent.height, 0.1f, 10.0f);
+        ubo.proj[1][1] *= -1;
+        void* data;
+        vkMapMemory(device, uniformBufferMemory, 0, sizeof(ubo), 0, &data);
+        memcpy(data, &ubo, sizeof(ubo));
+        vkUnmapMemory(device, uniformBufferMemory);
+    }
+
+    void createDescriptorPool() {
+        VkDescriptorPoolSize poolSize = {};
+        poolSize.type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
+        poolSize.descriptorCount = 1;
+
+        VkDescriptorPoolCreateInfo poolInfo = {};
+        poolInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
+        poolInfo.poolSizeCount = 1;
+        poolInfo.pPoolSizes = &poolSize;
+        poolInfo.maxSets = 1;
+
+        if (vkCreateDescriptorPool(device, &poolInfo, nullptr, &descriptorPool) != VK_SUCCESS) {
+            throw std::runtime_error("failed to create descriptor pool!");
+        }
+    }
+
+    void createDescriptorSet() {
+        VkDescriptorSetLayout layouts[] = {descriptorSetLayout};
+        VkDescriptorSetAllocateInfo allocInfo = {};
+        allocInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
+        allocInfo.descriptorPool = descriptorPool;
+        allocInfo.descriptorSetCount = 1;
+        allocInfo.pSetLayouts = layouts;
+
+        if (vkAllocateDescriptorSets(device, &allocInfo, &descriptorSet) != VK_SUCCESS) {
+            throw std::runtime_error("failed to allocate descriptor set!");
+        }
+
+        VkDescriptorBufferInfo bufferInfo = {};
+        bufferInfo.buffer = uniformBuffer;
+        bufferInfo.offset = 0;
+        bufferInfo.range = sizeof(UniformBufferObject);
+
+        VkWriteDescriptorSet descriptorWrite = {};
+        descriptorWrite.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+        descriptorWrite.dstSet = descriptorSet;
+        descriptorWrite.dstBinding = 0;
+        descriptorWrite.dstArrayElement = 0;
+        descriptorWrite.descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER;
+        descriptorWrite.descriptorCount = 1;
+        descriptorWrite.pBufferInfo = &bufferInfo;
+        descriptorWrite.pImageInfo = nullptr; // Optional
+        descriptorWrite.pTexelBufferView = nullptr; // Optional
+
+        vkUpdateDescriptorSets(device, 1, &descriptorWrite, 0, nullptr);
+    }
+
+    void drawFrame() {
+
+        uint32_t imageIndex;
+        vkAcquireNextImageKHR(device, swapChain, std::numeric_limits<uint64_t>::max(), imageAvailableSemaphore, VK_NULL_HANDLE, &imageIndex);
+
+        if (!startSubmit)
+        {
+            submitVulkan(imageIndex);
+            startSubmit = 1;
+        }
+        else
+        {
+            submitVulkanCuda(imageIndex);
+        }
+
+        VkPresentInfoKHR presentInfo = {};
+        presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
+
+        VkSemaphore signalSemaphores[] = {renderFinishedSemaphore};
+
+        presentInfo.waitSemaphoreCount = 1;
+        presentInfo.pWaitSemaphores = signalSemaphores;
+
+        VkSwapchainKHR swapChains[] = {swapChain};
+        presentInfo.swapchainCount = 1;
+        presentInfo.pSwapchains = swapChains;
+        presentInfo.pImageIndices = &imageIndex;
+        presentInfo.pResults = nullptr; // Optional
+
+        vkQueuePresentKHR(presentQueue, &presentInfo);
+
+        cudaUpdateVertexBuffer();
+        // Added sleep of 5 millisecs so that CPU does not submit too much work to GPU
+        std::this_thread::sleep_for(std::chrono::microseconds(5000));
+    }
+
+    void submitVulkan(uint32_t imageIndex)
+    {
+        VkSubmitInfo submitInfo = {};
+        submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+
+        VkSemaphore waitSemaphores[] = {imageAvailableSemaphore};
+        VkPipelineStageFlags waitStages[] = { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT };
+        submitInfo.waitSemaphoreCount = 1;
+        submitInfo.pWaitSemaphores = waitSemaphores;
+        submitInfo.pWaitDstStageMask = waitStages;
+        submitInfo.commandBufferCount = 1;
+        submitInfo.pCommandBuffers = &commandBuffers[imageIndex];
+
+        VkSemaphore signalSemaphores[] = {renderFinishedSemaphore, vkUpdateCudaVertexBufSemaphore};
+
+        submitInfo.signalSemaphoreCount = 2;
+        submitInfo.pSignalSemaphores = signalSemaphores;
+
+        if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE) != VK_SUCCESS) {
+            throw std::runtime_error("failed to submit draw command buffer!");
+        }
+    }
+
+    void submitVulkanCuda(uint32_t imageIndex)
+    {
+        VkSubmitInfo submitInfo = {};
+        submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+
+        VkSemaphore waitSemaphores[] = {imageAvailableSemaphore, cudaUpdateVkVertexBufSemaphore};
+        VkPipelineStageFlags waitStages[] = { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT};
+        submitInfo.waitSemaphoreCount = 2;
+        submitInfo.pWaitSemaphores = waitSemaphores;
+        submitInfo.pWaitDstStageMask = waitStages;
+        submitInfo.commandBufferCount = 1;
+        submitInfo.pCommandBuffers = &commandBuffers[imageIndex];
+
+        VkSemaphore signalSemaphores[] = {renderFinishedSemaphore, vkUpdateCudaVertexBufSemaphore};
+
+        submitInfo.signalSemaphoreCount = 2;
+        submitInfo.pSignalSemaphores = signalSemaphores;
+
+        if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE) != VK_SUCCESS) {
+            throw std::runtime_error("failed to submit draw command buffer!");
+        }
+    }
+
+    void createSyncObjects() {
+
+        VkSemaphoreCreateInfo semaphoreInfo = {};
+        semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+
+        if (vkCreateSemaphore(device, &semaphoreInfo, nullptr, &imageAvailableSemaphore) != VK_SUCCESS ||
+            vkCreateSemaphore(device, &semaphoreInfo, nullptr, &renderFinishedSemaphore) != VK_SUCCESS) {
+
+            throw std::runtime_error("failed to create synchronization objects for a frame!");
+        }
+    }
+
+    void createSyncObjectsExt() {
+
+        VkSemaphoreCreateInfo semaphoreInfo = {};
+        semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+
+        memset(&semaphoreInfo, 0, sizeof(semaphoreInfo));
+        semaphoreInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+
+#ifdef _WIN64
+        WindowsSecurityAttributes winSecurityAttributes;
+
+        VkExportSemaphoreWin32HandleInfoKHR vulkanExportSemaphoreWin32HandleInfoKHR = {};
+        vulkanExportSemaphoreWin32HandleInfoKHR.sType       = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR;
+        vulkanExportSemaphoreWin32HandleInfoKHR.pNext       = NULL;
+        vulkanExportSemaphoreWin32HandleInfoKHR.pAttributes = &winSecurityAttributes;
+        vulkanExportSemaphoreWin32HandleInfoKHR.dwAccess    = DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE;
+        vulkanExportSemaphoreWin32HandleInfoKHR.name        = (LPCWSTR)NULL;
+#endif
+        VkExportSemaphoreCreateInfoKHR vulkanExportSemaphoreCreateInfo = {};
+        vulkanExportSemaphoreCreateInfo.sType       = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO_KHR;
+#ifdef _WIN64
+        vulkanExportSemaphoreCreateInfo.pNext       = IsWindows8OrGreater() ? &vulkanExportSemaphoreWin32HandleInfoKHR : NULL;
+        vulkanExportSemaphoreCreateInfo.handleTypes = IsWindows8OrGreater() ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT;
+#else
+        vulkanExportSemaphoreCreateInfo.pNext       = NULL;
+        vulkanExportSemaphoreCreateInfo.handleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT;
+#endif
+        semaphoreInfo.pNext = &vulkanExportSemaphoreCreateInfo;
+
+        if (vkCreateSemaphore(device, &semaphoreInfo, nullptr, &cudaUpdateVkVertexBufSemaphore) != VK_SUCCESS ||
+            vkCreateSemaphore(device, &semaphoreInfo, nullptr, &vkUpdateCudaVertexBufSemaphore) != VK_SUCCESS )
+        {
+            throw std::runtime_error("failed to create synchronization objects for a CUDA-Vulkan!");
+        }
+    }
+
+    void cudaVkImportVertexMem()
+    {
+        cudaExternalMemoryHandleDesc cudaExtMemHandleDesc;
+        memset(&cudaExtMemHandleDesc, 0, sizeof(cudaExtMemHandleDesc));
+#ifdef _WIN64
+        cudaExtMemHandleDesc.type = IsWindows8OrGreater() ? cudaExternalMemoryHandleTypeOpaqueWin32 : cudaExternalMemoryHandleTypeOpaqueWin32Kmt;
+        cudaExtMemHandleDesc.handle.win32.handle = getVkMemHandle(IsWindows8OrGreater() ? VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT : VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT);
+#else
+        cudaExtMemHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueFd;
+        cudaExtMemHandleDesc.handle.fd = getVkMemHandle(VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT);
+#endif
+        cudaExtMemHandleDesc.size = sizeof(Vertex) * vertexBufSize;
+
+        checkCudaErrors(cudaImportExternalMemory(&cudaExtMemVertexBuffer, &cudaExtMemHandleDesc));
+
+        cudaExternalMemoryBufferDesc cudaExtBufferDesc;
+        cudaExtBufferDesc.offset = 0;
+        cudaExtBufferDesc.size = sizeof(Vertex) * vertexBufSize;
+        cudaExtBufferDesc.flags = 0;
+
+        checkCudaErrors(cudaExternalMemoryGetMappedBuffer(&cudaDevVertptr, cudaExtMemVertexBuffer, &cudaExtBufferDesc));
+        printf("CUDA Imported Vulkan vertex buffer\n");
+    }
+
+    void cudaVkImportSemaphore()
+    {
+        cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc;
+        memset(&externalSemaphoreHandleDesc, 0, sizeof(externalSemaphoreHandleDesc));
+#ifdef _WIN64
+        externalSemaphoreHandleDesc.type = IsWindows8OrGreater() ? cudaExternalSemaphoreHandleTypeOpaqueWin32 : cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt;
+        externalSemaphoreHandleDesc.handle.win32.handle = getVkSemaphoreHandle(IsWindows8OrGreater() ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT,
+                                                                                cudaUpdateVkVertexBufSemaphore);
+#else
+        externalSemaphoreHandleDesc.type      = cudaExternalSemaphoreHandleTypeOpaqueFd;
+        externalSemaphoreHandleDesc.handle.fd = getVkSemaphoreHandle(VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT, cudaUpdateVkVertexBufSemaphore);
+#endif
+        externalSemaphoreHandleDesc.flags = 0;
+
+        checkCudaErrors(cudaImportExternalSemaphore(&cudaExtCudaUpdateVkVertexBufSemaphore, &externalSemaphoreHandleDesc));
+
+        memset(&externalSemaphoreHandleDesc, 0, sizeof(externalSemaphoreHandleDesc));
+#ifdef _WIN64
+        externalSemaphoreHandleDesc.type = IsWindows8OrGreater() ? cudaExternalSemaphoreHandleTypeOpaqueWin32 : cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt;;
+        externalSemaphoreHandleDesc.handle.win32.handle = getVkSemaphoreHandle(IsWindows8OrGreater() ? VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT : VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT,
+                                                                                vkUpdateCudaVertexBufSemaphore);
+#else
+        externalSemaphoreHandleDesc.type      = cudaExternalSemaphoreHandleTypeOpaqueFd;
+        externalSemaphoreHandleDesc.handle.fd = getVkSemaphoreHandle(VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT, vkUpdateCudaVertexBufSemaphore);
+#endif
+        externalSemaphoreHandleDesc.flags = 0;
+        checkCudaErrors(cudaImportExternalSemaphore(&cudaExtVkUpdateCudaVertexBufSemaphore, &externalSemaphoreHandleDesc));
+        printf("CUDA Imported Vulkan semaphore\n");
+    }
+
+#ifdef _WIN64 // For windows
+    HANDLE getVkMemHandle(VkExternalMemoryHandleTypeFlagsKHR externalMemoryHandleType)
+    {
+        HANDLE handle;
+
+        VkMemoryGetWin32HandleInfoKHR vkMemoryGetWin32HandleInfoKHR = {};
+        vkMemoryGetWin32HandleInfoKHR.sType      = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR;
+        vkMemoryGetWin32HandleInfoKHR.pNext      = NULL;
+        vkMemoryGetWin32HandleInfoKHR.memory     = vertexBufferMemory;
+        vkMemoryGetWin32HandleInfoKHR.handleType = (VkExternalMemoryHandleTypeFlagBitsKHR)externalMemoryHandleType;
+
+        fpGetMemoryWin32HandleKHR(device, &vkMemoryGetWin32HandleInfoKHR, &handle);
+        return handle;
+    }
+#else
+    int getVkMemHandle(VkExternalMemoryHandleTypeFlagsKHR externalMemoryHandleType)
+    {
+        if (externalMemoryHandleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT) {
+            int fd;
+
+            VkMemoryGetFdInfoKHR vkMemoryGetFdInfoKHR = {};
+            vkMemoryGetFdInfoKHR.sType      = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR;
+            vkMemoryGetFdInfoKHR.pNext      = NULL;
+            vkMemoryGetFdInfoKHR.memory     = vertexBufferMemory;
+            vkMemoryGetFdInfoKHR.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+
+            fpGetMemoryFdKHR(device, &vkMemoryGetFdInfoKHR, &fd);
+
+            return fd;
+        }
+        return -1;
+    }
+#endif
+
+
+#ifdef _WIN64
+    HANDLE getVkSemaphoreHandle(VkExternalSemaphoreHandleTypeFlagBitsKHR externalSemaphoreHandleType, VkSemaphore &semVkCuda)
+    {
+        HANDLE handle;
+
+        VkSemaphoreGetWin32HandleInfoKHR vulkanSemaphoreGetWin32HandleInfoKHR = {};
+        vulkanSemaphoreGetWin32HandleInfoKHR.sType      = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR;
+        vulkanSemaphoreGetWin32HandleInfoKHR.pNext      = NULL;
+        vulkanSemaphoreGetWin32HandleInfoKHR.semaphore  = semVkCuda;
+        vulkanSemaphoreGetWin32HandleInfoKHR.handleType = externalSemaphoreHandleType;
+
+        fpGetSemaphoreWin32HandleKHR(device, &vulkanSemaphoreGetWin32HandleInfoKHR, &handle);
+
+        return handle;
+    }
+#else
+    int getVkSemaphoreHandle(VkExternalSemaphoreHandleTypeFlagBitsKHR externalSemaphoreHandleType, VkSemaphore &semVkCuda)
+    {
+        if (externalSemaphoreHandleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) {
+            int fd;
+
+            VkSemaphoreGetFdInfoKHR vulkanSemaphoreGetFdInfoKHR = {};
+            vulkanSemaphoreGetFdInfoKHR.sType      = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR;
+            vulkanSemaphoreGetFdInfoKHR.pNext      = NULL;
+            vulkanSemaphoreGetFdInfoKHR.semaphore  = semVkCuda;
+            vulkanSemaphoreGetFdInfoKHR.handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+
+            fpGetSemaphoreFdKHR(device, &vulkanSemaphoreGetFdInfoKHR, &fd);
+
+            return fd;
+        }
+        return -1;
+    }
+#endif
+
+    void cudaVkSemaphoreSignal(cudaExternalSemaphore_t &extSemaphore)
+    {
+        cudaExternalSemaphoreSignalParams extSemaphoreSignalParams;
+        memset(&extSemaphoreSignalParams, 0, sizeof(extSemaphoreSignalParams));
+
+        extSemaphoreSignalParams.params.fence.value = 0;
+        extSemaphoreSignalParams.flags              = 0;
+        checkCudaErrors(cudaSignalExternalSemaphoresAsync(&extSemaphore, &extSemaphoreSignalParams, 1, streamToRun));
+    }
+
+    void cudaVkSemaphoreWait(cudaExternalSemaphore_t &extSemaphore)
+    {
+        cudaExternalSemaphoreWaitParams extSemaphoreWaitParams;
+
+        memset(&extSemaphoreWaitParams, 0, sizeof(extSemaphoreWaitParams));
+
+        extSemaphoreWaitParams.params.fence.value = 0;
+        extSemaphoreWaitParams.flags              = 0;
+
+        checkCudaErrors(cudaWaitExternalSemaphoresAsync(&extSemaphore, &extSemaphoreWaitParams, 1, streamToRun));
+    }
+
+    void cudaUpdateVertexBuffer()
+    {
+        cudaVkSemaphoreWait(cudaExtVkUpdateCudaVertexBufSemaphore);
+
+        dim3 block(16, 16, 1);
+        dim3 grid(mesh_width/block.x, mesh_height/block.y, 1);
+        Vertex *pos = (Vertex*) cudaDevVertptr;
+        AnimTime += 0.01f;
+        sinewave_gen_kernel<<<grid, block, 0, streamToRun>>>(pos, mesh_width, mesh_height, AnimTime);
+        cudaVkSemaphoreSignal(cudaExtCudaUpdateVkVertexBufSemaphore);
+    }
+
+    void cleanup() {
+        if (enableValidationLayers) {
+            DestroyDebugReportCallbackEXT(instance, callback, nullptr);
+        }
+
+        vkDestroySemaphore(device, renderFinishedSemaphore, nullptr);
+        vkDestroySemaphore(device, imageAvailableSemaphore, nullptr);
+        checkCudaErrors(cudaDestroyExternalSemaphore(cudaExtCudaUpdateVkVertexBufSemaphore));
+        vkDestroySemaphore(device, cudaUpdateVkVertexBufSemaphore, nullptr);
+        checkCudaErrors(cudaDestroyExternalSemaphore(cudaExtVkUpdateCudaVertexBufSemaphore));
+        vkDestroySemaphore(device, vkUpdateCudaVertexBufSemaphore, nullptr);
+
+        vkDestroyCommandPool(device, commandPool, nullptr);
+        for (auto framebuffer : swapChainFramebuffers) {
+            vkDestroyFramebuffer(device, framebuffer, nullptr);
+        }
+        for (auto imageView : swapChainImageViews) {
+            vkDestroyImageView(device, imageView, nullptr);
+        }
+        vkDestroyPipeline(device, graphicsPipeline, nullptr);
+        vkDestroyPipelineLayout(device, pipelineLayout, nullptr);
+        vkDestroyDescriptorSetLayout(device, descriptorSetLayout, nullptr);
+        vkDestroyBuffer(device, uniformBuffer, nullptr);
+        vkFreeMemory(device, uniformBufferMemory, nullptr);
+        vkDestroyRenderPass(device, renderPass, nullptr);
+        vkDestroySwapchainKHR(device, swapChain, nullptr);
+        checkCudaErrors(cudaDestroyExternalMemory(cudaExtMemVertexBuffer));
+        vkDestroyBuffer(device, vertexBuffer, nullptr);
+        vkFreeMemory(device, vertexBufferMemory, nullptr);
+        vkDestroyDescriptorPool(device, descriptorPool, nullptr);
+        vkDestroyDevice(device, nullptr);
+        vkDestroySurfaceKHR(instance, surface, nullptr);
+        vkDestroyInstance(instance, nullptr);
+        glfwDestroyWindow(window);
+        glfwTerminate();
+    }
+};
+
+int main(int argc, char* argv[]) {
+    execution_path = argv[0];
+    vulkanCudaApp app;
+
+    try {
+        app.run();
+    } catch (const std::runtime_error& e) {
+        std::cerr << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
\ No newline at end of file