From 0e8fcea1bdfe43e541b1dacd4af699de080a8853 Mon Sep 17 00:00:00 2001
From: kwoodle <woodlek@optonline.net>
Date: Fri, 23 Apr 2021 22:44:26 -0400
Subject: [PATCH] Fixed CMakeLists.txt so you don't have to change CLion Build
 Settings.

---
 CMakeLists.txt  |  43 +++--
 contraction.cu  | 310 ++++++++++++++++++++++++++++++++++++
 imageResize.cpp | 405 ++++++++++++++++++++++++++++++++++++++++++++++++
 imageResize.h   | 234 ++++++++++++++++++++++++++++
 reduction.cu    | 229 +++++++++++++++++++++++++++
 5 files changed, 1205 insertions(+), 16 deletions(-)
 create mode 100644 contraction.cu
 create mode 100644 imageResize.cpp
 create mode 100644 imageResize.h
 create mode 100644 reduction.cu
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7626076..4461704 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,25 +1,19 @@
-# NO LONGER NEEDS THESE SETTINGS?
-# Needs CUDAHOSTCXX=/usr/bin/cuda-g++;CUDA_LIB_PATH=/usr/local/cuda/lib64 in
-# CLion Settings | Build, Execution, Deployment | CMake > Environment on fedora
-# and CUDAHOSTCXX=/usr/bin/g++-8;CUDA_LIB_PATH=/usr/lib/cuda/lib64 on pop_os
+# Changes for cmake 3.18
+cmake_minimum_required(VERSION 3.18)
+#list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 
-cmake_minimum_required(VERSION 3.16)
-set(CMAKE_CUDA_HOST_COMPILER /usr/bin/g++-8)
 project(CudaSamples LANGUAGES CXX CUDA)
-#set(ENV{MPI_HOME} /usr/local )
 set(CMAKE_CUDA_STANDARD 14)
-set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-arch=sm_75")
+
 set(cuda_include_dirs = ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
 set(cuda_samples_include = "${cuda_include_dirs}/../../../samples/common/inc/")
+message(STATUS "cuda_include_dirs = ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
 include_directories(${cuda_samples_include})
 include_directories(${cuda_include_dirs})
 set(CUDA_VERBOSE_BUILD ON)
-set(CUDA_SEPARABLE_COMPILATION ON) # Allow multiple CUDA files compilation
 
 add_executable(asyncAPI asyncAPI.cu)
-set(MPIEXEC_EXECUTABLE /opt/openmpi-cuda-4.0.3/bin/mpicxx)
-#set(MPI_CXX_COMPILER /usr/local/bin/mpicxx)
-#set(MPI_HOME /usr/local)
+
 find_package(MPI REQUIRED)
 include_directories(${MPI_CXX_INCLUDE_DIRS})
 
@@ -43,8 +37,25 @@ target_link_libraries(simpleVulkan vulkan glfw)
 
 add_executable(deviceQuery deviceQuery.cpp)
 
-set_property(TARGET deviceQuery PROPERTY LINKER_LANGUAGE CUDA)
-target_link_libraries(deviceQuery cuda)
-
 add_executable(deviceQueryDrv deviceQueryDrv.cpp)
-target_link_libraries(deviceQueryDrv cuda)
\ No newline at end of file
+target_link_libraries(deviceQueryDrv cuda)
+
+include_directories(/opt/libcutensor/include)
+add_executable(contraction contraction.cu)
+target_link_directories(contraction PRIVATE /opt/libcutensor/lib/10.2)
+target_link_libraries(contraction cutensor cudart)
+
+add_executable(reduction reduction.cu)
+target_link_directories(reduction PRIVATE /opt/libcutensor/lib/10.2)
+target_link_libraries(reduction cutensor)
+
+find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+find_library(NVJPEG_LIBRARY nvjpeg ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+find_library(NPPC_LIBRARY nppc ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+find_library(NPPIG_LIBRARY nppig ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+find_library(CULIBOS culibos ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
+
+add_executable(image_resize imageResize.cpp)
+target_link_libraries(image_resize ${NVJPEG_LIBRARY} ${NPPC_LIBRARY} ${NPPIG_LIBRARY} ${CUDART_LIBRARY})
+set_property(TARGET reduction contraction bandwidthTest simpleVulkan simpleGL
+        cudaTensorCoreGemm simpleMPI asyncAPI PROPERTY CUDA_ARCHITECTURES 75)
diff --git a/contraction.cu b/contraction.cu
new file mode 100644
index 0000000..1024062
--- /dev/null
+++ b/contraction.cu
@@ -0,0 +1,310 @@
+//
+// Created by kwoodle on 6/12/20.
+//
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <unordered_map>
+#include <vector>
+
+#include <cuda_runtime.h>
+#include <cutensor.h>
+
+#define HANDLE_ERROR(x)                                               \
+{ const auto err = x;                                                 \
+  if( err != CUTENSOR_STATUS_SUCCESS )                                \
+  { printf("Error: %s\n", cutensorGetErrorString(err)); return err; } \
+};
+
+#define HANDLE_CUDA_ERROR(x)                                      \
+{ const auto err = x;                                             \
+  if( err != cudaSuccess )                                        \
+  { printf("Error: %s\n", cudaGetErrorString(err)); return err; } \
+};
+
+struct GPUTimer
+{
+    GPUTimer()
+    {
+        cudaEventCreate(&start_);
+        cudaEventCreate(&stop_);
+        cudaEventRecord(start_, 0);
+    }
+
+    ~GPUTimer()
+    {
+        cudaEventDestroy(start_);
+        cudaEventDestroy(stop_);
+    }
+
+    void start()
+    {
+        cudaEventRecord(start_, 0);
+    }
+
+    float seconds()
+    {
+        cudaEventRecord(stop_, 0);
+        cudaEventSynchronize(stop_);
+        float time;
+        cudaEventElapsedTime(&time, start_, stop_);
+        return time * 1e-3;
+    }
+private:
+    cudaEvent_t start_, stop_;
+};
+
+int main(int argc, char** argv)
+{
+    typedef float floatTypeA;
+    typedef float floatTypeB;
+    typedef float floatTypeC;
+    typedef float floatTypeCompute;
+
+    cudaDataType_t typeA = CUDA_R_32F;
+    cudaDataType_t typeB = CUDA_R_32F;
+    cudaDataType_t typeC = CUDA_R_32F;
+    cutensorComputeType_t typeCompute = CUTENSOR_R_MIN_32F;
+
+    floatTypeCompute alpha = (floatTypeCompute)1.1f;
+    floatTypeCompute beta  = (floatTypeCompute)0.f;
+
+    /**********************
+     * Computing: C_{m,u,n,v} = alpha * A_{m,h,k,n} B_{u,k,v,h} + beta * C_{m,u,n,v}
+     **********************/
+
+    std::vector<int> modeC{'m','u','n','v'};
+    std::vector<int> modeA{'m','h','k','n'};
+    std::vector<int> modeB{'u','k','v','h'};
+    int nmodeA = modeA.size();
+    int nmodeB = modeB.size();
+    int nmodeC = modeC.size();
+
+    std::unordered_map<int, int64_t> extent;
+    extent['m'] = 96;
+    extent['n'] = 96;
+    extent['u'] = 96;
+    extent['v'] = 64;
+    extent['h'] = 64;
+    extent['k'] = 64;
+
+    double gflops = (2.0 * extent['m'] * extent['n'] * extent['u'] * extent['v'] * extent['k'] * extent['h']) /1e9;
+
+    std::vector<int64_t> extentC;
+    for (auto mode : modeC)
+        extentC.push_back(extent[mode]);
+    std::vector<int64_t> extentA;
+    for (auto mode : modeA)
+        extentA.push_back(extent[mode]);
+    std::vector<int64_t> extentB;
+    for (auto mode : modeB)
+        extentB.push_back(extent[mode]);
+
+    /**********************
+     * Allocating data
+     **********************/
+
+    size_t elementsA = 1;
+    for (auto mode : modeA)
+        elementsA *= extent[mode];
+    size_t elementsB = 1;
+    for (auto mode : modeB)
+        elementsB *= extent[mode];
+    size_t elementsC = 1;
+    for (auto mode : modeC)
+        elementsC *= extent[mode];
+
+    size_t sizeA = sizeof(floatTypeA) * elementsA;
+    size_t sizeB = sizeof(floatTypeB) * elementsB;
+    size_t sizeC = sizeof(floatTypeC) * elementsC;
+    printf("Total memory: %.2f GiB\n", (sizeA + sizeB + sizeC)/1024./1024./1024);
+
+    void *A_d, *B_d, *C_d;
+    HANDLE_CUDA_ERROR(cudaMalloc((void**) &A_d, sizeA));
+    HANDLE_CUDA_ERROR(cudaMalloc((void**) &B_d, sizeB));
+    HANDLE_CUDA_ERROR(cudaMalloc((void**) &C_d, sizeC));
+
+    floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
+    floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
+    floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);
+
+    if (A == NULL || B == NULL || C == NULL)
+    {
+        printf("Error: Host allocation of A or C.\n");
+        return -1;
+    }
+
+    /*******************
+     * Initialize data
+     *******************/
+
+    for (int64_t i = 0; i < elementsA; i++)
+        A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
+    for (int64_t i = 0; i < elementsB; i++)
+        B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
+    for (int64_t i = 0; i < elementsC; i++)
+        C[i] = (((float) rand())/RAND_MAX - 0.5)*100;
+
+    HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
+    HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));
+    HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));
+
+    /*************************
+     * cuTENSOR
+     *************************/
+
+    cutensorHandle_t handle;
+    HANDLE_ERROR(cutensorInit(&handle));
+
+    /**********************
+     * Create Tensor Descriptors
+     **********************/
+
+    cutensorTensorDescriptor_t descA;
+    HANDLE_ERROR(cutensorInitTensorDescriptor(&handle,
+                                              &descA,
+                                              nmodeA,
+                                              extentA.data(),
+                                              NULL,/*stride*/
+                                              typeA, CUTENSOR_OP_IDENTITY));
+
+    cutensorTensorDescriptor_t descB;
+    HANDLE_ERROR(cutensorInitTensorDescriptor(&handle,
+                                              &descB,
+                                              nmodeB,
+                                              extentB.data(),
+                                              NULL,/*stride*/
+                                              typeB, CUTENSOR_OP_IDENTITY));
+
+    cutensorTensorDescriptor_t descC;
+    HANDLE_ERROR(cutensorInitTensorDescriptor( &handle,
+                                               &descC,
+                                               nmodeC,
+                                               extentC.data(),
+                                               NULL,/*stride*/
+                                               typeC, CUTENSOR_OP_IDENTITY));
+
+    /**********************************************
+     * Retrieve the memory alignment for each tensor
+     **********************************************/
+
+    uint32_t alignmentRequirementA;
+    HANDLE_ERROR(cutensorGetAlignmentRequirement(&handle,
+                                                 A_d,
+                                                 &descA,
+                                                 &alignmentRequirementA));
+
+    uint32_t alignmentRequirementB;
+    HANDLE_ERROR(cutensorGetAlignmentRequirement(&handle,
+                                                 B_d,
+                                                 &descB,
+                                                 &alignmentRequirementB));
+
+    uint32_t alignmentRequirementC;
+    HANDLE_ERROR(cutensorGetAlignmentRequirement(&handle,
+                                                 C_d,
+                                                 &descC,
+                                                 &alignmentRequirementC));
+
+    /*******************************
+     * Create Contraction Descriptor
+     *******************************/
+
+    cutensorContractionDescriptor_t desc;
+    HANDLE_ERROR(cutensorInitContractionDescriptor(&handle,
+                                                   &desc,
+                                                   &descA, modeA.data(), alignmentRequirementA,
+                                                   &descB, modeB.data(), alignmentRequirementB,
+                                                   &descC, modeC.data(), alignmentRequirementC,
+                                                   &descC, modeC.data(), alignmentRequirementC,
+                                                   typeCompute));
+
+    /**************************
+    * Set the algorithm to use
+    ***************************/
+
+    cutensorContractionFind_t find;
+    HANDLE_ERROR(cutensorInitContractionFind(
+            &handle, &find,
+            CUTENSOR_ALGO_DEFAULT));
+
+    /**********************
+     * Query workspace
+     **********************/
+
+    uint64_t worksize = 0;
+    HANDLE_ERROR(cutensorContractionGetWorkspace(&handle,
+                                                 &desc,
+                                                 &find,
+                                                 CUTENSOR_WORKSPACE_RECOMMENDED, &worksize));
+
+    void *work = nullptr;
+    if (worksize > 0)
+    {
+        if (cudaSuccess != cudaMalloc(&work, worksize))
+        {
+            work = nullptr;
+            worksize = 0;
+        }
+    }
+
+    /**************************
+     * Create Contraction Plan
+     **************************/
+
+    cutensorContractionPlan_t plan;
+    HANDLE_ERROR(cutensorInitContractionPlan(&handle,
+                                             &plan,
+                                             &desc,
+                                             &find,
+                                             worksize));
+
+    /**********************
+     * Run
+     **********************/
+
+    double minTimeCUTENSOR = 1e100;
+    cutensorStatus_t err;
+    for (int i=0; i < 3; ++i)
+    {
+        cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice);
+        cudaDeviceSynchronize();
+
+        // Set up timing
+        GPUTimer timer;
+        timer.start();
+
+        err = cutensorContraction(&handle,
+                                  &plan,
+                                  (void*) &alpha, A_d, B_d,
+                                  (void*) &beta,  C_d, C_d,
+                                  work, worksize, 0 /* stream */);
+
+        // Synchronize and measure timing
+        auto time = timer.seconds();
+
+        if (err != CUTENSOR_STATUS_SUCCESS)
+        {
+            printf("ERROR: %s in line %d\n", cutensorGetErrorString(err), __LINE__);
+        }
+        minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
+    }
+
+    /*************************/
+
+    double transferedBytes = sizeC + sizeA + sizeB;
+    transferedBytes += ((float) beta != 0.f) ? sizeC : 0;
+    transferedBytes /= 1e9;
+    printf("cuTensor: %.2f GFLOPs/s %.2f GB/s\n", gflops / minTimeCUTENSOR, transferedBytes/ minTimeCUTENSOR);
+
+    if (A) free(A);
+    if (B) free(B);
+    if (C) free(C);
+    if (A_d) cudaFree(A_d);
+    if (B_d) cudaFree(B_d);
+    if (C_d) cudaFree(C_d);
+    if (work) cudaFree(work);
+
+    return 0;
+}
diff --git a/imageResize.cpp b/imageResize.cpp
new file mode 100644
index 0000000..a5052fc
--- /dev/null
+++ b/imageResize.cpp
@@ -0,0 +1,405 @@
+//
+// Created by kwoodle on 6/12/20.
+//
+
+#include "imageResize.h"
+
+//#define OPTIMIZED_HUFFMAN
+//#define CUDA10U2
+
+// *****************************************************************************
+// nvJPEG handles and parameters
+// -----------------------------------------------------------------------------
+nvjpegBackend_t impl = NVJPEG_BACKEND_GPU_HYBRID; //NVJPEG_BACKEND_DEFAULT;
+nvjpegHandle_t nvjpeg_handle;
+nvjpegJpegStream_t nvjpeg_jpeg_stream;
+nvjpegDecodeParams_t nvjpeg_decode_params;
+nvjpegJpegState_t nvjpeg_decoder_state;
+nvjpegEncoderParams_t nvjpeg_encode_params;
+nvjpegEncoderState_t nvjpeg_encoder_state;
+
+#ifdef CUDA10U2 // This part needs CUDA 10.1 Update 2 for copy the metadata other information from base image.
+nvjpegJpegEncoding_t nvjpeg_encoding;
+#endif
+
+
+// *****************************************************************************
+// Decode, Resize and Encoder function
+// -----------------------------------------------------------------------------
+int decodeResizeEncodeOneImage(std::string sImagePath, std::string sOutputPath, double &time, int resizeWidth, int resizeHeight, int resize_quality)
+{
+    // Decode, Encoder format
+    nvjpegOutputFormat_t oformat = NVJPEG_OUTPUT_BGR;
+    nvjpegInputFormat_t iformat = NVJPEG_INPUT_BGR;
+
+    // timing for resize
+    time = 0.;
+    float resize_time = 0.;
+    cudaEvent_t start, stop;
+    CHECK_CUDA(cudaEventCreate(&start));
+    CHECK_CUDA(cudaEventCreate(&stop));
+
+    // Image reading section
+    // Get the file name, without extension.
+    // This will be used to rename the output file.
+    size_t position = sImagePath.rfind("/");
+    std::string sFileName = (std::string::npos == position)? sImagePath : sImagePath.substr(position + 1, sImagePath.size());
+    position = sFileName.rfind(".");
+    sFileName = (std::string::npos == position)? sFileName : sFileName.substr(0, position);
+
+#ifndef _WIN64
+    position = sFileName.rfind("/");
+    sFileName = (std::string::npos == position) ? sFileName : sFileName.substr(position + 1, sFileName.length());
+#else
+    position = sFileName.rfind("\\");
+    sFileName = (std::string::npos == position) ? sFileName : sFileName.substr(position+1, sFileName.length());
+#endif
+
+    // Read an image from disk.
+    std::ifstream oInputStream(sImagePath.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
+    if(!(oInputStream.is_open()))
+    {
+        std::cerr << "Cannot open image: " << sImagePath << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    // Get the size.
+    std::streamsize nSize = oInputStream.tellg();
+    oInputStream.seekg(0, std::ios::beg);
+
+    // Image buffers.
+    unsigned char * pBuffer = NULL;
+    unsigned char * pResizeBuffer = NULL;
+
+    std::vector<char> vBuffer(nSize);
+    if (oInputStream.read(vBuffer.data(), nSize))
+    {
+        unsigned char * dpImage = (unsigned char *)vBuffer.data();
+
+        // Retrieve the componenet and size info.
+        int nComponent = 0;
+        nvjpegChromaSubsampling_t subsampling;
+        int widths[NVJPEG_MAX_COMPONENT];
+        int heights[NVJPEG_MAX_COMPONENT];
+        int nReturnCode = 0;
+        if (NVJPEG_STATUS_SUCCESS != nvjpegGetImageInfo(nvjpeg_handle, dpImage, nSize, &nComponent, &subsampling, widths, heights))
+        {
+            std::cerr << "Error decoding JPEG header: " << sImagePath << std::endl;
+            return EXIT_FAILURE;
+        }
+
+        if(resizeWidth == 0 || resizeHeight == 0)
+        {
+            resizeWidth = widths[0]/2;
+            resizeHeight = heights[0]/2;
+        }
+
+        // image resize
+        size_t pitchDesc, pitchResize;
+        NppiSize srcSize = { (int)widths[0], (int)heights[0] };
+        NppiRect srcRoi = { 0, 0, srcSize.width, srcSize.height };
+        NppiSize dstSize = { (int)resizeWidth, (int)resizeHeight };
+        NppiRect dstRoi = { 0, 0, dstSize.width, dstSize.height };
+        NppStatus st;
+        NppStreamContext nppStreamCtx;
+        nppStreamCtx.hStream = NULL; // default stream
+
+        // device image buffers.
+        nvjpegImage_t imgDesc;
+        nvjpegImage_t imgResize;
+
+        if (is_interleaved(oformat))
+        {
+            pitchDesc = NVJPEG_MAX_COMPONENT * widths[0];
+            pitchResize = NVJPEG_MAX_COMPONENT * resizeWidth;
+        }
+        else
+        {
+            pitchDesc = 3 * widths[0];
+            pitchResize = 3 * resizeWidth;
+        }
+
+        cudaError_t eCopy = cudaMalloc(&pBuffer, pitchDesc * heights[0]);
+        if (cudaSuccess != eCopy)
+        {
+            std::cerr << "cudaMalloc failed : " << cudaGetErrorString(eCopy) << std::endl;
+            return EXIT_FAILURE;
+        }
+        cudaError_t eCopy1 = cudaMalloc(&pResizeBuffer, pitchResize * resizeHeight);
+        if (cudaSuccess != eCopy1)
+        {
+            std::cerr << "cudaMalloc failed : " << cudaGetErrorString(eCopy) << std::endl;
+            return EXIT_FAILURE;
+        }
+
+
+        imgDesc.channel[0] = pBuffer;
+        imgDesc.channel[1] = pBuffer + widths[0] * heights[0];
+        imgDesc.channel[2] = pBuffer + widths[0] * heights[0] * 2;
+        imgDesc.pitch[0] = (unsigned int)(is_interleaved(oformat) ? widths[0] * NVJPEG_MAX_COMPONENT : widths[0]);
+        imgDesc.pitch[1] = (unsigned int)widths[0];
+        imgDesc.pitch[2] = (unsigned int)widths[0];
+
+        imgResize.channel[0] = pResizeBuffer;
+        imgResize.channel[1] = pResizeBuffer + resizeWidth * resizeHeight;
+        imgResize.channel[2] = pResizeBuffer + resizeWidth * resizeHeight * 2;
+        imgResize.pitch[0] = (unsigned int)(is_interleaved(oformat) ? resizeWidth * NVJPEG_MAX_COMPONENT : resizeWidth);;
+        imgResize.pitch[1] = (unsigned int)resizeWidth;
+        imgResize.pitch[2] = (unsigned int)resizeWidth;
+
+        if (is_interleaved(oformat))
+        {
+            imgDesc.channel[3] = pBuffer + widths[0] * heights[0] * 3;
+            imgDesc.pitch[3] = (unsigned int)widths[0];
+            imgResize.channel[3] = pResizeBuffer + resizeWidth * resizeHeight * 3;
+            imgResize.pitch[3] = (unsigned int)resizeWidth;
+        }
+
+        // nvJPEG encoder parameter setting
+        CHECK_NVJPEG(nvjpegEncoderParamsSetQuality(nvjpeg_encode_params, resize_quality, NULL));
+
+#ifdef OPTIMIZED_HUFFMAN  // Optimized Huffman
+        CHECK_NVJPEG(nvjpegEncoderParamsSetOptimizedHuffman(nvjpeg_encode_params, 1, NULL));
+#endif
+        CHECK_NVJPEG(nvjpegEncoderParamsSetSamplingFactors(nvjpeg_encode_params, subsampling, NULL));
+
+
+        // Timing start
+        CHECK_CUDA(cudaEventRecord(start, 0));
+
+#ifdef CUDA10U2 // This part needs CUDA 10.1 Update 2
+        //parse image save metadata in jpegStream structure
+        CHECK_NVJPEG(nvjpegJpegStreamParse(nvjpeg_handle, dpImage, nSize, 1, 0, nvjpeg_jpeg_stream));
+#endif
+
+        // decode by stages
+        nReturnCode = nvjpegDecode(nvjpeg_handle, nvjpeg_decoder_state, dpImage, nSize, oformat, &imgDesc, NULL);
+        if(nReturnCode != 0)
+        {
+            std::cerr << "Error in nvjpegDecode." << nReturnCode << std::endl;
+            return EXIT_FAILURE;
+        }
+
+        // image resize
+        /* Note: this is the simplest resizing function from NPP. */
+        if (is_interleaved(oformat))
+        {
+            st = nppiResize_8u_C3R_Ctx(imgDesc.channel[0], imgDesc.pitch[0], srcSize, srcRoi,
+                                       imgResize.channel[0], imgResize.pitch[0], dstSize, dstRoi, NPPI_INTER_LANCZOS, nppStreamCtx);
+        }
+        else
+        {
+            st = nppiResize_8u_C1R_Ctx(imgDesc.channel[0], imgDesc.pitch[0], srcSize, srcRoi,
+                                       imgResize.channel[0], imgResize.pitch[0], dstSize, dstRoi, NPPI_INTER_LANCZOS, nppStreamCtx);
+            st = nppiResize_8u_C1R_Ctx(imgDesc.channel[1], imgDesc.pitch[1], srcSize, srcRoi,
+                                       imgResize.channel[1], imgResize.pitch[1], dstSize, dstRoi, NPPI_INTER_LANCZOS, nppStreamCtx);
+            st = nppiResize_8u_C1R_Ctx(imgDesc.channel[2], imgDesc.pitch[2], srcSize, srcRoi,
+                                       imgResize.channel[2], imgResize.pitch[2], dstSize, dstRoi, NPPI_INTER_LANCZOS, nppStreamCtx);
+        }
+
+        if (st != NPP_SUCCESS)
+        {
+            std::cerr << "NPP resize failed : " << st << std::endl;
+            return EXIT_FAILURE;
+        }
+
+        // get encoding from the jpeg stream and copy it to the encode parameters
+#ifdef CUDA10U2 // This part needs CUDA 10.1 Update 2 for copy the metadata other information from base image.
+        CHECK_NVJPEG(nvjpegJpegStreamGetJpegEncoding(nvjpeg_jpeg_stream, &nvjpeg_encoding));
+        CHECK_NVJPEG(nvjpegEncoderParamsSetEncoding(nvjpeg_encode_params, nvjpeg_encoding, NULL));
+        CHECK_NVJPEG(nvjpegEncoderParamsCopyQuantizationTables(nvjpeg_encode_params, nvjpeg_jpeg_stream, NULL));
+        CHECK_NVJPEG(nvjpegEncoderParamsCopyHuffmanTables(nvjpeg_encoder_state, nvjpeg_encode_params, nvjpeg_jpeg_stream, NULL));
+        CHECK_NVJPEG(nvjpegEncoderParamsCopyMetadata(nvjpeg_encoder_state, nvjpeg_encode_params, nvjpeg_jpeg_stream, NULL));
+#endif
+
+        // encoding the resize data
+        CHECK_NVJPEG(nvjpegEncodeImage(nvjpeg_handle,
+                                       nvjpeg_encoder_state,
+                                       nvjpeg_encode_params,
+                                       &imgResize,
+                                       iformat,
+                                       dstSize.width,
+                                       dstSize.height,
+                                       NULL));
+
+        // retrive the encoded bitstream for file writing
+        std::vector<unsigned char> obuffer;
+        size_t length;
+        CHECK_NVJPEG(nvjpegEncodeRetrieveBitstream(
+                nvjpeg_handle,
+                nvjpeg_encoder_state,
+                NULL,
+                &length,
+                NULL));
+
+        obuffer.resize(length);
+
+        CHECK_NVJPEG(nvjpegEncodeRetrieveBitstream(
+                nvjpeg_handle,
+                nvjpeg_encoder_state,
+                obuffer.data(),
+                &length,
+                NULL));
+
+        // Timing stop
+        CHECK_CUDA(cudaEventRecord(stop, 0));
+        CHECK_CUDA(cudaEventSynchronize(stop));
+
+        // file writing
+        std::cout << "Resize-width: " << dstSize.width << " Resize-height: " << dstSize.height << std::endl;
+        std::string output_filename = sOutputPath + "/" + sFileName + ".jpg";
+        char directory[120];
+        char mkdir_cmd[256];
+        std::string folder = sOutputPath;
+        output_filename = folder + "/"+ sFileName +".jpg";
+#if !defined(_WIN32)
+        sprintf(directory, "%s", folder.c_str());
+        sprintf(mkdir_cmd, "mkdir -p %s 2> /dev/null", directory);
+#else
+        sprintf(directory, "%s", folder.c_str());
+        sprintf(mkdir_cmd, "mkdir %s 2> nul", directory);
+#endif
+
+        int ret = system(mkdir_cmd);
+
+        std::cout << "Writing JPEG file: " << output_filename << std::endl;
+        std::ofstream outputFile(output_filename.c_str(), std::ios::out | std::ios::binary);
+        outputFile.write(reinterpret_cast<const char *>(obuffer.data()), static_cast<int>(length));
+    }
+    // Free memory
+    CHECK_CUDA(cudaFree(pBuffer));
+    CHECK_CUDA(cudaFree(pResizeBuffer));
+
+    // get timing
+    CHECK_CUDA(cudaEventElapsedTime(&resize_time, start, stop));
+    time = (double)resize_time;
+
+    return EXIT_SUCCESS;
+}
+
+// *****************************************************************************
+// parsing the arguments function
+// -----------------------------------------------------------------------------
+int processArgs(image_resize_params_t param)
+{
+    std::string sInputPath(param.input_dir);
+    std::string sOutputPath(param.output_dir);
+    int resizeWidth = param.width;
+    int resizeHeight = param.height;
+    int resize_quality = param.quality;
+
+    int error_code = 1;
+
+    double total_time = 0., decode_time = 0.;
+    int total_images = 0;
+
+    std::vector<std::string> inputFiles;
+    if (readInput(sInputPath, inputFiles))
+    {
+        return error_code;
+    }
+    for (unsigned int i = 0; i < inputFiles.size(); i++)
+    {
+        std::string &sFileName = inputFiles[i];
+        std::cout << "Processing file: " << sFileName << std::endl;
+
+        int image_error_code = decodeResizeEncodeOneImage(sFileName, sOutputPath, decode_time, resizeWidth, resizeHeight, resize_quality);
+
+        if (image_error_code)
+        {
+            std::cerr << "Error processing file: " << sFileName << std::endl;
+            return image_error_code;
+        }
+        else
+        {
+            total_images++;
+            total_time += decode_time;
+        }
+    }
+
+    std::cout << "------------------------------------------------------------- " << std::endl;
+    std::cout << "Total images resized: " << total_images << std::endl;
+    std::cout << "Total time spent on resizing: " << total_time << " (ms)" << std::endl;
+    std::cout << "Avg time/image: " << total_time/total_images << " (ms)" << std::endl;
+    std::cout << "------------------------------------------------------------- " << std::endl;
+    return EXIT_SUCCESS;
+}
+
+// *****************************************************************************
+// main image resize function
+// -----------------------------------------------------------------------------
+int main(int argc, const char *argv[])
+{
+    int pidx;
+
+    if ((pidx = findParamIndex(argv, argc, "-h")) != -1 ||
+        (pidx = findParamIndex(argv, argc, "--help")) != -1) {
+        std::cout << "Usage: " << argv[0]
+                  << " -i images-dir  [-o output-dir]"
+                     "[-q jpeg-quality][-rw resize-width ] [-rh resize-height]\n";
+        std::cout << "Parameters: " << std::endl;
+        std::cout << "\timages-dir\t:\tPath to single image or directory of images" << std::endl;
+        std::cout << "\toutput-dir\t:\tWrite resized images to this directory [default resize_output]" << std::endl;
+        std::cout << "\tJPEG Quality\t:\tUse image quality [default 85]" << std::endl;
+        std::cout << "\tResize Width\t:\t Resize width [default original_img_width/2]" << std::endl;
+        std::cout << "\tResize Height\t:\t Resize height [default original_img_height/2]" << std::endl;
+        return EXIT_SUCCESS;
+    }
+
+    image_resize_params_t params;
+
+    params.input_dir = "./";
+    if ((pidx = findParamIndex(argv, argc, "-i")) != -1) {
+        params.input_dir = argv[pidx + 1];
+    } else {
+        // Search in default paths for input images.
+        int found = getInputDir(params.input_dir, argv[0]);
+        if (!found)
+        {
+            std::cout << "Please specify input directory for image resizing"<< std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+    if ((pidx = findParamIndex(argv, argc, "-o")) != -1) {
+        params.output_dir = argv[pidx + 1];
+    } else {
+        // by-default write the folder named "output" in cwd
+        params.output_dir = "resize_output";
+    }
+
+    params.quality = 85;
+    if ((pidx = findParamIndex(argv, argc, "-q")) != -1) {
+        params.quality = std::atoi(argv[pidx + 1]);
+    }
+
+    params.width = 0;
+    if ((pidx = findParamIndex(argv, argc, "-rw")) != -1) {
+        params.width = std::atoi(argv[pidx + 1]);
+    }
+
+    params.height = 0;
+    if ((pidx = findParamIndex(argv, argc, "-rh")) != -1) {
+        params.height = std::atoi(argv[pidx + 1]);
+    }
+
+    nvjpegDevAllocator_t dev_allocator = {&dev_malloc, &dev_free};
+    CHECK_NVJPEG(nvjpegCreate(impl, &dev_allocator, &nvjpeg_handle));
+    CHECK_NVJPEG(nvjpegJpegStateCreate(nvjpeg_handle, &nvjpeg_decoder_state));
+
+    // create bitstream object
+    CHECK_NVJPEG(nvjpegJpegStreamCreate(nvjpeg_handle, &nvjpeg_jpeg_stream));
+    CHECK_NVJPEG(nvjpegDecodeParamsCreate(nvjpeg_handle, &nvjpeg_decode_params));
+    CHECK_NVJPEG(nvjpegEncoderStateCreate(nvjpeg_handle, &nvjpeg_encoder_state, NULL));
+    CHECK_NVJPEG(nvjpegEncoderParamsCreate(nvjpeg_handle, &nvjpeg_encode_params, NULL));
+
+    pidx = processArgs(params);
+
+    CHECK_NVJPEG(nvjpegEncoderParamsDestroy(nvjpeg_encode_params));
+    CHECK_NVJPEG(nvjpegDecodeParamsDestroy(nvjpeg_decode_params));
+    CHECK_NVJPEG(nvjpegEncoderStateDestroy(nvjpeg_encoder_state));
+    CHECK_NVJPEG(nvjpegJpegStateDestroy(nvjpeg_decoder_state));
+    CHECK_NVJPEG(nvjpegDestroy(nvjpeg_handle));
+
+    return pidx;
+}
+
diff --git a/imageResize.h b/imageResize.h
new file mode 100644
index 0000000..6249c83
--- /dev/null
+++ b/imageResize.h
@@ -0,0 +1,234 @@
+//
+// Created by kwoodle on 6/12/20.
+//
+
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <algorithm>
+
+#include <string.h>  // strcmpi
+#ifndef _WIN64
+#include <sys/time.h>  // timings
+#include <unistd.h>
+#endif
+#include <dirent.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+
+#include <cuda_runtime_api.h>
+#include <nvjpeg.h>
+#include <nppi_geometry_transforms.h>
+
+
+#define CHECK_CUDA(call)                                                        \
+    {                                                                           \
+        cudaError_t _e = (call);                                                \
+        if (_e != cudaSuccess)                                                  \
+        {                                                                       \
+            std::cout << "CUDA Runtime failure: '#" << _e << "' at " <<  __FILE__ << ":" << __LINE__ << std::endl;\
+            exit(1);                                                            \
+        }                                                                       \
+    }
+
+#define CHECK_NVJPEG(call)                                                      \
+    {                                                                           \
+        nvjpegStatus_t _e = (call);                                             \
+        if (_e != NVJPEG_STATUS_SUCCESS)                                        \
+        {                                                                       \
+            std::cout << "NVJPEG failure: '#" << _e << "' at " <<  __FILE__ << ":" << __LINE__ << std::endl;\
+            exit(1);                                                            \
+        }                                                                       \
+    }
+
+struct image_resize_params_t {
+    std::string input_dir;
+    std::string output_dir;
+    int quality;
+    int width;
+    int height;
+    int dev;
+};
+
+
+typedef struct {
+    NppiSize size;
+    nvjpegImage_t data;
+} image_t;
+
+
+int dev_malloc(void** p, size_t s)
+{
+    return (int)cudaMalloc(p, s);
+}
+
+int dev_free(void* p)
+{
+    return (int)cudaFree(p);
+}
+
+bool is_interleaved(nvjpegOutputFormat_t format)
+{
+    if (format == NVJPEG_OUTPUT_RGBI || format == NVJPEG_OUTPUT_BGRI)
+        return true;
+    else
+        return false;
+}
+
+
+// *****************************************************************************
+// reading input directory to file list
+// -----------------------------------------------------------------------------
+int readInput(const std::string &sInputPath, std::vector<std::string> &filelist)
+{
+    int error_code = 1;
+    struct stat s;
+
+    if( stat(sInputPath.c_str(), &s) == 0 )
+    {
+        if( s.st_mode & S_IFREG )
+        {
+            filelist.push_back(sInputPath);
+        }
+        else if( s.st_mode & S_IFDIR )
+        {
+            // processing each file in directory
+            DIR *dir_handle;
+            struct dirent *dir;
+            dir_handle = opendir(sInputPath.c_str());
+            std::vector<std::string> filenames;
+            if (dir_handle)
+            {
+                error_code = 0;
+                while ((dir = readdir(dir_handle)) != NULL)
+                {
+                    if (dir->d_type == DT_REG)
+                    {
+                        std::string sFileName = sInputPath + dir->d_name;
+                        filelist.push_back(sFileName);
+                    }
+                    else if (dir->d_type == DT_DIR)
+                    {
+                        std::string sname = dir->d_name;
+                        if (sname != "." && sname != "..")
+                        {
+                            readInput(sInputPath + sname + "/", filelist);
+                        }
+                    }
+                }
+                closedir(dir_handle);
+            }
+            else
+            {
+                std::cout << "Cannot open input directory: " << sInputPath << std::endl;
+                return error_code;
+            }
+        }
+        else
+        {
+            std::cout << "Cannot open input: " << sInputPath << std::endl;
+            return error_code;
+        }
+    }
+    else
+    {
+        std::cout << "Cannot find input path " << sInputPath << std::endl;
+        return error_code;
+    }
+
+    return 0;
+}
+
+// *****************************************************************************
+// check for inputDirExists
+// -----------------------------------------------------------------------------
+int inputDirExists(const char *pathname) {
+    struct stat info;
+    if (stat(pathname, &info) != 0) {
+        return 0;  // Directory does not exists
+    } else if (info.st_mode & S_IFDIR) {
+        // is a directory
+        return 1;
+    } else {
+        // is not a directory
+        return 0;
+    }
+}
+
+// *****************************************************************************
+// check for getInputDir
+// -----------------------------------------------------------------------------
+int getInputDir(std::string &input_dir, const char *executable_path) {
+    int found = 0;
+    if (executable_path != 0) {
+        std::string executable_name = std::string(executable_path);
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        // Windows path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('\\');
+    executable_name.erase(0, delimiter_pos + 1);
+
+    if (executable_name.rfind(".exe") != std::string::npos) {
+      // we strip .exe, only if the .exe is found
+      executable_name.resize(executable_name.size() - 4);
+    }
+#else
+        // Linux & OSX path delimiter
+        size_t delimiter_pos = executable_name.find_last_of('/');
+        executable_name.erase(0, delimiter_pos + 1);
+#endif
+
+        // Search in default paths for input images.
+        std::string pathname = "";
+        const char *searchPath[] = {
+                "./images"};
+
+        for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
+            std::string pathname(searchPath[i]);
+            size_t executable_name_pos = pathname.find("<executable_name>");
+
+            // If there is executable_name variable in the searchPath
+            // replace it with the value
+            if (executable_name_pos != std::string::npos) {
+                pathname.replace(executable_name_pos, strlen("<executable_name>"),
+                                 executable_name);
+            }
+
+            if (inputDirExists(pathname.c_str())) {
+                input_dir = pathname + "/";
+                found = 1;
+                break;
+            }
+        }
+    }
+    return found;
+}
+
+// *****************************************************************************
+// parse parameters
+// -----------------------------------------------------------------------------
+int findParamIndex(const char **argv, int argc, const char *parm) {
+    int count = 0;
+    int index = -1;
+
+    for (int i = 0; i < argc; i++) {
+        if (strncmp(argv[i], parm, 100) == 0) {
+            index = i;
+            count++;
+        }
+    }
+
+    if (count == 0 || count == 1) {
+        return index;
+    } else {
+        std::cout << "Error, parameter " << parm
+                  << " has been specified more than once, exiting\n"
+                  << std::endl;
+        return -1;
+    }
+
+    return -1;
+}
diff --git a/reduction.cu b/reduction.cu
new file mode 100644
index 0000000..da4f144
--- /dev/null
+++ b/reduction.cu
@@ -0,0 +1,229 @@
+//
+// Created by kwoodle on 6/12/20.
+//
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <unordered_map>
+#include <vector>
+
+#include <cuda_runtime.h>
+#include <cutensor.h>
+
+#define HANDLE_ERROR(x)                                               \
+{ const auto err = x;                                                 \
+  if( err != CUTENSOR_STATUS_SUCCESS )                                \
+  { printf("Error: %s\n", cutensorGetErrorString(err)); return err; } \
+};
+
+#define HANDLE_CUDA_ERROR(x)                                      \
+{ const auto err = x;                                             \
+  if( err != cudaSuccess )                                        \
+  { printf("Error: %s\n", cudaGetErrorString(err)); return err; } \
+};
+
+struct GPUTimer
+{
+    GPUTimer()
+    {
+        cudaEventCreate(&start_);
+        cudaEventCreate(&stop_);
+        cudaEventRecord(start_, 0);
+    }
+
+    ~GPUTimer()
+    {
+        cudaEventDestroy(start_);
+        cudaEventDestroy(stop_);
+    }
+
+    void start()
+    {
+        cudaEventRecord(start_, 0);
+    }
+
+    float seconds()
+    {
+        cudaEventRecord(stop_, 0);
+        cudaEventSynchronize(stop_);
+        float time;
+        cudaEventElapsedTime(&time, start_, stop_);
+        return time * 1e-3;
+    }
+private:
+    cudaEvent_t start_, stop_;
+};
+
+int main(int argc, char** argv)
+{
+    typedef float floatTypeA;
+    typedef float floatTypeB;
+    typedef float floatTypeC;
+    typedef float floatTypeCompute;
+
+    cudaDataType_t typeA = CUDA_R_32F;
+    cudaDataType_t typeC = CUDA_R_32F;
+    cutensorComputeType_t typeCompute = CUTENSOR_R_MIN_32F;
+
+    floatTypeCompute alpha = (floatTypeCompute)1.1f;
+    floatTypeCompute beta  = (floatTypeCompute)0.f;
+
+    /**********************
+     * Computing (partial) reduction : C_{m,v} = alpha * A_{m,h,k,v} + beta * C_{m,v}
+     *********************/
+
+    std::vector<int32_t> modeA{'m','h','k','v'};
+    std::vector<int32_t> modeC{'m','v'};
+    int32_t nmodeA = modeA.size();
+    int32_t nmodeC = modeC.size();
+
+    std::unordered_map<int32_t, int64_t> extent;
+    extent['m'] = 196;
+    extent['v'] = 64;
+    extent['h'] = 256;
+    extent['k'] = 64;
+
+    std::vector<int64_t> extentC;
+    for (auto mode : modeC)
+        extentC.push_back(extent[mode]);
+    std::vector<int64_t> extentA;
+    for (auto mode : modeA)
+        extentA.push_back(extent[mode]);
+
+    /**********************
+     * Allocating data
+     *********************/
+
+    size_t elementsA = 1;
+    for (auto mode : modeA)
+        elementsA *= extent[mode];
+    size_t elementsC = 1;
+    for (auto mode : modeC)
+        elementsC *= extent[mode];
+
+    size_t sizeA = sizeof(floatTypeA) * elementsA;
+    size_t sizeC = sizeof(floatTypeC) * elementsC;
+    printf("Total memory: %.2f GiB\n",(sizeA + sizeC)/1024./1024./1024);
+
+    void *A_d, *C_d;
+    HANDLE_CUDA_ERROR(cudaMalloc((void**)&A_d, sizeA));
+    HANDLE_CUDA_ERROR(cudaMalloc((void**)&C_d, sizeC));
+
+    floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
+    floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);
+
+    if (A == NULL || C == NULL)
+    {
+        printf("Error: Host allocation of A, B, or C.\n");
+        return -1;
+    }
+
+    /*******************
+     * Initialize data
+     *******************/
+
+    for (int64_t i = 0; i < elementsA; i++)
+        A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
+    for (int64_t i = 0; i < elementsC; i++)
+        C[i] = (((float) rand())/RAND_MAX - 0.5)*100;
+
+    HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));
+    HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
+
+    /*************************
+     * cuTENSOR
+     *************************/
+
+    cutensorHandle_t handle;
+    HANDLE_ERROR(cutensorInit(&handle));
+
+    /**********************
+     * Create Tensor Descriptors
+     **********************/
+
+    cutensorTensorDescriptor_t descA;
+    HANDLE_ERROR(cutensorInitTensorDescriptor(&handle,
+                                              &descA,
+                                              nmodeA,
+                                              extentA.data(),
+                                              NULL /* stride */,
+                                              typeA, CUTENSOR_OP_IDENTITY));
+
+    cutensorTensorDescriptor_t descC;
+    HANDLE_ERROR(cutensorInitTensorDescriptor(&handle,
+                                              &descC,
+                                              nmodeC,
+                                              extentC.data(),
+                                              NULL /* stride */,
+                                              typeC, CUTENSOR_OP_IDENTITY));
+
+    const cutensorOperator_t opReduce = CUTENSOR_OP_ADD;
+
+    /**********************
+     * Querry workspace
+     **********************/
+
+    uint64_t worksize = 0;
+    HANDLE_ERROR(cutensorReductionGetWorkspace(&handle,
+                                               A_d, &descA, modeA.data(),
+                                               C_d, &descC, modeC.data(),
+                                               C_d, &descC, modeC.data(),
+                                               opReduce, typeCompute, &worksize));
+    void *work = nullptr;
+    if (worksize > 0)
+    {
+        if (cudaSuccess != cudaMalloc(&work, worksize))
+        {
+            work = nullptr;
+            worksize = 0;
+        }
+    }
+
+    /**********************
+     * Run
+     **********************/
+
+    double minTimeCUTENSOR = 1e100;
+    cutensorStatus_t err;
+    for(int i=0; i < 3; ++i)
+    {
+        HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));
+        HANDLE_CUDA_ERROR(cudaDeviceSynchronize());
+
+        // Set up timing
+        GPUTimer timer;
+        timer.start();
+
+        err = cutensorReduction(&handle,
+                                (const void*)&alpha, A_d, &descA, modeA.data(),
+                                (const void*)&beta,  C_d, &descC, modeC.data(),
+                                C_d, &descC, modeC.data(),
+                                opReduce, typeCompute, work, worksize, 0 /* stream */);
+
+        // Synchronize and measure timing
+        auto time = timer.seconds();
+
+        if (err != CUTENSOR_STATUS_SUCCESS)
+        {
+            printf("ERROR: %s in line %d\n", cutensorGetErrorString(err), __LINE__);
+        }
+        minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
+    }
+
+    /*************************/
+
+    double transferedBytes = sizeC + sizeA;
+    transferedBytes += ((float) beta != 0.f) ? sizeC : 0;
+    transferedBytes /= 1e9;
+    printf("cuTensor: %.2f GB/s\n", transferedBytes / minTimeCUTENSOR);
+
+    if (A) free(A);
+    if (C) free(C);
+    if (A_d) cudaFree(A_d);
+    if (C_d) cudaFree(C_d);
+    if (work) cudaFree(work);
+
+    return 0;
+}