From 0e8fcea1bdfe43e541b1dacd4af699de080a8853 Mon Sep 17 00:00:00 2001 From: kwoodle Date: Fri, 23 Apr 2021 22:44:26 -0400 Subject: [PATCH] Fixed CMakeLists.txt so you don't have to change CLion Build Settings. --- CMakeLists.txt | 43 +++-- contraction.cu | 310 ++++++++++++++++++++++++++++++++++++ imageResize.cpp | 405 ++++++++++++++++++++++++++++++++++++++++++++++++ imageResize.h | 234 ++++++++++++++++++++++++++++ reduction.cu | 229 +++++++++++++++++++++++++++ 5 files changed, 1205 insertions(+), 16 deletions(-) create mode 100644 contraction.cu create mode 100644 imageResize.cpp create mode 100644 imageResize.h create mode 100644 reduction.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 7626076..4461704 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,25 +1,19 @@ -# NO LONGER NEEDS THESE SETTINGS? -# Needs CUDAHOSTCXX=/usr/bin/cuda-g++;CUDA_LIB_PATH=/usr/local/cuda/lib64 in -# CLion Settings | Build, Execution, Deployment | CMake > Environment on fedora -# and CUDAHOSTCXX=/usr/bin/g++-8;CUDA_LIB_PATH=/usr/lib/cuda/lib64 on pop_os +# Changes for cmake 3.18 +cmake_minimum_required(VERSION 3.18) +#list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") -cmake_minimum_required(VERSION 3.16) -set(CMAKE_CUDA_HOST_COMPILER /usr/bin/g++-8) project(CudaSamples LANGUAGES CXX CUDA) -#set(ENV{MPI_HOME} /usr/local ) set(CMAKE_CUDA_STANDARD 14) -set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-arch=sm_75") + set(cuda_include_dirs = ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) set(cuda_samples_include = "${cuda_include_dirs}/../../../samples/common/inc/") +message(STATUS "cuda_include_dirs = ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}") include_directories(${cuda_samples_include}) include_directories(${cuda_include_dirs}) set(CUDA_VERBOSE_BUILD ON) -set(CUDA_SEPARABLE_COMPILATION ON) # Allow multiple CUDA files compilation add_executable(asyncAPI asyncAPI.cu) -set(MPIEXEC_EXECUTABLE /opt/openmpi-cuda-4.0.3/bin/mpicxx) -#set(MPI_CXX_COMPILER /usr/local/bin/mpicxx) -#set(MPI_HOME /usr/local) + find_package(MPI REQUIRED) include_directories(${MPI_CXX_INCLUDE_DIRS}) @@ -43,8 +37,25 @@ target_link_libraries(simpleVulkan vulkan glfw) add_executable(deviceQuery deviceQuery.cpp) -set_property(TARGET deviceQuery PROPERTY LINKER_LANGUAGE CUDA) -target_link_libraries(deviceQuery cuda) - add_executable(deviceQueryDrv deviceQueryDrv.cpp) -target_link_libraries(deviceQueryDrv cuda) \ No newline at end of file +target_link_libraries(deviceQueryDrv cuda) + +include_directories(/opt/libcutensor/include) +add_executable(contraction contraction.cu) +target_link_directories(contraction PRIVATE /opt/libcutensor/lib/10.2) +target_link_libraries(contraction cutensor cudart) + +add_executable(reduction reduction.cu) +target_link_directories(reduction PRIVATE /opt/libcutensor/lib/10.2) +target_link_libraries(reduction cutensor) + +find_library(CUDART_LIBRARY cudart ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) +find_library(NVJPEG_LIBRARY nvjpeg ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) +find_library(NPPC_LIBRARY nppc ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) +find_library(NPPIG_LIBRARY nppig ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) +find_library(CULIBOS culibos ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) + +add_executable(image_resize imageResize.cpp) +target_link_libraries(image_resize ${NVJPEG_LIBRARY} ${NPPC_LIBRARY} ${NPPIG_LIBRARY} ${CUDART_LIBRARY}) +set_property(TARGET reduction contraction bandwidthTest simpleVulkan simpleGL + cudaTensorCoreGemm simpleMPI asyncAPI PROPERTY CUDA_ARCHITECTURES 75) diff --git a/contraction.cu b/contraction.cu new file mode 100644 index 0000000..1024062 --- /dev/null +++ b/contraction.cu @@ -0,0 +1,310 @@ +// +// Created by kwoodle on 6/12/20. +// + +#include +#include + +#include +#include + +#include +#include + +#define HANDLE_ERROR(x) \ +{ const auto err = x; \ + if( err != CUTENSOR_STATUS_SUCCESS ) \ + { printf("Error: %s\n", cutensorGetErrorString(err)); return err; } \ +}; + +#define HANDLE_CUDA_ERROR(x) \ +{ const auto err = x; \ + if( err != cudaSuccess ) \ + { printf("Error: %s\n", cudaGetErrorString(err)); return err; } \ +}; + +struct GPUTimer +{ + GPUTimer() + { + cudaEventCreate(&start_); + cudaEventCreate(&stop_); + cudaEventRecord(start_, 0); + } + + ~GPUTimer() + { + cudaEventDestroy(start_); + cudaEventDestroy(stop_); + } + + void start() + { + cudaEventRecord(start_, 0); + } + + float seconds() + { + cudaEventRecord(stop_, 0); + cudaEventSynchronize(stop_); + float time; + cudaEventElapsedTime(&time, start_, stop_); + return time * 1e-3; + } +private: + cudaEvent_t start_, stop_; +}; + +int main(int argc, char** argv) +{ + typedef float floatTypeA; + typedef float floatTypeB; + typedef float floatTypeC; + typedef float floatTypeCompute; + + cudaDataType_t typeA = CUDA_R_32F; + cudaDataType_t typeB = CUDA_R_32F; + cudaDataType_t typeC = CUDA_R_32F; + cutensorComputeType_t typeCompute = CUTENSOR_R_MIN_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.1f; + floatTypeCompute beta = (floatTypeCompute)0.f; + + /********************** + * Computing: C_{m,u,n,v} = alpha * A_{m,h,k,n} B_{u,k,v,h} + beta * C_{m,u,n,v} + **********************/ + + std::vector modeC{'m','u','n','v'}; + std::vector modeA{'m','h','k','n'}; + std::vector modeB{'u','k','v','h'}; + int nmodeA = modeA.size(); + int nmodeB = modeB.size(); + int nmodeC = modeC.size(); + + std::unordered_map extent; + extent['m'] = 96; + extent['n'] = 96; + extent['u'] = 96; + extent['v'] = 64; + extent['h'] = 64; + extent['k'] = 64; + + double gflops = (2.0 * extent['m'] * extent['n'] * extent['u'] * extent['v'] * extent['k'] * extent['h']) /1e9; + + std::vector extentC; + for (auto mode : modeC) + extentC.push_back(extent[mode]); + std::vector extentA; + for (auto mode : modeA) + extentA.push_back(extent[mode]); + std::vector extentB; + for (auto mode : modeB) + extentB.push_back(extent[mode]); + + /********************** + * Allocating data + **********************/ + + size_t elementsA = 1; + for (auto mode : modeA) + elementsA *= extent[mode]; + size_t elementsB = 1; + for (auto mode : modeB) + elementsB *= extent[mode]; + size_t elementsC = 1; + for (auto mode : modeC) + elementsC *= extent[mode]; + + size_t sizeA = sizeof(floatTypeA) * elementsA; + size_t sizeB = sizeof(floatTypeB) * elementsB; + size_t sizeC = sizeof(floatTypeC) * elementsC; + printf("Total memory: %.2f GiB\n", (sizeA + sizeB + sizeC)/1024./1024./1024); + + void *A_d, *B_d, *C_d; + HANDLE_CUDA_ERROR(cudaMalloc((void**) &A_d, sizeA)); + HANDLE_CUDA_ERROR(cudaMalloc((void**) &B_d, sizeB)); + HANDLE_CUDA_ERROR(cudaMalloc((void**) &C_d, sizeC)); + + floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA); + floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB); + floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC); + + if (A == NULL || B == NULL || C == NULL) + { + printf("Error: Host allocation of A or C.\n"); + return -1; + } + + /******************* + * Initialize data + *******************/ + + for (int64_t i = 0; i < elementsA; i++) + A[i] = (((float) rand())/RAND_MAX - 0.5)*100; + for (int64_t i = 0; i < elementsB; i++) + B[i] = (((float) rand())/RAND_MAX - 0.5)*100; + for (int64_t i = 0; i < elementsC; i++) + C[i] = (((float) rand())/RAND_MAX - 0.5)*100; + + HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice)); + + /************************* + * cuTENSOR + *************************/ + + cutensorHandle_t handle; + HANDLE_ERROR(cutensorInit(&handle)); + + /********************** + * Create Tensor Descriptors + **********************/ + + cutensorTensorDescriptor_t descA; + HANDLE_ERROR(cutensorInitTensorDescriptor(&handle, + &descA, + nmodeA, + extentA.data(), + NULL,/*stride*/ + typeA, CUTENSOR_OP_IDENTITY)); + + cutensorTensorDescriptor_t descB; + HANDLE_ERROR(cutensorInitTensorDescriptor(&handle, + &descB, + nmodeB, + extentB.data(), + NULL,/*stride*/ + typeB, CUTENSOR_OP_IDENTITY)); + + cutensorTensorDescriptor_t descC; + HANDLE_ERROR(cutensorInitTensorDescriptor( &handle, + &descC, + nmodeC, + extentC.data(), + NULL,/*stride*/ + typeC, CUTENSOR_OP_IDENTITY)); + + /********************************************** + * Retrieve the memory alignment for each tensor + **********************************************/ + + uint32_t alignmentRequirementA; + HANDLE_ERROR(cutensorGetAlignmentRequirement(&handle, + A_d, + &descA, + &alignmentRequirementA)); + + uint32_t alignmentRequirementB; + HANDLE_ERROR(cutensorGetAlignmentRequirement(&handle, + B_d, + &descB, + &alignmentRequirementB)); + + uint32_t alignmentRequirementC; + HANDLE_ERROR(cutensorGetAlignmentRequirement(&handle, + C_d, + &descC, + &alignmentRequirementC)); + + /******************************* + * Create Contraction Descriptor + *******************************/ + + cutensorContractionDescriptor_t desc; + HANDLE_ERROR(cutensorInitContractionDescriptor(&handle, + &desc, + &descA, modeA.data(), alignmentRequirementA, + &descB, modeB.data(), alignmentRequirementB, + &descC, modeC.data(), alignmentRequirementC, + &descC, modeC.data(), alignmentRequirementC, + typeCompute)); + + /************************** + * Set the algorithm to use + ***************************/ + + cutensorContractionFind_t find; + HANDLE_ERROR(cutensorInitContractionFind( + &handle, &find, + CUTENSOR_ALGO_DEFAULT)); + + /********************** + * Query workspace + **********************/ + + uint64_t worksize = 0; + HANDLE_ERROR(cutensorContractionGetWorkspace(&handle, + &desc, + &find, + CUTENSOR_WORKSPACE_RECOMMENDED, &worksize)); + + void *work = nullptr; + if (worksize > 0) + { + if (cudaSuccess != cudaMalloc(&work, worksize)) + { + work = nullptr; + worksize = 0; + } + } + + /************************** + * Create Contraction Plan + **************************/ + + cutensorContractionPlan_t plan; + HANDLE_ERROR(cutensorInitContractionPlan(&handle, + &plan, + &desc, + &find, + worksize)); + + /********************** + * Run + **********************/ + + double minTimeCUTENSOR = 1e100; + cutensorStatus_t err; + for (int i=0; i < 3; ++i) + { + cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + + // Set up timing + GPUTimer timer; + timer.start(); + + err = cutensorContraction(&handle, + &plan, + (void*) &alpha, A_d, B_d, + (void*) &beta, C_d, C_d, + work, worksize, 0 /* stream */); + + // Synchronize and measure timing + auto time = timer.seconds(); + + if (err != CUTENSOR_STATUS_SUCCESS) + { + printf("ERROR: %s in line %d\n", cutensorGetErrorString(err), __LINE__); + } + minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time; + } + + /*************************/ + + double transferedBytes = sizeC + sizeA + sizeB; + transferedBytes += ((float) beta != 0.f) ? sizeC : 0; + transferedBytes /= 1e9; + printf("cuTensor: %.2f GFLOPs/s %.2f GB/s\n", gflops / minTimeCUTENSOR, transferedBytes/ minTimeCUTENSOR); + + if (A) free(A); + if (B) free(B); + if (C) free(C); + if (A_d) cudaFree(A_d); + if (B_d) cudaFree(B_d); + if (C_d) cudaFree(C_d); + if (work) cudaFree(work); + + return 0; +} diff --git a/imageResize.cpp b/imageResize.cpp new file mode 100644 index 0000000..a5052fc --- /dev/null +++ b/imageResize.cpp @@ -0,0 +1,405 @@ +// +// Created by kwoodle on 6/12/20. +// + +#include "imageResize.h" + +//#define OPTIMIZED_HUFFMAN +//#define CUDA10U2 + +// ***************************************************************************** +// nvJPEG handles and parameters +// ----------------------------------------------------------------------------- +nvjpegBackend_t impl = NVJPEG_BACKEND_GPU_HYBRID; //NVJPEG_BACKEND_DEFAULT; +nvjpegHandle_t nvjpeg_handle; +nvjpegJpegStream_t nvjpeg_jpeg_stream; +nvjpegDecodeParams_t nvjpeg_decode_params; +nvjpegJpegState_t nvjpeg_decoder_state; +nvjpegEncoderParams_t nvjpeg_encode_params; +nvjpegEncoderState_t nvjpeg_encoder_state; + +#ifdef CUDA10U2 // This part needs CUDA 10.1 Update 2 for copy the metadata other information from base image. +nvjpegJpegEncoding_t nvjpeg_encoding; +#endif + + +// ***************************************************************************** +// Decode, Resize and Encoder function +// ----------------------------------------------------------------------------- +int decodeResizeEncodeOneImage(std::string sImagePath, std::string sOutputPath, double &time, int resizeWidth, int resizeHeight, int resize_quality) +{ + // Decode, Encoder format + nvjpegOutputFormat_t oformat = NVJPEG_OUTPUT_BGR; + nvjpegInputFormat_t iformat = NVJPEG_INPUT_BGR; + + // timing for resize + time = 0.; + float resize_time = 0.; + cudaEvent_t start, stop; + CHECK_CUDA(cudaEventCreate(&start)); + CHECK_CUDA(cudaEventCreate(&stop)); + + // Image reading section + // Get the file name, without extension. + // This will be used to rename the output file. + size_t position = sImagePath.rfind("/"); + std::string sFileName = (std::string::npos == position)? sImagePath : sImagePath.substr(position + 1, sImagePath.size()); + position = sFileName.rfind("."); + sFileName = (std::string::npos == position)? sFileName : sFileName.substr(0, position); + +#ifndef _WIN64 + position = sFileName.rfind("/"); + sFileName = (std::string::npos == position) ? sFileName : sFileName.substr(position + 1, sFileName.length()); +#else + position = sFileName.rfind("\\"); + sFileName = (std::string::npos == position) ? sFileName : sFileName.substr(position+1, sFileName.length()); +#endif + + // Read an image from disk. + std::ifstream oInputStream(sImagePath.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + if(!(oInputStream.is_open())) + { + std::cerr << "Cannot open image: " << sImagePath << std::endl; + return EXIT_FAILURE; + } + + // Get the size. + std::streamsize nSize = oInputStream.tellg(); + oInputStream.seekg(0, std::ios::beg); + + // Image buffers. + unsigned char * pBuffer = NULL; + unsigned char * pResizeBuffer = NULL; + + std::vector vBuffer(nSize); + if (oInputStream.read(vBuffer.data(), nSize)) + { + unsigned char * dpImage = (unsigned char *)vBuffer.data(); + + // Retrieve the componenet and size info. + int nComponent = 0; + nvjpegChromaSubsampling_t subsampling; + int widths[NVJPEG_MAX_COMPONENT]; + int heights[NVJPEG_MAX_COMPONENT]; + int nReturnCode = 0; + if (NVJPEG_STATUS_SUCCESS != nvjpegGetImageInfo(nvjpeg_handle, dpImage, nSize, &nComponent, &subsampling, widths, heights)) + { + std::cerr << "Error decoding JPEG header: " << sImagePath << std::endl; + return EXIT_FAILURE; + } + + if(resizeWidth == 0 || resizeHeight == 0) + { + resizeWidth = widths[0]/2; + resizeHeight = heights[0]/2; + } + + // image resize + size_t pitchDesc, pitchResize; + NppiSize srcSize = { (int)widths[0], (int)heights[0] }; + NppiRect srcRoi = { 0, 0, srcSize.width, srcSize.height }; + NppiSize dstSize = { (int)resizeWidth, (int)resizeHeight }; + NppiRect dstRoi = { 0, 0, dstSize.width, dstSize.height }; + NppStatus st; + NppStreamContext nppStreamCtx; + nppStreamCtx.hStream = NULL; // default stream + + // device image buffers. + nvjpegImage_t imgDesc; + nvjpegImage_t imgResize; + + if (is_interleaved(oformat)) + { + pitchDesc = NVJPEG_MAX_COMPONENT * widths[0]; + pitchResize = NVJPEG_MAX_COMPONENT * resizeWidth; + } + else + { + pitchDesc = 3 * widths[0]; + pitchResize = 3 * resizeWidth; + } + + cudaError_t eCopy = cudaMalloc(&pBuffer, pitchDesc * heights[0]); + if (cudaSuccess != eCopy) + { + std::cerr << "cudaMalloc failed : " << cudaGetErrorString(eCopy) << std::endl; + return EXIT_FAILURE; + } + cudaError_t eCopy1 = cudaMalloc(&pResizeBuffer, pitchResize * resizeHeight); + if (cudaSuccess != eCopy1) + { + std::cerr << "cudaMalloc failed : " << cudaGetErrorString(eCopy) << std::endl; + return EXIT_FAILURE; + } + + + imgDesc.channel[0] = pBuffer; + imgDesc.channel[1] = pBuffer + widths[0] * heights[0]; + imgDesc.channel[2] = pBuffer + widths[0] * heights[0] * 2; + imgDesc.pitch[0] = (unsigned int)(is_interleaved(oformat) ? widths[0] * NVJPEG_MAX_COMPONENT : widths[0]); + imgDesc.pitch[1] = (unsigned int)widths[0]; + imgDesc.pitch[2] = (unsigned int)widths[0]; + + imgResize.channel[0] = pResizeBuffer; + imgResize.channel[1] = pResizeBuffer + resizeWidth * resizeHeight; + imgResize.channel[2] = pResizeBuffer + resizeWidth * resizeHeight * 2; + imgResize.pitch[0] = (unsigned int)(is_interleaved(oformat) ? resizeWidth * NVJPEG_MAX_COMPONENT : resizeWidth);; + imgResize.pitch[1] = (unsigned int)resizeWidth; + imgResize.pitch[2] = (unsigned int)resizeWidth; + + if (is_interleaved(oformat)) + { + imgDesc.channel[3] = pBuffer + widths[0] * heights[0] * 3; + imgDesc.pitch[3] = (unsigned int)widths[0]; + imgResize.channel[3] = pResizeBuffer + resizeWidth * resizeHeight * 3; + imgResize.pitch[3] = (unsigned int)resizeWidth; + } + + // nvJPEG encoder parameter setting + CHECK_NVJPEG(nvjpegEncoderParamsSetQuality(nvjpeg_encode_params, resize_quality, NULL)); + +#ifdef OPTIMIZED_HUFFMAN // Optimized Huffman + CHECK_NVJPEG(nvjpegEncoderParamsSetOptimizedHuffman(nvjpeg_encode_params, 1, NULL)); +#endif + CHECK_NVJPEG(nvjpegEncoderParamsSetSamplingFactors(nvjpeg_encode_params, subsampling, NULL)); + + + // Timing start + CHECK_CUDA(cudaEventRecord(start, 0)); + +#ifdef CUDA10U2 // This part needs CUDA 10.1 Update 2 + //parse image save metadata in jpegStream structure + CHECK_NVJPEG(nvjpegJpegStreamParse(nvjpeg_handle, dpImage, nSize, 1, 0, nvjpeg_jpeg_stream)); +#endif + + // decode by stages + nReturnCode = nvjpegDecode(nvjpeg_handle, nvjpeg_decoder_state, dpImage, nSize, oformat, &imgDesc, NULL); + if(nReturnCode != 0) + { + std::cerr << "Error in nvjpegDecode." << nReturnCode << std::endl; + return EXIT_FAILURE; + } + + // image resize + /* Note: this is the simplest resizing function from NPP. */ + if (is_interleaved(oformat)) + { + st = nppiResize_8u_C3R_Ctx(imgDesc.channel[0], imgDesc.pitch[0], srcSize, srcRoi, + imgResize.channel[0], imgResize.pitch[0], dstSize, dstRoi, NPPI_INTER_LANCZOS, nppStreamCtx); + } + else + { + st = nppiResize_8u_C1R_Ctx(imgDesc.channel[0], imgDesc.pitch[0], srcSize, srcRoi, + imgResize.channel[0], imgResize.pitch[0], dstSize, dstRoi, NPPI_INTER_LANCZOS, nppStreamCtx); + st = nppiResize_8u_C1R_Ctx(imgDesc.channel[1], imgDesc.pitch[1], srcSize, srcRoi, + imgResize.channel[1], imgResize.pitch[1], dstSize, dstRoi, NPPI_INTER_LANCZOS, nppStreamCtx); + st = nppiResize_8u_C1R_Ctx(imgDesc.channel[2], imgDesc.pitch[2], srcSize, srcRoi, + imgResize.channel[2], imgResize.pitch[2], dstSize, dstRoi, NPPI_INTER_LANCZOS, nppStreamCtx); + } + + if (st != NPP_SUCCESS) + { + std::cerr << "NPP resize failed : " << st << std::endl; + return EXIT_FAILURE; + } + + // get encoding from the jpeg stream and copy it to the encode parameters +#ifdef CUDA10U2 // This part needs CUDA 10.1 Update 2 for copy the metadata other information from base image. + CHECK_NVJPEG(nvjpegJpegStreamGetJpegEncoding(nvjpeg_jpeg_stream, &nvjpeg_encoding)); + CHECK_NVJPEG(nvjpegEncoderParamsSetEncoding(nvjpeg_encode_params, nvjpeg_encoding, NULL)); + CHECK_NVJPEG(nvjpegEncoderParamsCopyQuantizationTables(nvjpeg_encode_params, nvjpeg_jpeg_stream, NULL)); + CHECK_NVJPEG(nvjpegEncoderParamsCopyHuffmanTables(nvjpeg_encoder_state, nvjpeg_encode_params, nvjpeg_jpeg_stream, NULL)); + CHECK_NVJPEG(nvjpegEncoderParamsCopyMetadata(nvjpeg_encoder_state, nvjpeg_encode_params, nvjpeg_jpeg_stream, NULL)); +#endif + + // encoding the resize data + CHECK_NVJPEG(nvjpegEncodeImage(nvjpeg_handle, + nvjpeg_encoder_state, + nvjpeg_encode_params, + &imgResize, + iformat, + dstSize.width, + dstSize.height, + NULL)); + + // retrive the encoded bitstream for file writing + std::vector obuffer; + size_t length; + CHECK_NVJPEG(nvjpegEncodeRetrieveBitstream( + nvjpeg_handle, + nvjpeg_encoder_state, + NULL, + &length, + NULL)); + + obuffer.resize(length); + + CHECK_NVJPEG(nvjpegEncodeRetrieveBitstream( + nvjpeg_handle, + nvjpeg_encoder_state, + obuffer.data(), + &length, + NULL)); + + // Timing stop + CHECK_CUDA(cudaEventRecord(stop, 0)); + CHECK_CUDA(cudaEventSynchronize(stop)); + + // file writing + std::cout << "Resize-width: " << dstSize.width << " Resize-height: " << dstSize.height << std::endl; + std::string output_filename = sOutputPath + "/" + sFileName + ".jpg"; + char directory[120]; + char mkdir_cmd[256]; + std::string folder = sOutputPath; + output_filename = folder + "/"+ sFileName +".jpg"; +#if !defined(_WIN32) + sprintf(directory, "%s", folder.c_str()); + sprintf(mkdir_cmd, "mkdir -p %s 2> /dev/null", directory); +#else + sprintf(directory, "%s", folder.c_str()); + sprintf(mkdir_cmd, "mkdir %s 2> nul", directory); +#endif + + int ret = system(mkdir_cmd); + + std::cout << "Writing JPEG file: " << output_filename << std::endl; + std::ofstream outputFile(output_filename.c_str(), std::ios::out | std::ios::binary); + outputFile.write(reinterpret_cast(obuffer.data()), static_cast(length)); + } + // Free memory + CHECK_CUDA(cudaFree(pBuffer)); + CHECK_CUDA(cudaFree(pResizeBuffer)); + + // get timing + CHECK_CUDA(cudaEventElapsedTime(&resize_time, start, stop)); + time = (double)resize_time; + + return EXIT_SUCCESS; +} + +// ***************************************************************************** +// parsing the arguments function +// ----------------------------------------------------------------------------- +int processArgs(image_resize_params_t param) +{ + std::string sInputPath(param.input_dir); + std::string sOutputPath(param.output_dir); + int resizeWidth = param.width; + int resizeHeight = param.height; + int resize_quality = param.quality; + + int error_code = 1; + + double total_time = 0., decode_time = 0.; + int total_images = 0; + + std::vector inputFiles; + if (readInput(sInputPath, inputFiles)) + { + return error_code; + } + for (unsigned int i = 0; i < inputFiles.size(); i++) + { + std::string &sFileName = inputFiles[i]; + std::cout << "Processing file: " << sFileName << std::endl; + + int image_error_code = decodeResizeEncodeOneImage(sFileName, sOutputPath, decode_time, resizeWidth, resizeHeight, resize_quality); + + if (image_error_code) + { + std::cerr << "Error processing file: " << sFileName << std::endl; + return image_error_code; + } + else + { + total_images++; + total_time += decode_time; + } + } + + std::cout << "------------------------------------------------------------- " << std::endl; + std::cout << "Total images resized: " << total_images << std::endl; + std::cout << "Total time spent on resizing: " << total_time << " (ms)" << std::endl; + std::cout << "Avg time/image: " << total_time/total_images << " (ms)" << std::endl; + std::cout << "------------------------------------------------------------- " << std::endl; + return EXIT_SUCCESS; +} + +// ***************************************************************************** +// main image resize function +// ----------------------------------------------------------------------------- +int main(int argc, const char *argv[]) +{ + int pidx; + + if ((pidx = findParamIndex(argv, argc, "-h")) != -1 || + (pidx = findParamIndex(argv, argc, "--help")) != -1) { + std::cout << "Usage: " << argv[0] + << " -i images-dir [-o output-dir]" + "[-q jpeg-quality][-rw resize-width ] [-rh resize-height]\n"; + std::cout << "Parameters: " << std::endl; + std::cout << "\timages-dir\t:\tPath to single image or directory of images" << std::endl; + std::cout << "\toutput-dir\t:\tWrite resized images to this directory [default resize_output]" << std::endl; + std::cout << "\tJPEG Quality\t:\tUse image quality [default 85]" << std::endl; + std::cout << "\tResize Width\t:\t Resize width [default original_img_width/2]" << std::endl; + std::cout << "\tResize Height\t:\t Resize height [default original_img_height/2]" << std::endl; + return EXIT_SUCCESS; + } + + image_resize_params_t params; + + params.input_dir = "./"; + if ((pidx = findParamIndex(argv, argc, "-i")) != -1) { + params.input_dir = argv[pidx + 1]; + } else { + // Search in default paths for input images. + int found = getInputDir(params.input_dir, argv[0]); + if (!found) + { + std::cout << "Please specify input directory for image resizing"<< std::endl; + return EXIT_FAILURE; + } + } + if ((pidx = findParamIndex(argv, argc, "-o")) != -1) { + params.output_dir = argv[pidx + 1]; + } else { + // by-default write the folder named "output" in cwd + params.output_dir = "resize_output"; + } + + params.quality = 85; + if ((pidx = findParamIndex(argv, argc, "-q")) != -1) { + params.quality = std::atoi(argv[pidx + 1]); + } + + params.width = 0; + if ((pidx = findParamIndex(argv, argc, "-rw")) != -1) { + params.width = std::atoi(argv[pidx + 1]); + } + + params.height = 0; + if ((pidx = findParamIndex(argv, argc, "-rh")) != -1) { + params.height = std::atoi(argv[pidx + 1]); + } + + nvjpegDevAllocator_t dev_allocator = {&dev_malloc, &dev_free}; + CHECK_NVJPEG(nvjpegCreate(impl, &dev_allocator, &nvjpeg_handle)); + CHECK_NVJPEG(nvjpegJpegStateCreate(nvjpeg_handle, &nvjpeg_decoder_state)); + + // create bitstream object + CHECK_NVJPEG(nvjpegJpegStreamCreate(nvjpeg_handle, &nvjpeg_jpeg_stream)); + CHECK_NVJPEG(nvjpegDecodeParamsCreate(nvjpeg_handle, &nvjpeg_decode_params)); + CHECK_NVJPEG(nvjpegEncoderStateCreate(nvjpeg_handle, &nvjpeg_encoder_state, NULL)); + CHECK_NVJPEG(nvjpegEncoderParamsCreate(nvjpeg_handle, &nvjpeg_encode_params, NULL)); + + pidx = processArgs(params); + + CHECK_NVJPEG(nvjpegEncoderParamsDestroy(nvjpeg_encode_params)); + CHECK_NVJPEG(nvjpegDecodeParamsDestroy(nvjpeg_decode_params)); + CHECK_NVJPEG(nvjpegEncoderStateDestroy(nvjpeg_encoder_state)); + CHECK_NVJPEG(nvjpegJpegStateDestroy(nvjpeg_decoder_state)); + CHECK_NVJPEG(nvjpegDestroy(nvjpeg_handle)); + + return pidx; +} + diff --git a/imageResize.h b/imageResize.h new file mode 100644 index 0000000..6249c83 --- /dev/null +++ b/imageResize.h @@ -0,0 +1,234 @@ +// +// Created by kwoodle on 6/12/20. +// + + +#include +#include +#include +#include +#include +#include + +#include // strcmpi +#ifndef _WIN64 +#include // timings +#include +#endif +#include +#include +#include + + +#include +#include +#include + + +#define CHECK_CUDA(call) \ + { \ + cudaError_t _e = (call); \ + if (_e != cudaSuccess) \ + { \ + std::cout << "CUDA Runtime failure: '#" << _e << "' at " << __FILE__ << ":" << __LINE__ << std::endl;\ + exit(1); \ + } \ + } + +#define CHECK_NVJPEG(call) \ + { \ + nvjpegStatus_t _e = (call); \ + if (_e != NVJPEG_STATUS_SUCCESS) \ + { \ + std::cout << "NVJPEG failure: '#" << _e << "' at " << __FILE__ << ":" << __LINE__ << std::endl;\ + exit(1); \ + } \ + } + +struct image_resize_params_t { + std::string input_dir; + std::string output_dir; + int quality; + int width; + int height; + int dev; +}; + + +typedef struct { + NppiSize size; + nvjpegImage_t data; +} image_t; + + +int dev_malloc(void** p, size_t s) +{ + return (int)cudaMalloc(p, s); +} + +int dev_free(void* p) +{ + return (int)cudaFree(p); +} + +bool is_interleaved(nvjpegOutputFormat_t format) +{ + if (format == NVJPEG_OUTPUT_RGBI || format == NVJPEG_OUTPUT_BGRI) + return true; + else + return false; +} + + +// ***************************************************************************** +// reading input directory to file list +// ----------------------------------------------------------------------------- +int readInput(const std::string &sInputPath, std::vector &filelist) +{ + int error_code = 1; + struct stat s; + + if( stat(sInputPath.c_str(), &s) == 0 ) + { + if( s.st_mode & S_IFREG ) + { + filelist.push_back(sInputPath); + } + else if( s.st_mode & S_IFDIR ) + { + // processing each file in directory + DIR *dir_handle; + struct dirent *dir; + dir_handle = opendir(sInputPath.c_str()); + std::vector filenames; + if (dir_handle) + { + error_code = 0; + while ((dir = readdir(dir_handle)) != NULL) + { + if (dir->d_type == DT_REG) + { + std::string sFileName = sInputPath + dir->d_name; + filelist.push_back(sFileName); + } + else if (dir->d_type == DT_DIR) + { + std::string sname = dir->d_name; + if (sname != "." && sname != "..") + { + readInput(sInputPath + sname + "/", filelist); + } + } + } + closedir(dir_handle); + } + else + { + std::cout << "Cannot open input directory: " << sInputPath << std::endl; + return error_code; + } + } + else + { + std::cout << "Cannot open input: " << sInputPath << std::endl; + return error_code; + } + } + else + { + std::cout << "Cannot find input path " << sInputPath << std::endl; + return error_code; + } + + return 0; +} + +// ***************************************************************************** +// check for inputDirExists +// ----------------------------------------------------------------------------- +int inputDirExists(const char *pathname) { + struct stat info; + if (stat(pathname, &info) != 0) { + return 0; // Directory does not exists + } else if (info.st_mode & S_IFDIR) { + // is a directory + return 1; + } else { + // is not a directory + return 0; + } +} + +// ***************************************************************************** +// check for getInputDir +// ----------------------------------------------------------------------------- +int getInputDir(std::string &input_dir, const char *executable_path) { + int found = 0; + if (executable_path != 0) { + std::string executable_name = std::string(executable_path); +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // Windows path delimiter + size_t delimiter_pos = executable_name.find_last_of('\\'); + executable_name.erase(0, delimiter_pos + 1); + + if (executable_name.rfind(".exe") != std::string::npos) { + // we strip .exe, only if the .exe is found + executable_name.resize(executable_name.size() - 4); + } +#else + // Linux & OSX path delimiter + size_t delimiter_pos = executable_name.find_last_of('/'); + executable_name.erase(0, delimiter_pos + 1); +#endif + + // Search in default paths for input images. + std::string pathname = ""; + const char *searchPath[] = { + "./images"}; + + for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) { + std::string pathname(searchPath[i]); + size_t executable_name_pos = pathname.find(""); + + // If there is executable_name variable in the searchPath + // replace it with the value + if (executable_name_pos != std::string::npos) { + pathname.replace(executable_name_pos, strlen(""), + executable_name); + } + + if (inputDirExists(pathname.c_str())) { + input_dir = pathname + "/"; + found = 1; + break; + } + } + } + return found; +} + +// ***************************************************************************** +// parse parameters +// ----------------------------------------------------------------------------- +int findParamIndex(const char **argv, int argc, const char *parm) { + int count = 0; + int index = -1; + + for (int i = 0; i < argc; i++) { + if (strncmp(argv[i], parm, 100) == 0) { + index = i; + count++; + } + } + + if (count == 0 || count == 1) { + return index; + } else { + std::cout << "Error, parameter " << parm + << " has been specified more than once, exiting\n" + << std::endl; + return -1; + } + + return -1; +} diff --git a/reduction.cu b/reduction.cu new file mode 100644 index 0000000..da4f144 --- /dev/null +++ b/reduction.cu @@ -0,0 +1,229 @@ +// +// Created by kwoodle on 6/12/20. +// + + +#include +#include + +#include +#include + +#include +#include + +#define HANDLE_ERROR(x) \ +{ const auto err = x; \ + if( err != CUTENSOR_STATUS_SUCCESS ) \ + { printf("Error: %s\n", cutensorGetErrorString(err)); return err; } \ +}; + +#define HANDLE_CUDA_ERROR(x) \ +{ const auto err = x; \ + if( err != cudaSuccess ) \ + { printf("Error: %s\n", cudaGetErrorString(err)); return err; } \ +}; + +struct GPUTimer +{ + GPUTimer() + { + cudaEventCreate(&start_); + cudaEventCreate(&stop_); + cudaEventRecord(start_, 0); + } + + ~GPUTimer() + { + cudaEventDestroy(start_); + cudaEventDestroy(stop_); + } + + void start() + { + cudaEventRecord(start_, 0); + } + + float seconds() + { + cudaEventRecord(stop_, 0); + cudaEventSynchronize(stop_); + float time; + cudaEventElapsedTime(&time, start_, stop_); + return time * 1e-3; + } +private: + cudaEvent_t start_, stop_; +}; + +int main(int argc, char** argv) +{ + typedef float floatTypeA; + typedef float floatTypeB; + typedef float floatTypeC; + typedef float floatTypeCompute; + + cudaDataType_t typeA = CUDA_R_32F; + cudaDataType_t typeC = CUDA_R_32F; + cutensorComputeType_t typeCompute = CUTENSOR_R_MIN_32F; + + floatTypeCompute alpha = (floatTypeCompute)1.1f; + floatTypeCompute beta = (floatTypeCompute)0.f; + + /********************** + * Computing (partial) reduction : C_{m,v} = alpha * A_{m,h,k,v} + beta * C_{m,v} + *********************/ + + std::vector modeA{'m','h','k','v'}; + std::vector modeC{'m','v'}; + int32_t nmodeA = modeA.size(); + int32_t nmodeC = modeC.size(); + + std::unordered_map extent; + extent['m'] = 196; + extent['v'] = 64; + extent['h'] = 256; + extent['k'] = 64; + + std::vector extentC; + for (auto mode : modeC) + extentC.push_back(extent[mode]); + std::vector extentA; + for (auto mode : modeA) + extentA.push_back(extent[mode]); + + /********************** + * Allocating data + *********************/ + + size_t elementsA = 1; + for (auto mode : modeA) + elementsA *= extent[mode]; + size_t elementsC = 1; + for (auto mode : modeC) + elementsC *= extent[mode]; + + size_t sizeA = sizeof(floatTypeA) * elementsA; + size_t sizeC = sizeof(floatTypeC) * elementsC; + printf("Total memory: %.2f GiB\n",(sizeA + sizeC)/1024./1024./1024); + + void *A_d, *C_d; + HANDLE_CUDA_ERROR(cudaMalloc((void**)&A_d, sizeA)); + HANDLE_CUDA_ERROR(cudaMalloc((void**)&C_d, sizeC)); + + floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA); + floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC); + + if (A == NULL || C == NULL) + { + printf("Error: Host allocation of A, B, or C.\n"); + return -1; + } + + /******************* + * Initialize data + *******************/ + + for (int64_t i = 0; i < elementsA; i++) + A[i] = (((float) rand())/RAND_MAX - 0.5)*100; + for (int64_t i = 0; i < elementsC; i++) + C[i] = (((float) rand())/RAND_MAX - 0.5)*100; + + HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice)); + + /************************* + * cuTENSOR + *************************/ + + cutensorHandle_t handle; + HANDLE_ERROR(cutensorInit(&handle)); + + /********************** + * Create Tensor Descriptors + **********************/ + + cutensorTensorDescriptor_t descA; + HANDLE_ERROR(cutensorInitTensorDescriptor(&handle, + &descA, + nmodeA, + extentA.data(), + NULL /* stride */, + typeA, CUTENSOR_OP_IDENTITY)); + + cutensorTensorDescriptor_t descC; + HANDLE_ERROR(cutensorInitTensorDescriptor(&handle, + &descC, + nmodeC, + extentC.data(), + NULL /* stride */, + typeC, CUTENSOR_OP_IDENTITY)); + + const cutensorOperator_t opReduce = CUTENSOR_OP_ADD; + + /********************** + * Querry workspace + **********************/ + + uint64_t worksize = 0; + HANDLE_ERROR(cutensorReductionGetWorkspace(&handle, + A_d, &descA, modeA.data(), + C_d, &descC, modeC.data(), + C_d, &descC, modeC.data(), + opReduce, typeCompute, &worksize)); + void *work = nullptr; + if (worksize > 0) + { + if (cudaSuccess != cudaMalloc(&work, worksize)) + { + work = nullptr; + worksize = 0; + } + } + + /********************** + * Run + **********************/ + + double minTimeCUTENSOR = 1e100; + cutensorStatus_t err; + for(int i=0; i < 3; ++i) + { + HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice)); + HANDLE_CUDA_ERROR(cudaDeviceSynchronize()); + + // Set up timing + GPUTimer timer; + timer.start(); + + err = cutensorReduction(&handle, + (const void*)&alpha, A_d, &descA, modeA.data(), + (const void*)&beta, C_d, &descC, modeC.data(), + C_d, &descC, modeC.data(), + opReduce, typeCompute, work, worksize, 0 /* stream */); + + // Synchronize and measure timing + auto time = timer.seconds(); + + if (err != CUTENSOR_STATUS_SUCCESS) + { + printf("ERROR: %s in line %d\n", cutensorGetErrorString(err), __LINE__); + } + minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time; + } + + /*************************/ + + double transferedBytes = sizeC + sizeA; + transferedBytes += ((float) beta != 0.f) ? sizeC : 0; + transferedBytes /= 1e9; + printf("cuTensor: %.2f GB/s\n", transferedBytes / minTimeCUTENSOR); + + if (A) free(A); + if (C) free(C); + if (A_d) cudaFree(A_d); + if (C_d) cudaFree(C_d); + if (work) cudaFree(work); + + return 0; +}