Skip to content

Commit

Permalink
Partial CUDA11 Support (#696)
Browse files Browse the repository at this point in the history
* Partial CUDA11 Support
* Add an extra abort
* Remove unnecessary CUDA11 defines
  • Loading branch information
XapaJIaMnu authored Aug 27, 2020
1 parent e9d7de4 commit 96738fe
Showing 1 changed file with 48 additions and 57 deletions.
105 changes: 48 additions & 57 deletions src/tensors/gpu/prod.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,63 +12,6 @@
#include "tensors/gpu/cuda_helpers.h"
// clang-format on

// recreations of a few cusparse functions that were deprecated in CUDA 11
// @TODO: Fill these in. This is not trivial. Until then, using these with CUDA 11 will fail.
#if CUDA_VERSION >= 11000
cusparseStatus_t
cusparseSgemmi10(cusparseHandle_t handle,
int m,
int n,
int k,
int nnz,
const float* alpha,
const float* A,
int lda,
const float* cscValB,
const int* cscColPtrB,
const int* cscRowIndB,
const float* beta,
float* C,
int ldc) {
ABORT("Sparse matrix operations are currently not supported by Marian under CUDA 11");
}
#define cusparseSgemmi cusparseSgemmi10
cusparseStatus_t
cusparseScsr2csc(cusparseHandle_t handle,
int m,
int n,
int nnz,
const float* csrVal,
const int* csrRowPtr,
const int* csrColInd,
float* cscVal,
int* cscRowInd,
int* cscColPtr,
cusparseAction_t copyValues,
cusparseIndexBase_t idxBase) {
ABORT("Sparse matrix operations are currently not supported by Marian under CUDA 11");
}
cusparseStatus_t
cusparseScsrmm(cusparseHandle_t handle,
cusparseOperation_t transA,
int m,
int n,
int k,
int nnz,
const float* alpha,
const cusparseMatDescr_t descrA,
const float* csrValA,
const int* csrRowPtrA,
const int* csrColIndA,
const float* B,
int ldb,
const float* beta,
float* C,
int ldc) {
ABORT("Sparse matrix operations are currently not supported by Marian under CUDA 11");
}
#endif

namespace marian {

namespace gpu {
Expand Down Expand Up @@ -420,6 +363,9 @@ static cusparseSgemmiEx(cusparseHandle_t handle, int m,
const float *cscValB, const int *cscColPtrB, const int *cscRowIndB, const float *beta,
float *C, int ldc)
{
#if CUDA_VERSION >= 11000
ABORT("cusparseSgemmi is not available in CUDA VERSION >= 11.");
#else
const int nMax = 65535; // max. number of columns allowed by cuSparse 10 implementation
for (int j0 = 0; j0 < n; j0 += 65535) { // loop over column slices, j0 = index of first column
// Call original function on a column slice.
Expand All @@ -432,6 +378,7 @@ static cusparseSgemmiEx(cusparseHandle_t handle, int m,
if (rc != CUSPARSE_STATUS_SUCCESS)
return rc;
}
#endif
return CUSPARSE_STATUS_SUCCESS;
}

Expand Down Expand Up @@ -483,6 +430,45 @@ void CSRProd(marian::Tensor C,
St_indices = allocator->alloc<int>(numValues);
St_offsets = allocator->alloc<int>(colsS + 1);
// transpose the second argument
#if CUDA_VERSION >= 11000
size_t buffer_size;
CUSPARSE_CHECK(cusparseCsr2cscEx2_bufferSize(cusparseHandle,
/*m=*/ rowsS, // number of rows of matrix
/*n=*/ colsS, // number of columns of matrix
/*nnz=*/ (int)numValues,
/*csrcVal=*/ S_values ->data<float>(),
/*csrcRowPtr=*/ (int*)S_offsets->data<IndexType>(),
/*csrcColInd=*/ (int*)S_indices->data<IndexType>(),
/*cscVal=*/ St_values ->data<float>(), // transposed version goes here
/*cscColPtr=*/ St_offsets->data<int>(),
/*cscRowInd=*/ St_indices->data<int>(),
/*valType*/ CUDA_R_32F,
/*copyValues=*/ CUSPARSE_ACTION_NUMERIC,
/*idxBase=*/ CUSPARSE_INDEX_BASE_ZERO,
/*alg*/ CUSPARSE_CSR2CSC_ALG1,
/*bufferSize*/ &buffer_size));
MemoryPiece::PtrType buffer= (buffer_size > 0) ? allocator->alloc<uint8_t>(buffer_size) : nullptr;

CUSPARSE_CHECK(cusparseCsr2cscEx2(cusparseHandle,
/*m=*/ rowsS, // number of rows of matrix
/*n=*/ colsS, // number of columns of matrix
/*nnz=*/ (int)numValues,
/*csrcVal=*/ S_values ->data<float>(),
/*csrcRowPtr=*/ (int*)S_offsets->data<IndexType>(),
/*csrcColInd=*/ (int*)S_indices->data<IndexType>(),
/*cscVal=*/ St_values ->data<float>(), // transposed version goes here
/*cscColPtr=*/ St_offsets->data<int>(),
/*cscRowInd=*/ St_indices->data<int>(),
/*valType=*/ CUDA_R_32F,
/*copyValues=*/ CUSPARSE_ACTION_NUMERIC,
/*idxBase=*/ CUSPARSE_INDEX_BASE_ZERO,
/*alg=*/ CUSPARSE_CSR2CSC_ALG1,
/*buffer=*/ buffer->data<uint8_t>()));

if (buffer)
allocator->free(buffer);
ABORT("This code is untested. Please remove this ABORT once tests exist and pass.");
#else
CUSPARSE_CHECK(cusparseScsr2csc(cusparseHandle,
/*m=*/ rowsS, // number of rows of matrix
/*n=*/ colsS, // number of columns of matrix
Expand All @@ -495,12 +481,16 @@ void CSRProd(marian::Tensor C,
/*cscColPtr=*/ St_offsets->data<int>(),
/*copyValues=*/ CUSPARSE_ACTION_NUMERIC,
/*idxBase=*/ CUSPARSE_INDEX_BASE_ZERO));
#endif
std::swap(rowsS, colsS); // these variables now represent the dims of the explicitly transposed object
}
if (swapOperands) {
// C = D x S for row-major matrices
// Implemented via cusparse as C' = S' x D' ("csrmm") where C' and D' are column-major,
// and S' is CSR (if not transS then we make a transposed copy).
#if CUDA_VERSION >= 11000
ABORT("CSRProd is not yet implemented for CUDA VERSION >= 11");
#else
cusparseMatDescr_t descrA;
CUSPARSE_CHECK(cusparseCreateMatDescr(&descrA));
cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
Expand All @@ -521,6 +511,7 @@ void CSRProd(marian::Tensor C,
C->data(),
/*ldc=*/ colsC)); // stride
cusparseDestroyMatDescr(descrA);
#endif
}
else {
// C = S x D for row-major matrices
Expand Down

0 comments on commit 96738fe

Please sign in to comment.