From 4d85f27cb249efc311e79b6eeb55b3768481a5f3 Mon Sep 17 00:00:00 2001
From: Alexis Montoison <35051714+amontoison@users.noreply.github.com>
Date: Thu, 6 Feb 2025 03:38:06 -0600
Subject: [PATCH] [CUSPARSE] Support CuSparseMatrixBSR in the generic mm!
 (#2639)

---
 lib/cusparse/generic.jl               | 45 ++++++++++++++++------
 lib/cusparse/level2.jl                | 11 +-----
 lib/cusparse/level3.jl                | 55 +--------------------------
 lib/cusparse/libcusparse.jl           | 12 +++---
 res/wrap/cusparse.toml                | 10 +++++
 test/libraries/cusparse.jl            |  3 +-
 test/libraries/cusparse/generic.jl    | 48 +++++++++++++----------
 test/libraries/cusparse/interfaces.jl | 24 ++++++++----
 8 files changed, 97 insertions(+), 111 deletions(-)

diff --git a/lib/cusparse/generic.jl b/lib/cusparse/generic.jl
index e944645760..ffe9137f48 100644
--- a/lib/cusparse/generic.jl
+++ b/lib/cusparse/generic.jl
@@ -1,9 +1,27 @@
 # generic APIs
 
 export gather!, scatter!, axpby!, rot!
-export vv!, sv!, sm!, gemv, gemm, gemm!, sddmm!
+export vv!, sv!, sm!, mv!, mm!, gemv, gemm, gemm!, sddmm!
 export bmm!
 
+"""
+    mv!(transa::SparseChar, alpha::Number, A::CuSparseMatrix, X::CuVector, beta::Number, Y::CuVector, index::SparseChar)
+
+Performs `Y = alpha * op(A) * X + beta * Y`, where `op` can be nothing (`transa = N`),
+tranpose (`transa = T`) or conjugate transpose (`transa = C`).
+`X` and `Y` are dense vectors.
+"""
+function mv! end
+
+"""
+    mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparseMatrix, B::CuMatrix, beta::Number, C::CuMatrix, index::SparseChar)
+    mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuMatrix, B::Union{CuSparseMatrixCSC,CuSparseMatrixCSR,CuSparseMatrixCOO}, beta::Number, C::CuMatrix, index::SparseChar)
+
+Performs `C = alpha * op(A) * op(B) + beta * C`, where `op` can be nothing (`transa = N`),
+tranpose (`transa = T`) or conjugate transpose (`transa = C`).
+"""
+function mm! end
+
 ## API functions
 
 function sparsetodense(A::Union{CuSparseMatrixCSC{T},CuSparseMatrixCSR{T},CuSparseMatrixCOO{T}}, index::SparseChar, algo::cusparseSparseToDenseAlg_t=CUSPARSE_SPARSETODENSE_ALG_DEFAULT) where {T}
@@ -191,9 +209,11 @@ function mv!(transa::SparseChar, alpha::Number, A::Union{CuSparseMatrixCSC{TA},C
     return Y
 end
 
-function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::Union{CuSparseMatrixCSC{T},CuSparseMatrixCSR{T},CuSparseMatrixCOO{T}},
+function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparseMatrix{T},
              B::DenseCuMatrix{T}, beta::Number, C::DenseCuMatrix{T}, index::SparseChar, algo::cusparseSpMMAlg_t=CUSPARSE_SPMM_ALG_DEFAULT) where {T}
 
+    (A isa CuSparseMatrixBSR) && (CUSPARSE.version() < v"12.5.1") && throw(ErrorException("This operation is not supported by the current CUDA version."))
+
     # Support transa = 'C' and `transb = 'C' for real matrices
     transa = T <: Real && transa == 'C' ? 'T' : transa
     transb = T <: Real && transb == 'C' ? 'T' : transb
@@ -235,10 +255,10 @@ function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::Union{CuS
     #     cusparseCsrSetStridedBatch(obj, batchsize, 0, nnz(A))
     # end
 
-    # Set default buffer for small matrices (10000 chosen arbitrarly)
+    # Set default buffer for small matrices (1000 chosen arbitrarly)
     # Otherwise tries to allocate 120TB of memory (see #2296)
     function bufferSize()
-        out = Ref{Csize_t}(10000) 
+        out = Ref{Csize_t}(1000)
         cusparseSpMM_bufferSize(
             handle(), transa, transb, Ref{T}(alpha), descA, descB, Ref{T}(beta),
             descC, T, algo, out)
@@ -274,7 +294,6 @@ function bmm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparse
         throw(ErrorException("Batched dense-matrix times batched sparse-matrix (bmm!) requires a CUSPARSE version ≥ 11.7.2 (yours: $(CUSPARSE.version()))."))
     end
 
-
     # Support transa = 'C' and `transb = 'C' for real matrices
     transa = T <: Real && transa == 'C' ? 'T' : transa
     transb = T <: Real && transb == 'C' ? 'T' : transb
@@ -313,10 +332,10 @@ function bmm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparse
     strideC = stride(C, 3)
     cusparseDnMatSetStridedBatch(descC, b, strideC)
 
-    # Set default buffer for small matrices (10000 chosen arbitrarly)
+    # Set default buffer for small matrices (1000 chosen arbitrarly)
     # Otherwise tries to allocate 120TB of memory (see #2296)
     function bufferSize()
-        out = Ref{Csize_t}(10000)
+        out = Ref{Csize_t}(1000)
         cusparseSpMM_bufferSize(
             handle(), transa, transb, Ref{T}(alpha), descA, descB, Ref{T}(beta),
             descC, T, algo, out)
@@ -337,10 +356,11 @@ function bmm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparse
 end
 
 function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::DenseCuMatrix{T},
-             B::Union{CuSparseMatrixCSC{T},CuSparseMatrixCSR{T},CuSparseMatrixCOO{T}},
-             beta::Number, C::DenseCuMatrix{T}, index::SparseChar, algo::cusparseSpMMAlg_t=CUSPARSE_SPMM_ALG_DEFAULT) where {T}
+             B::Union{CuSparseMatrixCSC{T},CuSparseMatrixCSR{T},CuSparseMatrixCOO{T}}, beta::Number,
+             C::DenseCuMatrix{T}, index::SparseChar, algo::cusparseSpMMAlg_t=CUSPARSE_SPMM_ALG_DEFAULT) where {T}
 
     CUSPARSE.version() < v"11.7.4" && throw(ErrorException("This operation is not supported by the current CUDA version."))
+
     # Support transa = 'C' and `transb = 'C' for real matrices
     transa = T <: Real && transa == 'C' ? 'T' : transa
     transb = T <: Real && transb == 'C' ? 'T' : transb
@@ -373,10 +393,10 @@ function mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::DenseCuMa
     descB = CuSparseMatrixDescriptor(B, index, transposed=true)
     descC = CuDenseMatrixDescriptor(C, transposed=true)
 
-    # Set default buffer for small matrices (10000 chosen arbitrarly)
+    # Set default buffer for small matrices (1000 chosen arbitrarly)
     # Otherwise tries to allocate 120TB of memory (see #2296)
     function bufferSize()
-        out = Ref{Csize_t}(10000)
+        out = Ref{Csize_t}(1000)
         cusparseSpMM_bufferSize(
             handle(), transb, transa, Ref{T}(alpha), descB, descA, Ref{T}(beta),
             descC, T, algo, out)
@@ -736,9 +756,10 @@ function sm!(transa::SparseChar, transb::SparseChar, uplo::SparseChar, diag::Spa
 end
 
 function sddmm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::DenseCuMatrix{T}, B::DenseCuMatrix{T},
-                beta::Number, C::CuSparseMatrixCSR{T}, index::SparseChar, algo::cusparseSDDMMAlg_t=CUSPARSE_SDDMM_ALG_DEFAULT) where {T}
+                beta::Number, C::Union{CuSparseMatrixCSR{T},CuSparseMatrixBSR{T}}, index::SparseChar, algo::cusparseSDDMMAlg_t=CUSPARSE_SDDMM_ALG_DEFAULT) where {T}
 
     CUSPARSE.version() < v"11.4.1" && throw(ErrorException("This operation is not supported by the current CUDA version."))
+    (C isa CuSparseMatrixBSR) && (CUSPARSE.version() < v"12.1.0") && throw(ErrorException("This operation is not supported by the current CUDA version."))
 
     # Support transa = 'C' and `transb = 'C' for real matrices
     transa = T <: Real && transa == 'C' ? 'T' : transa
diff --git a/lib/cusparse/level2.jl b/lib/cusparse/level2.jl
index 4e5b81de1c..f784ae980d 100644
--- a/lib/cusparse/level2.jl
+++ b/lib/cusparse/level2.jl
@@ -1,16 +1,7 @@
 # sparse linear algebra functions that perform operations between sparse matrices and dense
 # vectors
 
-export mv!, sv2!, sv2, gemvi!
-
-"""
-    mv!(transa::SparseChar, alpha::Number, A::CuSparseMatrix, X::CuVector, beta::Number, Y::CuVector, index::SparseChar)
-
-Performs `Y = alpha * op(A) * X + beta * Y`, where `op` can be nothing (`transa = N`),
-tranpose (`transa = T`) or conjugate transpose (`transa = C`).
-`X` and `Y` are dense vectors.
-"""
-mv!(transa::SparseChar, alpha::Number, A::CuSparseMatrix, X::CuVector, beta::Number, Y::CuVector, index::SparseChar)
+export sv2!, sv2, gemvi!
 
 for (fname,elty) in ((:cusparseSbsrmv, :Float32),
                      (:cusparseDbsrmv, :Float64),
diff --git a/lib/cusparse/level3.jl b/lib/cusparse/level3.jl
index 8f20fea5fc..cd166f7767 100644
--- a/lib/cusparse/level3.jl
+++ b/lib/cusparse/level3.jl
@@ -1,60 +1,7 @@
 # sparse linear algebra functions that perform operations between sparse and (usually tall)
 # dense matrices
 
-export mm!, sm2!, sm2
-
-"""
-    mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparseMatrix, B::CuMatrix, beta::Number, C::CuMatrix, index::SparseChar)
-
-Performs `C = alpha * op(A) * op(B) + beta * C`, where `op` can be nothing (`transa = N`),
-tranpose (`transa = T`) or conjugate transpose (`transa = C`).
-`B` and `C` are dense matrices.
-"""
-mm!(transa::SparseChar, transb::SparseChar, alpha::Number, A::CuSparseMatrix, B::CuMatrix, beta::Number, C::CuMatrix, index::SparseChar)
-
-# bsrmm
-for (fname,elty) in ((:cusparseSbsrmm, :Float32),
-                     (:cusparseDbsrmm, :Float64),
-                     (:cusparseCbsrmm, :ComplexF32),
-                     (:cusparseZbsrmm, :ComplexF64))
-    @eval begin
-        function mm!(transa::SparseChar,
-                     transb::SparseChar,
-                     alpha::Number,
-                     A::CuSparseMatrixBSR{$elty},
-                     B::StridedCuMatrix{$elty},
-                     beta::Number,
-                     C::StridedCuMatrix{$elty},
-                     index::SparseChar)
-
-            # Support transa = 'C' and `transb = 'C' for real matrices
-            transa = $elty <: Real && transa == 'C' ? 'T' : transa
-            transb = $elty <: Real && transb == 'C' ? 'T' : transb
-
-            desc = CuMatrixDescriptor('G', 'L', 'N', index)
-            m,k = size(A)
-            mb = cld(m, A.blockDim)
-            kb = cld(k, A.blockDim)
-            n = size(C)[2]
-            if transa == 'N' && transb == 'N'
-                chkmmdims(B,C,k,n,m,n)
-            elseif transa == 'N' && transb != 'N'
-                chkmmdims(B,C,n,k,m,n)
-            elseif transa != 'N' && transb == 'N'
-                chkmmdims(B,C,m,n,k,n)
-            elseif transa != 'N' && transb != 'N'
-                chkmmdims(B,C,n,m,k,n)
-            end
-            ldb = max(1,stride(B,2))
-            ldc = max(1,stride(C,2))
-            $fname(handle(), A.dir,
-                   transa, transb, mb, n, kb, A.nnzb,
-                   alpha, desc, nonzeros(A),A.rowPtr, A.colVal,
-                   A.blockDim, B, ldb, beta, C, ldc)
-            C
-        end
-    end
-end
+export sm2!, sm2
 
 """
     sm2!(transa::SparseChar, transxy::SparseChar, uplo::SparseChar, diag::SparseChar, alpha::BlasFloat, A::CuSparseMatrix, X::CuMatrix, index::SparseChar)
diff --git a/lib/cusparse/libcusparse.jl b/lib/cusparse/libcusparse.jl
index f459bac1cd..8dbbcd7db9 100644
--- a/lib/cusparse/libcusparse.jl
+++ b/lib/cusparse/libcusparse.jl
@@ -5415,9 +5415,9 @@ end
     @gcsafe_ccall libcusparse.cusparseCreateBsr(spMatDescr::Ptr{cusparseSpMatDescr_t},
                                                 brows::Int64, bcols::Int64, bnnz::Int64,
                                                 rowBlockSize::Int64, colBlockSize::Int64,
-                                                bsrRowOffsets::Ptr{Cvoid},
-                                                bsrColInd::Ptr{Cvoid},
-                                                bsrValues::Ptr{Cvoid},
+                                                bsrRowOffsets::CuPtr{Cvoid},
+                                                bsrColInd::CuPtr{Cvoid},
+                                                bsrValues::CuPtr{Cvoid},
                                                 bsrRowOffsetsType::cusparseIndexType_t,
                                                 bsrColIndType::cusparseIndexType_t,
                                                 idxBase::cusparseIndexBase_t,
@@ -5434,9 +5434,9 @@ end
                                                      brows::Int64, bcols::Int64,
                                                      bnnz::Int64, rowBlockDim::Int64,
                                                      colBlockDim::Int64,
-                                                     bsrRowOffsets::Ptr{Cvoid},
-                                                     bsrColInd::Ptr{Cvoid},
-                                                     bsrValues::Ptr{Cvoid},
+                                                     bsrRowOffsets::CuPtr{Cvoid},
+                                                     bsrColInd::CuPtr{Cvoid},
+                                                     bsrValues::CuPtr{Cvoid},
                                                      bsrRowOffsetsType::cusparseIndexType_t,
                                                      bsrColIndType::cusparseIndexType_t,
                                                      idxBase::cusparseIndexBase_t,
diff --git a/res/wrap/cusparse.toml b/res/wrap/cusparse.toml
index 3237bb3e61..524586df8b 100644
--- a/res/wrap/cusparse.toml
+++ b/res/wrap/cusparse.toml
@@ -990,3 +990,13 @@ needs_context = false
 
 [api.cusparseSpMMOp.argtypes]
 2 = "CuPtr{Cvoid}"
+
+[api.cusparseCreateBsr.argtypes]
+7 = "CuPtr{Cvoid}"
+8 = "CuPtr{Cvoid}"
+9 = "CuPtr{Cvoid}"
+
+[api.cusparseCreateConstBsr.argtypes]
+7 = "CuPtr{Cvoid}"
+8 = "CuPtr{Cvoid}"
+9 = "CuPtr{Cvoid}"
diff --git a/test/libraries/cusparse.jl b/test/libraries/cusparse.jl
index 234fb652a2..842ab3fa14 100644
--- a/test/libraries/cusparse.jl
+++ b/test/libraries/cusparse.jl
@@ -908,8 +908,7 @@ end
         alpha = rand(elty)
         beta = rand(elty)
         @testset "$(typeof(d_A))" for d_A in [CuSparseMatrixCSR(A),
-                                              CuSparseMatrixCSC(A),
-                                              CuSparseMatrixBSR(A, blockdim)]
+                                              CuSparseMatrixCSC(A)]
             d_B = CuArray(B)
             d_C = CuArray(C)
             @test_throws DimensionMismatch CUSPARSE.mm!('N','T',alpha,d_A,d_B,beta,d_C,'O')
diff --git a/test/libraries/cusparse/generic.jl b/test/libraries/cusparse/generic.jl
index 1c6c74c05e..8503cc14ee 100644
--- a/test/libraries/cusparse/generic.jl
+++ b/test/libraries/cusparse/generic.jl
@@ -53,10 +53,15 @@ if CUSPARSE.version() >= v"11.4.1" # lower CUDA version doesn't support these al
         push!(SPMV_ALGOS[CuSparseMatrixCOO], CUSPARSE.CUSPARSE_SPMV_COO_ALG2)
     end
 
+    if CUSPARSE.version() >= v"12.5.1"
+        SPMM_ALGOS[CuSparseMatrixBSR] = [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT,
+                                         CUSPARSE.CUSPARSE_SPMM_BSR_ALG1]
+    end
+
     for SparseMatrixType in keys(SPMV_ALGOS)
         @testset "$SparseMatrixType -- mv! algo=$algo" for algo in SPMV_ALGOS[SparseMatrixType]
             @testset "mv! $T" for T in [Float32, Float64, ComplexF32, ComplexF64]
-                for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)]
+                @testset "transa = $transa" for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)]
                     CUSPARSE.version() < v"12.0" && SparseMatrixType == CuSparseMatrixCSC && T <: Complex && transa == 'C' && continue
                     A = sprand(T, 20, 10, 0.1)
                     B = transa == 'N' ? rand(T, 10) : rand(T, 20)
@@ -77,14 +82,15 @@ if CUSPARSE.version() >= v"11.4.1" # lower CUDA version doesn't support these al
     for SparseMatrixType in keys(SPMM_ALGOS)
         @testset "$SparseMatrixType * CuMatrix -- mm! algo=$algo" for algo in SPMM_ALGOS[SparseMatrixType]
             @testset "mm! $T" for T in [Float32, Float64, ComplexF32, ComplexF64]
-                for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)]
-                    for (transb, opb) in [('N', identity), ('T', transpose), ('C', adjoint)]
+                @testset "transa = $transa" for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)]
+                    @testset "transb = $transb" for (transb, opb) in [('N', identity), ('T', transpose), ('C', adjoint)]
                         CUSPARSE.version() < v"12.0" && SparseMatrixType == CuSparseMatrixCSC && T <: Complex && transa == 'C' && continue
                         algo == CUSPARSE.CUSPARSE_SPMM_CSR_ALG3 && (transa != 'N' || transb != 'N') && continue
+                        (SparseMatrixType == CuSparseMatrixBSR) && (transa != 'N') && continue
                         A = sprand(T, 10, 10, 0.1)
                         B = transb == 'N' ? rand(T, 10, 2) : rand(T, 2, 10)
                         C = rand(T, 10, 2)
-                        dA = SparseMatrixType(A)
+                        dA = SparseMatrixType == CuSparseMatrixBSR ? SparseMatrixType(A,1) : SparseMatrixType(A)
                         dB = CuArray(B)
                         dC = CuArray(C)
 
@@ -102,7 +108,8 @@ end
 if CUSPARSE.version() >= v"11.7.4"
 
     # Algorithms for CSC and CSR matrices are swapped
-    SPMM_ALGOS = Dict(CuSparseMatrixCSR => [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT],
+    SPMM_ALGOS = Dict(CuSparseMatrixBSR => [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT],
+                      CuSparseMatrixCSR => [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT],
                       CuSparseMatrixCSC => [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT,
                                             CUSPARSE.CUSPARSE_SPMM_CSR_ALG1,
                                             CUSPARSE.CUSPARSE_SPMM_CSR_ALG2,
@@ -114,10 +121,11 @@ if CUSPARSE.version() >= v"11.7.4"
                                             CUSPARSE.CUSPARSE_SPMM_COO_ALG4])
 
     for SparseMatrixType in keys(SPMM_ALGOS)
+        (SparseMatrixType == CuSparseMatrixBSR) && continue
         @testset "CuMatrix * $SparseMatrixType -- mm! algo=$algo" for algo in SPMM_ALGOS[SparseMatrixType]
             @testset "$T" for T in [Float32, Float64, ComplexF32, ComplexF64]
-                for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)]
-                    for (transb, opb) in [('N', identity), ('T', transpose), ('C', adjoint)]
+                @testset "transa = $transa" for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)]
+                    @testset "transb = $transb" for (transb, opb) in [('N', identity), ('T', transpose), ('C', adjoint)]
                         CUSPARSE.version() < v"12.0" && SparseMatrixType == CuSparseMatrixCSR && T <: Complex && transb == 'C' && continue
                         algo == CUSPARSE.CUSPARSE_SPMM_CSR_ALG3 && (transa != 'N' || transb != 'N') && continue
                         A = rand(T, 10, 10)
@@ -170,10 +178,10 @@ if CUSPARSE.version() >= v"12.0"
     for SparseMatrixType in [CuSparseMatrixCSC, CuSparseMatrixCSR, CuSparseMatrixCOO]
         @testset "$SparseMatrixType -- sv! algo=$algo" for algo in SPSV_ALGOS[SparseMatrixType]
             @testset "sv! $T" for T in [Float64, ComplexF64]
-                for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)]
+                @testset "transa = $transa" for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)]
                     SparseMatrixType == CuSparseMatrixCSC && T <: Complex && transa == 'C' && continue
-                    for uplo in ('L', 'U')
-                        for diag in ('U', 'N')
+                    @testset "uplo = $uplo" for uplo in ('L', 'U')
+                        @testset "diag = $diag" for diag in ('U', 'N')
                             A = rand(T, 10, 10)
                             A = uplo == 'L' ? tril(A) : triu(A)
                             A = diag == 'U' ? A - Diagonal(A) + I : A
@@ -194,11 +202,11 @@ if CUSPARSE.version() >= v"12.0"
 
         @testset "$SparseMatrixType -- sm! algo=$algo" for algo in SPSM_ALGOS[SparseMatrixType]
             @testset "sm! $T" for T in [Float64, ComplexF64]
-                for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)]
+                @testset "transa = $transa" for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)]
                     SparseMatrixType == CuSparseMatrixCSC && T <: Complex && transa == 'C' && continue
-                    for (transb, opb) in [('N', identity), ('T', transpose)]
-                        for uplo in ('L', 'U')
-                            for diag in ('U', 'N')
+                    @testset "transb = $transb" for (transb, opb) in [('N', identity), ('T', transpose)]
+                        @testset "uplo = $uplo" for uplo in ('L', 'U')
+                            @testset "diag = $diag" for diag in ('U', 'N')
                                 A = rand(T, 10, 10)
                                 A = uplo == 'L' ? tril(A) : triu(A)
                                 A = diag == 'U' ? A - Diagonal(A) + I : A
@@ -246,7 +254,7 @@ for SparseMatrixType in [CuSparseMatrixCSC, CuSparseMatrixCSR, CuSparseMatrixCOO
 end
 
 @testset "vv! $T" for T in [Float32, Float64, ComplexF32, ComplexF64]
-    for (transx, opx) in [('N', identity), ('C', conj)]
+    @testset "transx = $transx" for (transx, opx) in [('N', identity), ('C', conj)]
         T <: Real && transx == 'C' && continue
         X = sprand(T, 20, 0.5)
         dX = CuSparseVector{T}(X)
@@ -328,8 +336,8 @@ end
 for SparseMatrixType in keys(SPGEMM_ALGOS)
     @testset "$SparseMatrixType -- gemm -- gemm! algo=$algo" for algo in SPGEMM_ALGOS[SparseMatrixType]
         @testset "gemm -- gemm! $T" for T in [Float32, Float64, ComplexF32, ComplexF64]
-            for (transa, opa) in [('N', identity)]
-                for (transb, opb) in [('N', identity)]
+            @testset "transa = $transa" for (transa, opa) in [('N', identity)]
+                @testset "transb = $transb" for (transb, opb) in [('N', identity)]
                     A = sprand(T,25,10,0.2)
                     B = sprand(T,10,35,0.3)
                     dA = SparseMatrixType(A)
@@ -360,7 +368,7 @@ for SparseMatrixType in keys(SPGEMM_ALGOS)
     end
 
     @testset "gemv $T" for T in [Float32, Float64, ComplexF32, ComplexF64]
-        for (transa, opa) in [('N', identity)]
+        @testset "transa = $transa" for (transa, opa) in [('N', identity)]
             A = sprand(T,25,10,0.2)
             b = sprand(T,10,0.3)
             dA = SparseMatrixType(A)
@@ -380,8 +388,8 @@ if CUSPARSE.version() >= v"11.4.1"
     for SparseMatrixType in keys(SDDMM_ALGOS)
         @testset "$SparseMatrixType -- sddmm! algo=$algo" for algo in SDDMM_ALGOS[SparseMatrixType]
             @testset "sddmm! $T" for T in [Float32, Float64, ComplexF32, ComplexF64]
-                for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)]
-                    for (transb, opb) in [('N', identity), ('T', transpose), ('C', adjoint)]
+                @testset "transa = $transa" for (transa, opa) in [('N', identity), ('T', transpose), ('C', adjoint)]
+                    @testset "transb = $transb" for (transb, opb) in [('N', identity), ('T', transpose), ('C', adjoint)]
                         T <: Complex && (transa == 'C' || transb == 'C') && continue
                         mA = transa == 'N' ? 25 : 10
                         nA = transa == 'N' ? 10 : 25
diff --git a/test/libraries/cusparse/interfaces.jl b/test/libraries/cusparse/interfaces.jl
index e22bb25e63..e984cac871 100644
--- a/test/libraries/cusparse/interfaces.jl
+++ b/test/libraries/cusparse/interfaces.jl
@@ -128,10 +128,11 @@ using LinearAlgebra, SparseArrays
         end
     end
 
-    for SparseMatrixType in (CuSparseMatrixCSC, CuSparseMatrixCSR, CuSparseMatrixCOO)
+    for SparseMatrixType in (CuSparseMatrixCSC, CuSparseMatrixCSR, CuSparseMatrixCOO, CuSparseMatrixBSR)
 
         if CUSPARSE.version() >= v"11.7.4"
             @testset "CuMatrix * $SparseMatrixType -- A * B $elty" for elty in [Float32, Float64, ComplexF32, ComplexF64]
+                (SparseMatrixType == CuSparseMatrixBSR) && continue
                 for opa in (identity, transpose, adjoint)
                     for opb in (identity, transpose, adjoint)
                         CUSPARSE.version() < v"12.0" && SparseMatrixType == CuSparseMatrixCSR && elty <: Complex && opb == adjoint && continue
@@ -155,6 +156,7 @@ using LinearAlgebra, SparseArrays
             end
 
             @testset "CuMatrix * $SparseMatrixType -- mul!(C, A, B) $elty" for elty in [Float32, Float64, ComplexF32, ComplexF64]
+                (SparseMatrixType == CuSparseMatrixBSR) && continue
                 for opa in (identity, transpose, adjoint)
                     for opb in (identity, transpose, adjoint)
                         CUSPARSE.version() < v"12.0" && SparseMatrixType == CuSparseMatrixCSR && elty <: Complex && opb == adjoint && continue
@@ -183,6 +185,7 @@ using LinearAlgebra, SparseArrays
         end
 
         @testset "$SparseMatrixType * CuVector -- A * b $elty" for elty in [Float32, Float64, ComplexF32, ComplexF64]
+            (SparseMatrixType == CuSparseMatrixBSR) && continue
             for opa in (identity, transpose, adjoint)
                 CUSPARSE.version() < v"12.0" && SparseMatrixType == CuSparseMatrixCSC && elty <: Complex && opa == adjoint && continue
                 n = 10
@@ -200,6 +203,7 @@ using LinearAlgebra, SparseArrays
         end
 
         @testset "$SparseMatrixType * CuVector -- mul!(c, A, b) $elty" for elty in [Float32, Float64, ComplexF32, ComplexF64]
+            (SparseMatrixType == CuSparseMatrixBSR) && continue
             for opa in (identity, transpose, adjoint)
                 CUSPARSE.version() < v"12.0" && SparseMatrixType == CuSparseMatrixCSC && elty <: Complex && opa == adjoint && continue
                 n = 10
@@ -221,8 +225,10 @@ using LinearAlgebra, SparseArrays
         end
 
         @testset "$SparseMatrixType * CuMatrix -- A * B $elty" for elty in [Float32, Float64, ComplexF32, ComplexF64]
-            for opa in (identity, transpose, adjoint)
-                for opb in (identity, transpose, adjoint)
+            (CUSPARSE.version() < v"12.5.1") && (SparseMatrixType == CuSparseMatrixBSR) && continue
+            @testset "opa = $(string(opa))" for opa in (identity, transpose, adjoint)
+                (opa != identity) && (SparseMatrixType == CuSparseMatrixBSR) && continue
+                @testset "opb = $(string(opb))" for opb in (identity, transpose, adjoint)
                     CUSPARSE.version() < v"12.0" && SparseMatrixType == CuSparseMatrixCSC && elty <: Complex && opa == adjoint && continue
                     n = 10
                     k = 15
@@ -230,7 +236,7 @@ using LinearAlgebra, SparseArrays
                     A = opa == identity ? sprand(elty, m, k, 0.2) : sprand(elty, k, m, 0.2)
                     B = opb == identity ? rand(elty, k, n) : rand(elty, n, k)
 
-                    dA = SparseMatrixType(A)
+                    dA = (SparseMatrixType == CuSparseMatrixBSR) ? SparseMatrixType(A,1) : SparseMatrixType(A)
                     dB = CuArray(B)
 
                     C = opa(A) * opb(B)
@@ -241,8 +247,10 @@ using LinearAlgebra, SparseArrays
         end
 
         @testset "$SparseMatrixType * CuMatrix -- mul!(C, A, B) $elty" for elty in [Float32, Float64, ComplexF32, ComplexF64]
-            for opa in (identity, transpose, adjoint)
-                for opb in (identity, transpose, adjoint)
+            (CUSPARSE.version() < v"12.5.1") && (SparseMatrixType == CuSparseMatrixBSR) && continue
+            @testset "opa = $(string(opa))" for opa in (identity, transpose, adjoint)
+                (opa != identity) && (SparseMatrixType == CuSparseMatrixBSR) && continue
+                @testset "opb = $(string(opb))" for opb in (identity, transpose, adjoint)
                     CUSPARSE.version() < v"12.0" && SparseMatrixType == CuSparseMatrixCSC && elty <: Complex && opa == adjoint && continue
                     n = 10
                     k = 15
@@ -253,7 +261,7 @@ using LinearAlgebra, SparseArrays
                     B = opb == identity ? rand(elty, k, n) : rand(elty, n, k)
                     C = rand(elty, m, n)
 
-                    dA = SparseMatrixType(A)
+                    dA = (SparseMatrixType == CuSparseMatrixBSR) ? SparseMatrixType(A,1) : SparseMatrixType(A)
                     dB = CuArray(B)
                     dC = CuArray(C)
 
@@ -265,6 +273,7 @@ using LinearAlgebra, SparseArrays
         end
 
         @testset "$SparseMatrixType * CuSparseVector -- A * b $elty" for elty in [Float32, Float64, ComplexF32, ComplexF64]
+            (SparseMatrixType == CuSparseMatrixBSR) && continue
             for opa in (identity, transpose, adjoint)
                 CUSPARSE.version() < v"12.0" && SparseMatrixType == CuSparseMatrixCSC && elty <: Complex && opa == adjoint && continue
                 n = 10
@@ -282,6 +291,7 @@ using LinearAlgebra, SparseArrays
         end
 
         @testset "$SparseMatrixType * CuSparseVector -- mul!(c, A, b) $elty" for elty in [Float32, Float64, ComplexF32, ComplexF64]
+            (SparseMatrixType == CuSparseMatrixBSR) && continue
             for opa in (identity, transpose, adjoint)
                 CUSPARSE.version() < v"12.0" && SparseMatrixType == CuSparseMatrixCSC && elty <: Complex && opa == adjoint && continue
                 n = 10