- added Dmul() function to NL blas

- finished NLCUDA backend for AMGCL (to be tested now)
BrunoLevy · Nov 17, 2024 · fca4d8c · fca4d8c
1 parent c75963c
commit fca4d8c
Show file tree

Hide file tree

Showing 4 changed files with 94 additions and 19 deletions.
diff --git a/src/lib/geogram/NL/nl_amgcl.cpp b/src/lib/geogram/NL/nl_amgcl.cpp
@@ -245,11 +245,6 @@ namespace nlcuda_adapters {
 	NLMatrix impl_;
     };
 
-    /**
-     * \brief a diagonal matrix, stored in device memory
-     */
-    typedef matrix matrix_diagonal;
-
     /**
      * \brief a vector, stored in device memory
      */
@@ -273,9 +268,13 @@ namespace nlcuda_adapters {
 	~vector() {
 	    if(data_ != nullptr) {
 		NL_DELETE_VECTOR(nlCUDABlas(), NL_DEVICE_MEMORY, n_, data_);
-		n_ = 0;
-		data_ = nullptr;
 	    }
+	    if(temp_ != nullptr) {
+		NL_DELETE_VECTOR(nlCUDABlas(), NL_DEVICE_MEMORY, n_, temp_);
+	    }
+	    n_ = 0;
+	    data_ = nullptr;
+	    temp_ = nullptr;
 	}
 
 	vector(const vector& rhs) = delete;
@@ -338,8 +337,11 @@ namespace nlcuda_adapters {
 
 	index_type n_;
 	double* data_;
+	mutable double* temp_; // temporary for vmul()
     };
 
+    typedef vector matrix_diagonal;
+
     /**
      * Wrapper around solver::skyline_lu for use with the NLCUDA backend.
      * Inspired from AMGCL cuda wrapper
@@ -556,32 +558,60 @@ namespace amgcl { namespace backend {
 
     template <> struct vmul_impl<
 	double,
-	nlcuda_adapters::vector,
+	nlcuda_adapters::matrix_diagonal,
 	nlcuda_adapters::vector,
 	double,
 	nlcuda_adapters::vector
 	> {
 	static void apply(
 	    double a,
-	    const nlcuda_adapters::vector &x,
+	    const nlcuda_adapters::matrix_diagonal &M,
 	    const nlcuda_adapters::vector &y,
 	    double b,
 	    nlcuda_adapters::vector &z
 	) {
-	    // TODO
-	    nl_assert(false);
+	    nl_assert(M.n_ == y.n_);
+	    nl_assert(M.n_ == z.n_);
+	    int N = M.n_;
+	    NLBlas_t blas = nlCUDABlas();
+	    // z <- a * M * y + b * z
+
+	    if(b != 0.0) {
+		// tmp <- z
+		if(M.temp_ == nullptr) {
+		    M.temp_ =  NL_NEW_VECTOR(nlCUDABlas(), NL_DEVICE_MEMORY, N);
+		    blas->Dcopy(blas,N,z.data_,1,M.temp_,1);
+		}
+	    }
+
+	    // z <- a * M * y
+	    blas->Dmul(blas,N,M.data_,y.data_,z.data_);
+	    if(a != 1.0) {
+		blas->Dscal(blas,N,a,y.data_,1);
+	    }
+
+	    if(b != 0.0) {
+		// z <- b * tmp + z
+		blas->Daxpy(blas,N,b,M.temp_,1,z.data_,1);
+	    }
 	}
     };
 
-    template <> struct copy_impl<
-	nlcuda_adapters::vector,
-	nlcuda_adapters::vector
-	> {
+    template <class V> struct copy_impl<V,nlcuda_adapters::vector> {
 	static void apply(
-	    const nlcuda_adapters::vector &x,
+	    const V &x,
 	    nlcuda_adapters::vector &y
 	) {
-	    y.copy_from(x);
+	    y.copy_from_host(x.data());
+	}
+    };
+
+    template <class V> struct copy_impl<nlcuda_adapters::vector,V> {
+	static void apply(
+	    const nlcuda_adapters::vector &x,
+	    V &y
+	) {
+	    x.copy_to_host(y.data());
 	}
     };
 

diff --git a/src/lib/geogram/NL/nl_blas.c b/src/lib/geogram/NL/nl_blas.c
@@ -1474,12 +1474,23 @@ static double host_blas_dnrm2(
 }
 
 static void host_blas_daxpy(
-    NLBlas_t blas, int n, double a, const double *x, int incx, double *y, int incy
+    NLBlas_t blas, int n,
+    double a, const double *x, int incx, double *y, int incy
 ) {
     blas->flops += (NLulong)(2*n);
     NL_FORTRAN_WRAP(daxpy)(&n,&a,(double*)x,&incx,y,&incy);
 }
 
+static void host_blas_dmul(
+    NLBlas_t blas, int n,
+    const double *x, const double *y, double* z
+) {
+    blas->flops += (NLulong)(n);
+    for(int i=0; i<n; ++i) {
+	z[i] = x[i]*y[i];
+    }
+}
+
 static void host_blas_dscal(
     NLBlas_t blas, int n, double a, double *x, int incx
 ) {
@@ -1530,6 +1541,7 @@ NLBlas_t nlHostBlas(void) {
         blas.Ddot = host_blas_ddot;
         blas.Dnrm2 = host_blas_dnrm2;
         blas.Daxpy = host_blas_daxpy;
+        blas.Dmul = host_blas_dmul;
         blas.Dscal = host_blas_dscal;
         blas.Dgemv = host_blas_dgemv;
         blas.Dtpsv = host_blas_dtpsv;

diff --git a/src/lib/geogram/NL/nl_blas.h b/src/lib/geogram/NL/nl_blas.h
@@ -233,6 +233,19 @@ extern "C" {
     );
 
 
+/**
+ * \brief Component-wise multiplication
+ * \details In formula: \f$ z \leftarrow x ** y \f$
+ * \param[in] blas a handle to the BLAS abstraction layer
+ * \param[in] n dimension of the vectors
+ * \param[in] x , y source vectors
+ * \param[in] z destination vector
+ */
+    typedef void (*FUNPTR_dmul)(
+        NLBlas_t blas, int n,
+	const double* x, const double* y, double* z
+    );
+
 /**
  * \brief Computes a matrix-vector product
  * \details performs one of the matrix-vector operations
@@ -334,6 +347,7 @@ extern "C" {
         FUNPTR_ddot  Ddot;
         FUNPTR_dnrm2 Dnrm2;
         FUNPTR_daxpy Daxpy;
+        FUNPTR_dmul  Dmul;
         FUNPTR_dgemv Dgemv;
         FUNPTR_dtpsv Dtpsv;
 

diff --git a/src/lib/geogram/NL/nl_cuda.c b/src/lib/geogram/NL/nl_cuda.c
@@ -1515,7 +1515,7 @@ static void nlDiagonalMatrixCUDAMult(
 ) {
     int N = (int)Mcuda->n;
     /*
-     * vector x vector slice-wise product implemented
+     * vector x vector component-wise product implemented
      * using diagonal matrix x matrix function.
      */
     nlCUDACheck(CUDA()->cublasDdgmm(
@@ -1667,6 +1667,24 @@ static void cuda_blas_daxpy(
     CUDA()->cublasDaxpy(CUDA()->HNDL_cublas,n,&a,x,incx,y,incy);
 }
 
+static void cuda_blas_dmul(
+    NLBlas_t blas, int n,
+    const double *x, const double *y, double* z
+) {
+    blas->flops += (NLulong)(n);
+    /*
+     * vector x vector component-wise product implemented
+     * using diagonal matrix x matrix function.
+     */
+    nlCUDACheck(CUDA()->cublasDdgmm(
+                    CUDA()->HNDL_cublas, CUBLAS_SIDE_LEFT,
+                    n, 1,
+                    x, n,
+                    y, 1,
+                    z, n
+                ));
+}
+
 static void cuda_blas_dscal(
     NLBlas_t blas, int n, double a, double *x, int incx
 ) {
@@ -1719,6 +1737,7 @@ NLBlas_t nlCUDABlas(void) {
         blas.Ddot = cuda_blas_ddot;
         blas.Dnrm2 = cuda_blas_dnrm2;
         blas.Daxpy = cuda_blas_daxpy;
+	blas.Dmul = cuda_blas_dmul;
         blas.Dscal = cuda_blas_dscal;
         blas.Dgemv = cuda_blas_dgemv;
         blas.Dtpsv = cuda_blas_dtpsv;