nl_cuda: BLAS allocations on NL_HOST_MEMORY now use cudaMallocHost()
  (creates pinned memory)
AMGCL backend: the backend's vector type can now be allocated either in device
   memory or in the host's pinned memory. Used by the direct solver, which
   copies data back and forth between the GPU and the CPU to solve on the CPU.
BrunoLevy committed Nov 20, 2024
1 parent b7f132e commit 68cc47a
Showing 2 changed files with 75 additions and 26 deletions.
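Background for the change: cudaMallocHost() returns page-locked (pinned) host memory that the GPU can access by DMA, so host <-> device copies out of it are faster than copies from ordinary malloc()'ed, pageable memory. Below is a minimal standalone C++ sketch (not part of this commit) that exercises the same CUDA runtime calls directly, without OpenNL's dynamically loaded wrappers:

#include <cuda_runtime.h>
#include <cstdio>

int main() {
    const size_t n = 1u << 20;
    const size_t bytes = n * sizeof(double);

    double* h = nullptr;  /* pinned (page-locked) host buffer */
    double* d = nullptr;  /* device buffer */

    /* cudaMallocHost() returns page-locked memory: the GPU can DMA directly
       from/to it, so cudaMemcpy() between host and device is faster than
       with pageable memory obtained from malloc(). */
    if (cudaMallocHost((void**)&h, bytes) != cudaSuccess) { return 1; }
    if (cudaMalloc((void**)&d, bytes) != cudaSuccess) { return 1; }

    for (size_t i = 0; i < n; ++i) { h[i] = double(i); }

    cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);  /* pinned -> device */
    cudaMemcpy(h, d, bytes, cudaMemcpyDeviceToHost);  /* device -> pinned */

    std::printf("h[42] = %f\n", h[42]);

    cudaFree(d);
    cudaFreeHost(h);  /* pinned memory must be released with cudaFreeHost() */
    return 0;
}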
80 changes: 60 additions & 20 deletions src/lib/geogram/NL/nl_amgcl.cpp
@@ -123,19 +123,24 @@ namespace amgcl2nl {
* \brief vector constructor from size
* \details vector is initialized to zero
*/
vector(index_type n) {
vector(index_type n, NLmemoryType mem_type = NL_DEVICE_MEMORY) {
mem_type_ = mem_type;
n_ = n;
data_ = NL_NEW_VECTOR(nlCUDABlas(), NL_DEVICE_MEMORY, n_);
data_ = NL_NEW_VECTOR(nlCUDABlas(), mem_type_, n_);
temp_ = nullptr;
clear(); // TODO: check whether it is necessary to clear.
}

/**
* \brief vector constructor from data on host and size
*/
vector(const value_type* x_on_host, index_type n) {
vector(
const value_type* x_on_host, index_type n,
NLmemoryType mem_type = NL_DEVICE_MEMORY
) {
mem_type_ = mem_type;
n_ = n;
data_ = NL_NEW_VECTOR(nlCUDABlas(), NL_DEVICE_MEMORY, n_);
data_ = NL_NEW_VECTOR(nlCUDABlas(), mem_type_, n_);
temp_ = nullptr;
copy_from_host(x_on_host, n);
}
@@ -145,7 +150,7 @@
*/
~vector() {
if(data_ != nullptr) {
NL_DELETE_VECTOR(nlCUDABlas(), NL_DEVICE_MEMORY, n_, data_);
NL_DELETE_VECTOR(nlCUDABlas(), mem_type_, n_, data_);
}
n_ = 0;
data_ = nullptr;
@@ -169,6 +174,26 @@
return n_;
}

/**
* \brief Gets the memory type where vector data resides
* \return one of NL_HOST_MEMORY, NL_DEVICE_MEMORY
*/
NLmemoryType mem_type() const {
return mem_type_;
}

double& operator[](index_type i) {
nl_debug_assert(i < n_);
nl_debug_assert(mem_type_ == NL_HOST_MEMORY);
return data_[i];
}

const double& operator[](index_type i) const {
nl_debug_assert(i < n_);
nl_debug_assert(mem_type_ == NL_HOST_MEMORY);
return data_[i];
}

/**
* \brief Gets a const device pointer to the data
* \return a pointer to the stored data on the GPU
@@ -206,7 +231,7 @@
*/
void clear() {
NLBlas_t blas = nlCUDABlas();
blas->Memset(blas, data_, NL_DEVICE_MEMORY, 0, bytes());
blas->Memset(blas, data_, mem_type_, 0, bytes());
}

/**
@@ -223,7 +248,7 @@
NLBlas_t blas = nlCUDABlas();
blas->Memcpy(
blas,
data_, NL_DEVICE_MEMORY,
data_, mem_type_,
const_cast<double*>(x_on_host), NL_HOST_MEMORY,
bytes()
);
@@ -240,7 +265,7 @@
blas->Memcpy(
blas,
x_on_host, NL_HOST_MEMORY,
data_, NL_DEVICE_MEMORY,
data_, mem_type_,
bytes()
);
}
@@ -255,22 +280,28 @@
NLBlas_t blas = nlCUDABlas();
blas->Memcpy(
blas,
data_, NL_DEVICE_MEMORY,
data_, mem_type_,
const_cast<double*>(x_on_device), NL_DEVICE_MEMORY,
bytes()
);
}

void copy_from(const vector& rhs) {
nl_debug_assert(rhs.n_ == n_);
copy_from_device(rhs.data_, rhs.n_);
if(rhs.mem_type() == NL_DEVICE_MEMORY) {
copy_from_device(rhs.data_, rhs.n_);
} else {
copy_from_host(rhs.data_, rhs.n_);
}
}

/**
* \brief computes the dot product between two vectors
*/
static double dot(const vector& x, const vector& y) {
nl_debug_assert(x.size() == y.size());
nl_debug_assert(x.mem_type() == NL_DEVICE_MEMORY);
nl_debug_assert(y.mem_type() == NL_DEVICE_MEMORY);
NLBlas_t blas = nlCUDABlas();
return blas->Ddot(blas,x.n_,x.data_,1,y.data_,1);
}
@@ -280,6 +311,7 @@
* \brief \f$ x \leftarrow a x \f$
*/
static void scal(double a, const vector& x) {
nl_debug_assert(x.mem_type() == NL_DEVICE_MEMORY);
NLBlas_t blas = nlCUDABlas();
blas->Dscal(blas, x.n_, a, x.data_, 1);
}
@@ -289,6 +321,8 @@
*/
static void axpy(double a, const vector& x, vector& y) {
nl_debug_assert(x.size() == y.size());
nl_debug_assert(x.mem_type() == NL_DEVICE_MEMORY);
nl_debug_assert(y.mem_type() == NL_DEVICE_MEMORY);
NLBlas_t blas = nlCUDABlas();
blas->Daxpy(blas, x.n_, a, x.data_, 1, y.data_, 1);
}
@@ -299,13 +333,17 @@
*/
static void mul(const vector& M, const vector& x, vector& y) {
nl_debug_assert(x.size() == M.size() && y.size() == M.size());
nl_debug_assert(M.mem_type() == NL_DEVICE_MEMORY);
nl_debug_assert(x.mem_type() == NL_DEVICE_MEMORY);
nl_debug_assert(y.mem_type() == NL_DEVICE_MEMORY);
NLBlas_t blas = nlCUDABlas();
blas->Dmul(blas,M.size(),M.data_,x.data_,y.data_);
}

private:
double* data_;
index_type n_;
double* data_; // on device memory
NLmemoryType mem_type_;
mutable std::shared_ptr<vector> temp_; // temporary vector for vmul()
};

@@ -401,21 +439,23 @@
template <class Matrix, class Params> cuda_skyline_lu(
const Matrix &A, const Params&
): Base(*A),
rhs_on_host_(amgcl::backend::rows(*A)),
x_on_host_(amgcl::backend::rows(*A)) {
rhs_on_host_(amgcl::backend::rows(*A), NL_HOST_MEMORY),
x_on_host_(amgcl::backend::rows(*A), NL_HOST_MEMORY) {
}

void operator()(const vector &rhs, vector &x) const {
rhs.copy_to_host(rhs_on_host_);
static_cast<const Base*>(this)->operator()(rhs_on_host_, x_on_host_);
x.copy_from_host(x_on_host_);
void operator()(const vector &rhs_on_device, vector &x_on_device) const {
rhs_on_host_.copy_from(rhs_on_device);
static_cast<const Base*>(this)->operator()(
rhs_on_host_, x_on_host_
);
x_on_device.copy_from(x_on_host_);
}
mutable std::vector<value_type> rhs_on_host_;
mutable std::vector<value_type> x_on_host_;

mutable amgcl2nl::vector rhs_on_host_;
mutable amgcl2nl::vector x_on_host_;
};
}


namespace amgcl { namespace backend {

/**
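The nl_amgcl.cpp half of the change threads an NLmemoryType through amgcl2nl::vector so that the skyline-LU direct solver can keep its rhs_on_host_ / x_on_host_ mirrors in pinned host memory and address them with the new operator[]. A hypothetical round-trip helper (cpu_round_trip is not code from the commit; it only uses the member functions visible in the hunks above) shows the intended usage pattern:

void cpu_round_trip(
    const amgcl2nl::vector& rhs_on_device, amgcl2nl::vector& x_on_device
) {
    // Host-side mirrors live in pinned memory (NL_HOST_MEMORY now maps to
    // cudaMallocHost() in nl_cuda.c), so the two copies below are fast
    // host <-> device transfers.
    amgcl2nl::vector rhs_on_host(rhs_on_device.size(), NL_HOST_MEMORY);
    amgcl2nl::vector x_on_host(x_on_device.size(), NL_HOST_MEMORY);

    rhs_on_host.copy_from(rhs_on_device);   // device -> pinned host

    // operator[] is only legal on NL_HOST_MEMORY vectors (see the
    // nl_debug_assert() above); this loop stands in for the CPU-side solve.
    for (size_t i = 0; i < size_t(rhs_on_host.size()); ++i) {
        x_on_host[i] = rhs_on_host[i];
    }

    x_on_device.copy_from(x_on_host);       // pinned host -> device
}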
21 changes: 15 additions & 6 deletions src/lib/geogram/NL/nl_cuda.c
@@ -206,6 +206,8 @@ typedef cudaError_t (*FUNPTR_cudaDeviceGetAttribute)(
typedef cudaError_t (*FUNPTR_cudaDeviceReset)(void);
typedef cudaError_t (*FUNPTR_cudaMalloc)(void **devPtr, size_t size);
typedef cudaError_t (*FUNPTR_cudaFree)(void* devPtr);
typedef cudaError_t (*FUNPTR_cudaMallocHost)(void **devPtr, size_t size);
typedef cudaError_t (*FUNPTR_cudaFreeHost)(void* devPtr);
typedef cudaError_t (*FUNPTR_cudaMemcpy)(
void *dst, const void *src, size_t count, enum cudaMemcpyKind kind
);
@@ -545,6 +547,8 @@ typedef struct {
FUNPTR_cudaDeviceReset cudaDeviceReset;
FUNPTR_cudaMalloc cudaMalloc;
FUNPTR_cudaFree cudaFree;
FUNPTR_cudaMalloc cudaMallocHost;
FUNPTR_cudaFree cudaFreeHost;
FUNPTR_cudaMemcpy cudaMemcpy;
FUNPTR_cudaMemset cudaMemset;
FUNPTR_cudaMemGetInfo cudaMemGetInfo;
@@ -609,6 +613,8 @@ NLboolean nlExtensionIsInitialized_CUDA(void) {
CUDA()->cudaDeviceReset == NULL ||
CUDA()->cudaMalloc == NULL ||
CUDA()->cudaFree == NULL ||
CUDA()->cudaMallocHost == NULL ||
CUDA()->cudaFreeHost == NULL ||
CUDA()->cudaMemcpy == NULL ||
CUDA()->cudaMemset == NULL ||
CUDA()->cudaMemGetInfo == NULL ||
@@ -885,6 +891,8 @@ NLboolean nlInitExtension_CUDA(void) {
find_cuda_func(cudaDeviceReset);
find_cuda_func(cudaMalloc);
find_cuda_func(cudaFree);
find_cuda_func(cudaMallocHost);
find_cuda_func(cudaFreeHost);
find_cuda_func(cudaMemcpy);
find_cuda_func(cudaMemset);
find_cuda_func(cudaMemGetInfo);
@@ -1610,7 +1618,9 @@ static void* cuda_blas_malloc(
blas->max_used_ram[type],blas->used_ram[type]
);
if(type == NL_HOST_MEMORY) {
result = malloc(size);
// result = malloc(size);
// pinned memory, makes Host <-> device xfers faster
nlCUDACheck(CUDA()->cudaMallocHost(&result,size));
} else {
nlCUDACheck(CUDA()->cudaMalloc(&result,size));
}
@@ -1622,7 +1632,9 @@ static void cuda_blas_free(
) {
blas->used_ram[type] -= (NLulong)size;
if(type == NL_HOST_MEMORY) {
free(ptr);
// free(ptr);
// pinned memory, makes Host <-> device xfers faster
nlCUDACheck(CUDA()->cudaFreeHost(ptr));
} else {
nlCUDACheck(CUDA()->cudaFree(ptr));
}
@@ -1768,12 +1780,9 @@ static void cuda_blas_show_stats(NLBlas_t blas) {
static void cuda_blas_show_stats(NLBlas_t blas) {
size_t free_RAM;
size_t total_RAM;
nl_printf("BLAS stats\n");
nl_printf("----------\n");
nl_printf(" GFlops: %d\n", nlBlasGFlops(blas));
CUDA()->cudaMemGetInfo(&free_RAM, &total_RAM);
nl_printf(
" used GPU RAM: %f GB / total GPU RAM: %f GB (free: %f GB)\n",
"NLBlas: used GPU RAM: %f / total: %f GB (free: %f GB)\n",
(double)(total_RAM - free_RAM)/1e9,
(double)total_RAM/1e9,
(double)free_RAM/1e9

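nl_cuda.c never links against the CUDA runtime directly: each entry point gets a FUNPTR_ typedef, a slot in the struct of function pointers, a NULL check in nlExtensionIsInitialized_CUDA(), and a find_cuda_func() lookup, which is exactly what the second half of the diff extends for cudaMallocHost() and cudaFreeHost(). A simplified C++ sketch of that pattern using plain dlopen()/dlsym() (the library name, the int return type standing in for cudaError_t, and the my_ prefixes are assumptions, not the actual implementation):

#include <dlfcn.h>
#include <cstdio>
#include <cstddef>

// Same idea as the FUNPTR_xxx typedefs in nl_cuda.c: fix the signature,
// then resolve the symbol at runtime from the CUDA runtime library.
typedef int (*FUNPTR_cudaMallocHost)(void** ptr, size_t size);
typedef int (*FUNPTR_cudaFreeHost)(void* ptr);

int main() {
    void* handle = dlopen("libcudart.so", RTLD_NOW);  // assumed library name
    if (handle == nullptr) {
        std::printf("CUDA runtime not found\n");
        return 1;
    }

    FUNPTR_cudaMallocHost my_cudaMallocHost =
        (FUNPTR_cudaMallocHost)dlsym(handle, "cudaMallocHost");
    FUNPTR_cudaFreeHost my_cudaFreeHost =
        (FUNPTR_cudaFreeHost)dlsym(handle, "cudaFreeHost");

    // nlExtensionIsInitialized_CUDA() performs the same kind of NULL checks
    // before declaring the extension usable.
    if (my_cudaMallocHost == nullptr || my_cudaFreeHost == nullptr) {
        dlclose(handle);
        return 1;
    }

    void* pinned = nullptr;
    if (my_cudaMallocHost(&pinned, 1024) == 0) {  // 0 == cudaSuccess
        my_cudaFreeHost(pinned);
    }
    dlclose(handle);
    return 0;
}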