nl_cuda: BLAS allocations on NL_HOST_MEMORY now use cudaMallocHost()
  (creates pinned memory)
AMGCL backend: the backend's vector type can now be allocated either in device
   memory or in the host's pinned memory. Used by the direct solver, which
   copies data back and forth between the GPU and the CPU to solve on the CPU.
BrunoLevy committed Nov 20, 2024
1 parent b7f132e commit 68cc47a
Showing 2 changed files with 75 additions and 26 deletions.
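Background for the change: cudaMallocHost() returns page-locked (pinned) host memory that the GPU can access by DMA, so host <-> device copies out of it are faster than copies from ordinary malloc()'ed, pageable memory. Below is a minimal standalone C++ sketch (not part of this commit) that exercises the same CUDA runtime calls directly, without OpenNL's dynamically loaded wrappers:

#include <cuda_runtime.h>
#include <cstdio>

int main() {
    const size_t n = 1u << 20;
    const size_t bytes = n * sizeof(double);

    double* h = nullptr;  /* pinned (page-locked) host buffer */
    double* d = nullptr;  /* device buffer */

    /* cudaMallocHost() returns page-locked memory: the GPU can DMA directly
       from/to it, so cudaMemcpy() between host and device is faster than
       with pageable memory obtained from malloc(). */
    if (cudaMallocHost((void**)&h, bytes) != cudaSuccess) { return 1; }
    if (cudaMalloc((void**)&d, bytes) != cudaSuccess) { return 1; }

    for (size_t i = 0; i < n; ++i) { h[i] = double(i); }

    cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);  /* pinned -> device */
    cudaMemcpy(h, d, bytes, cudaMemcpyDeviceToHost);  /* device -> pinned */

    std::printf("h[42] = %f\n", h[42]);

    cudaFree(d);
    cudaFreeHost(h);  /* pinned memory must be released with cudaFreeHost() */
    return 0;
}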
80 changes: 60 additions & 20 deletions src/lib/geogram/NL/nl_amgcl.cpp
@@ -123,19 +123,24 @@ namespace amgcl2nl {
* \brief vector constructor from size
* \details vector is initialized to zero
*/
vector(index_type n) {
vector(index_type n, NLmemoryType mem_type = NL_DEVICE_MEMORY) {
mem_type_ = mem_type;
n_ = n;
data_ = NL_NEW_VECTOR(nlCUDABlas(), NL_DEVICE_MEMORY, n_);
data_ = NL_NEW_VECTOR(nlCUDABlas(), mem_type_, n_);
temp_ = nullptr;
clear(); // TODO: check whether it is necessary to clear.
}

/**
* \brief vector constructor from data on host and size
*/
vector(const value_type* x_on_host, index_type n) {
vector(
const value_type* x_on_host, index_type n,
NLmemoryType mem_type = NL_DEVICE_MEMORY
) {
mem_type_ = mem_type;
n_ = n;
data_ = NL_NEW_VECTOR(nlCUDABlas(), NL_DEVICE_MEMORY, n_);
data_ = NL_NEW_VECTOR(nlCUDABlas(), mem_type_, n_);
temp_ = nullptr;
copy_from_host(x_on_host, n);
}
@@ -145,7 +150,7 @@
*/
~vector() {
if(data_ != nullptr) {
NL_DELETE_VECTOR(nlCUDABlas(), NL_DEVICE_MEMORY, n_, data_);
NL_DELETE_VECTOR(nlCUDABlas(), mem_type_, n_, data_);
}
n_ = 0;
data_ = nullptr;
@@ -169,6 +174,26 @@
return n_;
}

/**
* \brief Gets the memory type where vector data resides
* \return one of NL_HOST_MEMORY, NL_DEVICE_MEMORY
*/
NLmemoryType mem_type() const {
return mem_type_;
}

double& operator[](index_type i) {
nl_debug_assert(i < n_);
nl_debug_assert(mem_type_ == NL_HOST_MEMORY);
return data_[i];
}

const double& operator[](index_type i) const {
nl_debug_assert(i < n_);
nl_debug_assert(mem_type_ == NL_HOST_MEMORY);
return data_[i];
}

/**
* \brief Gets a const device pointer to the data
* \return a pointer to the stored data on the GPU
@@ -206,7 +231,7 @@
*/
void clear() {
NLBlas_t blas = nlCUDABlas();
blas->Memset(blas, data_, NL_DEVICE_MEMORY, 0, bytes());
blas->Memset(blas, data_, mem_type_, 0, bytes());
}

/**
@@ -223,7 +248,7 @@
NLBlas_t blas = nlCUDABlas();
blas->Memcpy(
blas,
data_, NL_DEVICE_MEMORY,
data_, mem_type_,
const_cast<double*>(x_on_host), NL_HOST_MEMORY,
bytes()
);
@@ -240,7 +265,7 @@
blas->Memcpy(
blas,
x_on_host, NL_HOST_MEMORY,
data_, NL_DEVICE_MEMORY,
data_, mem_type_,
bytes()
);
}
@@ -255,22 +280,28 @@
NLBlas_t blas = nlCUDABlas();
blas->Memcpy(
blas,
data_, NL_DEVICE_MEMORY,
data_, mem_type_,
const_cast<double*>(x_on_device), NL_DEVICE_MEMORY,
bytes()
);
}

void copy_from(const vector& rhs) {
nl_debug_assert(rhs.n_ == n_);
copy_from_device(rhs.data_, rhs.n_);
if(rhs.mem_type() == NL_DEVICE_MEMORY) {
copy_from_device(rhs.data_, rhs.n_);
} else {
copy_from_host(rhs.data_, rhs.n_);
}
}

/**
* \brief computes the dot product between two vectors
*/
static double dot(const vector& x, const vector& y) {
nl_debug_assert(x.size() == y.size());
nl_debug_assert(x.mem_type() == NL_DEVICE_MEMORY);
nl_debug_assert(y.mem_type() == NL_DEVICE_MEMORY);
NLBlas_t blas = nlCUDABlas();
return blas->Ddot(blas,x.n_,x.data_,1,y.data_,1);
}
@@ -280,6 +311,7 @@
* \brief \f$ x \leftarrow a x \f$
*/
static void scal(double a, const vector& x) {
nl_debug_assert(x.mem_type() == NL_DEVICE_MEMORY);
NLBlas_t blas = nlCUDABlas();
blas->Dscal(blas, x.n_, a, x.data_, 1);
}
@@ -289,6 +321,8 @@
*/
static void axpy(double a, const vector& x, vector& y) {
nl_debug_assert(x.size() == y.size());
nl_debug_assert(x.mem_type() == NL_DEVICE_MEMORY);
nl_debug_assert(y.mem_type() == NL_DEVICE_MEMORY);
NLBlas_t blas = nlCUDABlas();
blas->Daxpy(blas, x.n_, a, x.data_, 1, y.data_, 1);
}
@@ -299,13 +333,17 @@
*/
static void mul(const vector& M, const vector& x, vector& y) {
nl_debug_assert(x.size() == M.size() && y.size() == M.size());
nl_debug_assert(M.mem_type() == NL_DEVICE_MEMORY);
nl_debug_assert(x.mem_type() == NL_DEVICE_MEMORY);
nl_debug_assert(y.mem_type() == NL_DEVICE_MEMORY);
NLBlas_t blas = nlCUDABlas();
blas->Dmul(blas,M.size(),M.data_,x.data_,y.data_);
}

private:
double* data_;
index_type n_;
double* data_; // on device memory
NLmemoryType mem_type_;
mutable std::shared_ptr<vector> temp_; // temporary vector for vmul()
};

@@ -401,21 +439,23 @@
template <class Matrix, class Params> cuda_skyline_lu(
const Matrix &A, const Params&
): Base(*A),
rhs_on_host_(amgcl::backend::rows(*A)),
x_on_host_(amgcl::backend::rows(*A)) {
rhs_on_host_(amgcl::backend::rows(*A), NL_HOST_MEMORY),
x_on_host_(amgcl::backend::rows(*A), NL_HOST_MEMORY) {
}

void operator()(const vector &rhs, vector &x) const {
rhs.copy_to_host(rhs_on_host_);
static_cast<const Base*>(this)->operator()(rhs_on_host_, x_on_host_);
x.copy_from_host(x_on_host_);
void operator()(const vector &rhs_on_device, vector &x_on_device) const {
rhs_on_host_.copy_from(rhs_on_device);
static_cast<const Base*>(this)->operator()(
rhs_on_host_, x_on_host_
);
x_on_device.copy_from(x_on_host_);
}
mutable std::vector<value_type> rhs_on_host_;
mutable std::vector<value_type> x_on_host_;

mutable amgcl2nl::vector rhs_on_host_;
mutable amgcl2nl::vector x_on_host_;
};
}


namespace amgcl { namespace backend {

/**
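The nl_amgcl.cpp half of the change threads an NLmemoryType through amgcl2nl::vector so that the skyline-LU direct solver can keep its rhs_on_host_ / x_on_host_ mirrors in pinned host memory and address them with the new operator[]. A hypothetical round-trip helper (cpu_round_trip is not code from the commit; it only uses the member functions visible in the hunks above) shows the intended usage pattern:

void cpu_round_trip(
    const amgcl2nl::vector& rhs_on_device, amgcl2nl::vector& x_on_device
) {
    // Host-side mirrors live in pinned memory (NL_HOST_MEMORY now maps to
    // cudaMallocHost() in nl_cuda.c), so the two copies below are fast
    // host <-> device transfers.
    amgcl2nl::vector rhs_on_host(rhs_on_device.size(), NL_HOST_MEMORY);
    amgcl2nl::vector x_on_host(x_on_device.size(), NL_HOST_MEMORY);

    rhs_on_host.copy_from(rhs_on_device);   // device -> pinned host

    // operator[] is only legal on NL_HOST_MEMORY vectors (see the
    // nl_debug_assert() above); this loop stands in for the CPU-side solve.
    for (size_t i = 0; i < size_t(rhs_on_host.size()); ++i) {
        x_on_host[i] = rhs_on_host[i];
    }

    x_on_device.copy_from(x_on_host);       // pinned host -> device
}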
21 changes: 15 additions & 6 deletions src/lib/geogram/NL/nl_cuda.c
@@ -206,6 +206,8 @@ typedef cudaError_t (*FUNPTR_cudaDeviceGetAttribute)(
typedef cudaError_t (*FUNPTR_cudaDeviceReset)(void);
typedef cudaError_t (*FUNPTR_cudaMalloc)(void **devPtr, size_t size);
typedef cudaError_t (*FUNPTR_cudaFree)(void* devPtr);
typedef cudaError_t (*FUNPTR_cudaMallocHost)(void **devPtr, size_t size);
typedef cudaError_t (*FUNPTR_cudaFreeHost)(void* devPtr);
typedef cudaError_t (*FUNPTR_cudaMemcpy)(
void *dst, const void *src, size_t count, enum cudaMemcpyKind kind
);
@@ -545,6 +547,8 @@ typedef struct {
FUNPTR_cudaDeviceReset cudaDeviceReset;
FUNPTR_cudaMalloc cudaMalloc;
FUNPTR_cudaFree cudaFree;
FUNPTR_cudaMalloc cudaMallocHost;
FUNPTR_cudaFree cudaFreeHost;
FUNPTR_cudaMemcpy cudaMemcpy;
FUNPTR_cudaMemset cudaMemset;
FUNPTR_cudaMemGetInfo cudaMemGetInfo;
@@ -609,6 +613,8 @@ NLboolean nlExtensionIsInitialized_CUDA(void) {
CUDA()->cudaDeviceReset == NULL ||
CUDA()->cudaMalloc == NULL ||
CUDA()->cudaFree == NULL ||
CUDA()->cudaMallocHost == NULL ||
CUDA()->cudaFreeHost == NULL ||
CUDA()->cudaMemcpy == NULL ||
CUDA()->cudaMemset == NULL ||
CUDA()->cudaMemGetInfo == NULL ||
@@ -885,6 +891,8 @@ NLboolean nlInitExtension_CUDA(void) {
find_cuda_func(cudaDeviceReset);
find_cuda_func(cudaMalloc);
find_cuda_func(cudaFree);
find_cuda_func(cudaMallocHost);
find_cuda_func(cudaFreeHost);
find_cuda_func(cudaMemcpy);
find_cuda_func(cudaMemset);
find_cuda_func(cudaMemGetInfo);
@@ -1610,7 +1618,9 @@ static void* cuda_blas_malloc(
blas->max_used_ram[type],blas->used_ram[type]
);
if(type == NL_HOST_MEMORY) {
result = malloc(size);
// result = malloc(size);
// pinned memory, makes Host <-> device xfers faster
nlCUDACheck(CUDA()->cudaMallocHost(&result,size));
} else {
nlCUDACheck(CUDA()->cudaMalloc(&result,size));
}
@@ -1622,7 +1632,9 @@ static void cuda_blas_free(
) {
blas->used_ram[type] -= (NLulong)size;
if(type == NL_HOST_MEMORY) {
free(ptr);
// free(ptr);
// pinned memory, makes Host <-> device xfers faster
nlCUDACheck(CUDA()->cudaFreeHost(ptr));
} else {
nlCUDACheck(CUDA()->cudaFree(ptr));
}
@@ -1768,12 +1780,9 @@ static void cuda_blas_show_stats(NLBlas_t blas) {
static void cuda_blas_show_stats(NLBlas_t blas) {
size_t free_RAM;
size_t total_RAM;
nl_printf("BLAS stats\n");
nl_printf("----------\n");
nl_printf(" GFlops: %d\n", nlBlasGFlops(blas));
CUDA()->cudaMemGetInfo(&free_RAM, &total_RAM);
nl_printf(
" used GPU RAM: %f GB / total GPU RAM: %f GB (free: %f GB)\n",
"NLBlas: used GPU RAM: %f / total: %f GB (free: %f GB)\n",
(double)(total_RAM - free_RAM)/1e9,
(double)total_RAM/1e9,
(double)free_RAM/1e9

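nl_cuda.c never links against the CUDA runtime directly: each entry point gets a FUNPTR_ typedef, a slot in the struct of function pointers, a NULL check in nlExtensionIsInitialized_CUDA(), and a find_cuda_func() lookup, which is exactly what the second half of the diff extends for cudaMallocHost() and cudaFreeHost(). A simplified C++ sketch of that pattern using plain dlopen()/dlsym() (the library name, the int return type standing in for cudaError_t, and the my_ prefixes are assumptions, not the actual implementation):

#include <dlfcn.h>
#include <cstdio>
#include <cstddef>

// Same idea as the FUNPTR_xxx typedefs in nl_cuda.c: fix the signature,
// then resolve the symbol at runtime from the CUDA runtime library.
typedef int (*FUNPTR_cudaMallocHost)(void** ptr, size_t size);
typedef int (*FUNPTR_cudaFreeHost)(void* ptr);

int main() {
    void* handle = dlopen("libcudart.so", RTLD_NOW);  // assumed library name
    if (handle == nullptr) {
        std::printf("CUDA runtime not found\n");
        return 1;
    }

    FUNPTR_cudaMallocHost my_cudaMallocHost =
        (FUNPTR_cudaMallocHost)dlsym(handle, "cudaMallocHost");
    FUNPTR_cudaFreeHost my_cudaFreeHost =
        (FUNPTR_cudaFreeHost)dlsym(handle, "cudaFreeHost");

    // nlExtensionIsInitialized_CUDA() performs the same kind of NULL checks
    // before declaring the extension usable.
    if (my_cudaMallocHost == nullptr || my_cudaFreeHost == nullptr) {
        dlclose(handle);
        return 1;
    }

    void* pinned = nullptr;
    if (my_cudaMallocHost(&pinned, 1024) == 0) {  // 0 == cudaSuccess
        my_cudaFreeHost(pinned);
    }
    dlclose(handle);
    return 0;
}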