Skip to content

Commit

Permalink
Only initialise cublas if GPU offloading is enabled at runtime
Browse files Browse the repository at this point in the history
In the example programs (pddrive, pddrive3d, and so on), check whether
GPU offloading is enabled via SUPERLU_ACC_OFFLOAD before creating and
destroying cuBLAS handles. Otherwise, pddrive may fail or hang in cublasDestroy.
  • Loading branch information
jamtrott committed Oct 20, 2022
1 parent 98695a4 commit 0520fc1
Show file tree
Hide file tree
Showing 6 changed files with 201 additions and 168 deletions.
62 changes: 34 additions & 28 deletions EXAMPLE/pddrive.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,26 +155,29 @@ int main(int argc, char *argv[])
SUPERLU_FREE(usermap);

#ifdef GPU_ACC
/* Binding each MPI to a GPU device */
char *ttemp;
ttemp = getenv ("SUPERLU_BIND_MPI_GPU");

if (ttemp) {
int devs, rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank); // MPI_COMM_WORLD needs to be used here instead of SubComm
gpuGetDeviceCount(&devs); // Returns the number of compute-capable devices
gpuSetDevice(rank % devs); // Set device to be used for GPU executions
}
int superlu_acc_offload = get_acc_offload();
if (superlu_acc_offload) {
/* Binding each MPI to a GPU device */
char *ttemp;
ttemp = getenv ("SUPERLU_BIND_MPI_GPU");

if (ttemp) {
int devs, rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank); // MPI_COMM_WORLD needs to be used here instead of SubComm
gpuGetDeviceCount(&devs); // Returns the number of compute-capable devices
gpuSetDevice(rank % devs); // Set device to be used for GPU executions
}

// This is to initialize GPU, which can be costly.
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
// This is to initialize GPU, which can be costly.
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
}
#endif
// printf("grid.iam %5d, myrank %5d\n",grid.iam,myrank);
// fflush(stdout);
Expand All @@ -186,15 +189,18 @@ int main(int argc, char *argv[])
superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);

#ifdef GPU_ACC
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
int superlu_acc_offload = get_acc_offload();
if (superlu_acc_offload) {
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
}
#endif
}

Expand Down
61 changes: 33 additions & 28 deletions EXAMPLE/pddrive3d.c
Original file line number Diff line number Diff line change
Expand Up @@ -222,26 +222,28 @@ main (int argc, char *argv[])
SUPERLU_FREE(usermap);

#ifdef GPU_ACC
/* Binding each MPI to a GPU device */
char *ttemp;
ttemp = getenv ("SUPERLU_BIND_MPI_GPU");

if (ttemp) {
int devs, rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank); // MPI_COMM_WORLD needs to be used here instead of SubComm
gpuGetDeviceCount(&devs); // Returns the number of compute-capable devices
gpuSetDevice(rank % devs); // Set device to be used for GPU executions
int superlu_acc_offload = get_acc_offload();
if (superlu_acc_offload) {
/* Binding each MPI to a GPU device */
char *ttemp;
ttemp = getenv ("SUPERLU_BIND_MPI_GPU");

if (ttemp) {
int devs, rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank); // MPI_COMM_WORLD needs to be used here instead of SubComm
gpuGetDeviceCount(&devs); // Returns the number of compute-capable devices
gpuSetDevice(rank % devs); // Set device to be used for GPU executions
}
// This is to initialize GPU, which can be costly.
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
}
// This is to initialize GPU, which can be costly.
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);

#endif

// printf("grid.iam %5d, myrank %5d\n",grid.iam,myrank);
Expand All @@ -253,15 +255,18 @@ main (int argc, char *argv[])
------------------------------------------------------------ */
superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid);
#ifdef GPU_ACC
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
int superlu_acc_offload = get_acc_offload();
if (superlu_acc_offload) {
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
}
#endif
}

Expand Down
62 changes: 34 additions & 28 deletions EXAMPLE/psdrive.c
Original file line number Diff line number Diff line change
Expand Up @@ -155,26 +155,29 @@ int main(int argc, char *argv[])
SUPERLU_FREE(usermap);

#ifdef GPU_ACC
/* Binding each MPI to a GPU device */
char *ttemp;
ttemp = getenv ("SUPERLU_BIND_MPI_GPU");

if (ttemp) {
int devs, rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank); // MPI_COMM_WORLD needs to be used here instead of SubComm
gpuGetDeviceCount(&devs); // Returns the number of compute-capable devices
gpuSetDevice(rank % devs); // Set device to be used for GPU executions
}
int superlu_acc_offload = get_acc_offload();
if (superlu_acc_offload) {
/* Binding each MPI to a GPU device */
char *ttemp;
ttemp = getenv ("SUPERLU_BIND_MPI_GPU");

if (ttemp) {
int devs, rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank); // MPI_COMM_WORLD needs to be used here instead of SubComm
gpuGetDeviceCount(&devs); // Returns the number of compute-capable devices
gpuSetDevice(rank % devs); // Set device to be used for GPU executions
}

// This is to initialize GPU, which can be costly.
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
// This is to initialize GPU, which can be costly.
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
}
#endif
// printf("grid.iam %5d, myrank %5d\n",grid.iam,myrank);
// fflush(stdout);
Expand All @@ -186,15 +189,18 @@ int main(int argc, char *argv[])
superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);

#ifdef GPU_ACC
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
int superlu_acc_offload = get_acc_offload();
if (superlu_acc_offload) {
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
}
#endif
}

Expand Down
61 changes: 33 additions & 28 deletions EXAMPLE/psdrive3d.c
Original file line number Diff line number Diff line change
Expand Up @@ -222,26 +222,28 @@ main (int argc, char *argv[])
SUPERLU_FREE(usermap);

#ifdef GPU_ACC
/* Binding each MPI to a GPU device */
char *ttemp;
ttemp = getenv ("SUPERLU_BIND_MPI_GPU");

if (ttemp) {
int devs, rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank); // MPI_COMM_WORLD needs to be used here instead of SubComm
gpuGetDeviceCount(&devs); // Returns the number of compute-capable devices
gpuSetDevice(rank % devs); // Set device to be used for GPU executions
int superlu_acc_offload = get_acc_offload();
if (superlu_acc_offload) {
/* Binding each MPI to a GPU device */
char *ttemp;
ttemp = getenv ("SUPERLU_BIND_MPI_GPU");

if (ttemp) {
int devs, rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank); // MPI_COMM_WORLD needs to be used here instead of SubComm
gpuGetDeviceCount(&devs); // Returns the number of compute-capable devices
gpuSetDevice(rank % devs); // Set device to be used for GPU executions
}
// This is to initialize GPU, which can be costly.
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
}
// This is to initialize GPU, which can be costly.
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);

#endif

// printf("grid.iam %5d, myrank %5d\n",grid.iam,myrank);
Expand All @@ -253,15 +255,18 @@ main (int argc, char *argv[])
------------------------------------------------------------ */
superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid);
#ifdef GPU_ACC
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
int superlu_acc_offload = get_acc_offload();
if (superlu_acc_offload) {
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
}
#endif
}

Expand Down
62 changes: 34 additions & 28 deletions EXAMPLE/pzdrive.c
Original file line number Diff line number Diff line change
Expand Up @@ -154,26 +154,29 @@ int main(int argc, char *argv[])
SUPERLU_FREE(usermap);

#ifdef GPU_ACC
/* Binding each MPI to a GPU device */
char *ttemp;
ttemp = getenv ("SUPERLU_BIND_MPI_GPU");

if (ttemp) {
int devs, rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank); // MPI_COMM_WORLD needs to be used here instead of SubComm
gpuGetDeviceCount(&devs); // Returns the number of compute-capable devices
gpuSetDevice(rank % devs); // Set device to be used for GPU executions
}
int superlu_acc_offload = get_acc_offload();
if (superlu_acc_offload) {
/* Binding each MPI to a GPU device */
char *ttemp;
ttemp = getenv ("SUPERLU_BIND_MPI_GPU");

if (ttemp) {
int devs, rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank); // MPI_COMM_WORLD needs to be used here instead of SubComm
gpuGetDeviceCount(&devs); // Returns the number of compute-capable devices
gpuSetDevice(rank % devs); // Set device to be used for GPU executions
}

// This is to initialize GPU, which can be costly.
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
// This is to initialize GPU, which can be costly.
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
}
#endif
// printf("grid.iam %5d, myrank %5d\n",grid.iam,myrank);
// fflush(stdout);
Expand All @@ -185,15 +188,18 @@ int main(int argc, char *argv[])
superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);

#ifdef GPU_ACC
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
int superlu_acc_offload = get_acc_offload();
if (superlu_acc_offload) {
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
double t1 = SuperLU_timer_();
gpuFree(0);
double t2 = SuperLU_timer_();
if(!myrank)printf("first gpufree time: %7.4f\n",t2-t1);
gpublasHandle_t hb;
gpublasCreate(&hb);
if(!myrank)printf("first blas create time: %7.4f\n",SuperLU_timer_()-t2);
gpublasDestroy(hb);
}
#endif
}

Expand Down
Loading

0 comments on commit 0520fc1

Please sign in to comment.