Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add saxpy(MPI+CUDA) benchmark #3

Open
wants to merge 8 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions saxpy-mpi-cuda/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MPI_DIR = $(OPENMPI_ROOT)
NVCC = nvcc
NVCC_FLAGS = -I$(MPI_DIR)/include
CUDA_INCLUDE = $(CUDA_HOME)/include
LIBS = -L$(MPI_DIR)/lib -lmpi -L$(CUDA_HOME)/lib64 -lcudart -lnvToolsExt
PROFILE_FLAGS = -lineinfo
SRCS = saxpy-mpi-cuda.cu
OBJS = saxpy-mpi-cuda.o
EXECUTABLE = saxpy-mpi-cuda
ARCH = #-arch=sm_80

all: $(EXECUTABLE)
$(EXECUTABLE): $(OBJS)
$(NVCC) $(OBJS) -o $(EXECUTABLE) $(LIBS)

$(OBJS): $(SRCS)
$(NVCC) $(NVCC_FLAGS) $(ARCH) $(PROFILE_FLAGS) -I$(CUDA_INCLUDE) -c $(SRCS) -o $(OBJS)

clean:
rm -rf $(OBJS) $(EXECUTABLE)

15 changes: 15 additions & 0 deletions saxpy-mpi-cuda/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
## MPI+CUDA implementation of the saxpy program

#### Benchmark information
- Uses two processes and one GPU per process.
- Rank 0 initializes the input arrays and sends them to Rank 1.
- Both processes does the same saxpy calculation.
- Rank 1 sends the results to Rank 0.

The following parameters can be used to run the program:
- -i -> number of iterations.
- -N -> the problem size.

#### Example
`./saxpy-mpi-cuda -i 5 -N 32768000`

146 changes: 146 additions & 0 deletions saxpy-mpi-cuda/saxpy-mpi-cuda.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
#include <iostream>
#include <cstdint>
#include <cstdlib>
#include <cstdio>
#include <cuda_runtime.h>
#include <mpi.h>
#include <unistd.h>
#include "nvToolsExt.h"

__global__ void saxpy(double *z, double *x, double *y, double alpha, int N) {

int idx = blockDim.x*blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;

for (int i = idx; i < N; i += stride) {
//if (idx < N)
z[i] = alpha*x[i] + y[i];
}
}

int main(int argc, char *argv[]) {

double *h_z, *h_x, *h_y;
double *d_z, *d_x, *d_y;
double alpha = 1.5;
int N = 4096;
int iterations = 2;

MPI_Init(&argc, &argv);
int commSize, commRank;
MPI_Comm_size(MPI_COMM_WORLD, &commSize);
MPI_Comm_rank(MPI_COMM_WORLD, &commRank);

cudaSetDevice(commRank);

int c;
char* endp;
// parse arguments
while ((c = getopt (argc, argv, "N:i:h")) != -1) {
switch (c) {
case 'N':
N = strtol(optarg, &endp, 10);
break;
case 'i':
iterations = strtol(optarg, &endp, 10);
break;
case 'h':
printf("-N <problem_size> => default: -N 4096\n");
printf("-i <number_of_iterations> => default: -i 2\n");
exit(0);
break;
case '?':
printf("Unknown argument. Use -h to see the options.\n");
exit(1);
break;
}
}

if (commRank == 0) {
printf("Number of iterations: %d\n", iterations);
printf("Problem size (N): %d\n", N);
}

int deviceCount = 0;
cudaGetDeviceCount(&deviceCount);
printf("Rank %d - Number of GPUs: %d\n", commRank, deviceCount);

h_z = new double[N];
h_x = new double[N];
h_y = new double[N];

for (int it = 0; it < iterations; it++){
// initialize
if (commRank == 0){
for (int i = 0; i < N; i += 1) {
h_x[i] = 5.0;
h_y[i] = -2.0;
h_z[i] = 0.0;
}
}

// send the input arrays to the other process.
if (commRank == 0) {
MPI_Send(h_x, N, MPI_DOUBLE, 1, it+0, MPI_COMM_WORLD);
MPI_Send(h_y, N, MPI_DOUBLE, 1, it+1, MPI_COMM_WORLD);
}
else if (commRank == 1) {
MPI_Recv(h_x, N, MPI_DOUBLE, 0, it+0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
MPI_Recv(h_y, N, MPI_DOUBLE, 0, it+1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}

cudaMalloc(&d_z, N*sizeof(double));
cudaMalloc(&d_y, N*sizeof(double));
cudaMalloc(&d_x, N*sizeof(double));

// copy arrays from host to device
cudaMemcpy(d_x, h_x, N*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_y, h_y, N*sizeof(double), cudaMemcpyHostToDevice);

int threadsPerBlock = 512;
int numBlocks = 2; //N/threadsPerBlock + (N % threadsPerBlock != 0);

// kernel call
nvtxRangePushA("saxpy");
saxpy<<<numBlocks, threadsPerBlock>>>(d_z, d_x, d_y, alpha, N);
nvtxRangePop();

// copy arrays back to the host
cudaMemcpy(h_z, d_z, N*sizeof(double), cudaMemcpyDeviceToHost);

// check if the results are correct
bool success = true;
for (size_t i = 0; i < N; i += 1) {
if (std::abs(h_z[i] - (1.5*5.0-2.0)) > 1E-8) {
success = false;
}
}
if (!success) {
printf("Rank %d => Error: incorrect results! it: %d\n", commRank, it);
}
else {
printf("Rank %d => Correct results! it: %d\n", commRank, it);
}

// send the result to rank 0.
if (commRank == 1) {
MPI_Send(h_z, N, MPI_DOUBLE, 0, it+2, MPI_COMM_WORLD);
}
else if (commRank == 0) {
MPI_Recv(h_z, N, MPI_DOUBLE, 1, it+2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
}

// cleaning
delete[] h_x;
delete[] h_y;
delete[] h_z;

cudaFree(d_z);
cudaFree(d_x);
cudaFree(d_y);

MPI_Finalize();

return 0;
}