From 47a902f0e1bc2a6631984dc05b062d644b39806d Mon Sep 17 00:00:00 2001
From: Ben Albrecht
Date: Tue, 12 Jan 2016 17:12:29 -0800
Subject: [PATCH 1/3] Serial implementation for Stencil, Synch_p2p, and Transpose

Shared implementation for Stencil and Transpose as well
---
 .gitignore                             |   1 -
 CHAPEL/Stencil/Makefile                |  25 ++++++
 CHAPEL/Stencil/stencil-serial.chpl     | 102 ++++++++++++++++++++++++
 CHAPEL/Stencil/stencil-shared.chpl     | 105 +++++++++++++++++++++++++
 CHAPEL/Synch_p2p/Makefile              |  25 ++++++
 CHAPEL/Synch_p2p/p2p-serial.chpl       |  78 ++++++++++++++++++
 CHAPEL/Transpose/Makefile              |  25 ++++++
 CHAPEL/Transpose/transpose-serial.chpl |  96 ++++++++++++++++++++++
 CHAPEL/Transpose/transpose-shared.chpl |  90 +++++++++++++++++++++
 include/PRK.chpl                       |   5 ++
 10 files changed, 551 insertions(+), 1 deletion(-)
 create mode 100644 CHAPEL/Stencil/Makefile
 create mode 100644 CHAPEL/Stencil/stencil-serial.chpl
 create mode 100644 CHAPEL/Stencil/stencil-shared.chpl
 create mode 100644 CHAPEL/Synch_p2p/Makefile
 create mode 100644 CHAPEL/Synch_p2p/p2p-serial.chpl
 create mode 100644 CHAPEL/Transpose/Makefile
 create mode 100644 CHAPEL/Transpose/transpose-serial.chpl
 create mode 100644 CHAPEL/Transpose/transpose-shared.chpl
 create mode 100644 include/PRK.chpl

diff --git a/.gitignore b/.gitignore
index 0670c4db5..cdec027f9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -164,4 +164,3 @@ FORTRAN/transpose-coarray
 FORTRAN/transpose-openmp
 FORTRAN/transpose-openmp-target
 FORTRAN/transpose-pretty
-
diff --git a/CHAPEL/Stencil/Makefile b/CHAPEL/Stencil/Makefile
new file mode 100644
index 000000000..433097617
--- /dev/null
+++ b/CHAPEL/Stencil/Makefile
@@ -0,0 +1,25 @@
+# Chapel Makefile
+
+# TODO Find a way to make this fit into the existing Makefile system
+
+OPTFLAGS = --fast -sassertNoSlicing
+MODULEFLAGS = --module-dir ../../include/
+
+# Debugging
+DEBUGFLAGS =
+ifdef DEBUG
+DEBUGFLAGS = --savec c
+endif
+
+CHPL = chpl
+
+SOURCES = stencil-serial.chpl stencil-shared.chpl
+EXECUTABLE = $(SOURCES:.chpl=)
+
+all: $(EXECUTABLE)
+
+%: %.chpl
+	$(CHPL) $(OPTFLAGS) $(MODULEFLAGS) $(DEBUGFLAGS) $*.chpl -o $*
+
+clean:
+	rm -f $(EXECUTABLE)
diff --git a/CHAPEL/Stencil/stencil-serial.chpl b/CHAPEL/Stencil/stencil-serial.chpl
new file mode 100644
index 000000000..167a11061
--- /dev/null
+++ b/CHAPEL/Stencil/stencil-serial.chpl
@@ -0,0 +1,102 @@
+// Chapel's serial stencil implementation
+use PRK;
+
+// Note: Defaulting to STAR stencil (defines weight)
+// Configurable runtime constants
+config const n: int = 100;            // grid is n x n
+config const iterations: int = 100;   // timed iterations (one extra warmup runs first)
+config const debug: bool = false;     // print the norms even when validation passes
+
+// Compile constants
+param R = 2;                          // stencil radius
+param coefx = 1.0;
+param coefy = 1.0;
+param epsilon = 1.e-8;                // validation tolerance on the L1 norm
+
+// Runtime constants
+const activePoints = (n-2*R)*(n-2*R); // number of interior points
+const stencilSize = 4*R + 1;          // points in a star stencil of radius R
+
+// Timer
+var timer: Timer;
+
+// Domains
+const Dom = {0.. # n, 0.. # n},
+      InnerDom = Dom.expand(-R),
+      W = {-R..R, -R..R};
+
+// Arrays
+var input, output: [Dom] real = 0.0;
+var weight: [W] real = 0.0;
+
+for i in 1..R do {
+  const element = 1.0 / (2.0*i*R);
+  weight[0, i] = element;
+  weight[i, 0] = element;
+  weight[-i, 0] = -element;
+  weight[0, -i] = -element;
+}
+
+// Initialize the input and output arrays
+serial do [(i, j) in Dom] input[i,j] = coefx*i+coefy*j;
+
+// Print information before main loop
+writeln("Parallel Research Kernels Version ", PRKVERSION);
+writeln("Serial stencil execution on 2D grid");
+writeln("Grid size = ", n);
+writeln("Radius of stencil = ", R);
+writeln("Type of stencil = star"); // Temporarily hard-coded
+writeln("Data type = double precision");
+writeln("Untiled"); // Temporarily hard-coded
+writeln("Number of iterations = ", iterations);
+
+for iteration in 0..iterations do {
+  // Start timer after warmup iteration
+  if (iteration == 1) {
+    timer.start();
+  }
+
+  for (i,j) in InnerDom {
+    for jj in -R..R do output[i, j] += weight[0, jj] * input[i, j+jj];
+    for ii in -R..-1 do output[i, j] += weight[ii, 0] * input[i+ii, j];
+    for ii in 1..R do output[i, j] += weight[ii, 0] * input[i+ii, j];
+  }
+
+
+  // Add constant to solution to force refresh of neighbor data, if any
+  for (i,j) in Dom do input[i,j] += 1.0;
+
+} // end of iterations
+
+timer.stop();
+
+// Timings
+var stencilTime = timer.elapsed();
+writeln("stencil_time: ", stencilTime);
+
+// Compute L1 norm (output is zero outside InnerDom, so reducing over Dom is safe)
+var norm = + reduce abs(output);
+
+norm /= activePoints;
+
+/*******************************************************************************
+** Analyze and output results.
+********************************************************************************/
+
+// Verify correctness
+var referenceNorm = (iterations + 1) * (coefx + coefy);
+
+if abs(norm-referenceNorm) > epsilon then {
+  writeln("ERROR: L1 norm = ", norm, ", Reference L1 norm = ", referenceNorm);
+  exit(1);
+} else {
+  writeln("Solution validates");
+  if debug then {
+    writeln("L1 norm = ", norm, ", Reference L1 norm = ", referenceNorm);
+  }
+}
+
+var flops = (2*stencilSize + 1) * activePoints;
+var avgTime = stencilTime / iterations;
+writeln("Rate (MFlops/s): ", 1.0E-06 * flops/avgTime,
+        " Avg time (s): ", avgTime);
diff --git a/CHAPEL/Stencil/stencil-shared.chpl b/CHAPEL/Stencil/stencil-shared.chpl
new file mode 100644
index 000000000..41e86df3e
--- /dev/null
+++ b/CHAPEL/Stencil/stencil-shared.chpl
@@ -0,0 +1,105 @@
+// Chapel's shared-memory parallel stencil
+use PRK;
+
+// Note: Defaulting to STAR stencil (defines weight)
+// Configurable runtime constants
+config const n: int = 100;            // grid is n x n
+config const iterations: int = 100;   // timed iterations (one extra warmup runs first)
+config const debug: bool = false;     // print the norms even when validation passes
+
+// Compile constants
+param R = 2;                          // stencil radius
+param coefx = 1.0;
+param coefy = 1.0;
+param epsilon = 1.e-8;                // validation tolerance on the L1 norm
+
+// Runtime constants
+const activePoints = (n-2*R)*(n-2*R); // number of interior points
+const stencilSize = 4*R + 1;          // points in a star stencil of radius R
+
+// Timer
+var timer: Timer;
+
+// Domains
+const Dom = {0.. # n, 0.. # n},
+      ProblemSpace = Dom,
+      InnerDom = ProblemSpace.expand(-R),
+      W = {-R..R, -R..R};
+
+// Arrays (initialized to zeros)
+var input, output: [ProblemSpace] real = 0.0;
+var weight: [W] real = 0.0;
+
+forall i in 1..R do {
+  const element = 1.0 / (2.0*i*R);
+  weight[0, i] = element;
+  weight[i, 0] = element;
+  weight[-i, 0] = -element;
+  weight[0, -i] = -element;
+}
+
+// Initialize the input and output arrays
+[(i, j) in ProblemSpace] input[i,j] = coefx*i+coefy*j;
+
+// Print information before main loop
+writeln("Parallel Research Kernels Version ", PRKVERSION);
+writeln("Shared-memory parallel stencil execution on 2D grid");
+writeln("Grid size = ", n);
+writeln("Radius of stencil = ", R);
+writeln("Type of stencil = star"); // Temporarily hard-coded
+writeln("Data type = double precision");
+writeln("Untiled"); // Temporarily hard-coded
+writeln("Number of iterations = ", iterations);
+
+for iteration in 0..iterations {
+
+  // Start timer after warmup iteration
+  if (iteration == 1) {
+    timer.start();
+  }
+
+  forall (i,j) in InnerDom {  // interior only: the stencil reads R cells past (i,j)
+    for jj in -R..R do output[i, j] += weight[0, jj] * input[i, j+jj];
+    for ii in -R..-1 do output[i, j] += weight[ii, 0] * input[i+ii, j];
+    for ii in 1..R do output[i, j] += weight[ii, 0] * input[i+ii, j];
+  }
+
+  // Add constant to solution to force refresh of neighbor data, if any
+  forall (i,j) in ProblemSpace {
+    input[i, j] += 1.0;
+  }
+
+} // end of iterations
+
+timer.stop();
+
+// Timings
+var stencilTime = timer.elapsed();
+writeln("stencil_time: ", stencilTime);
+
+// Compute L1 norm in parallel
+var norm = + reduce abs(output);
+
+norm /= activePoints;
+
+/*******************************************************************************
+** Analyze and output results.
+********************************************************************************/
+
+// Verify correctness
+var referenceNorm = (iterations + 1) * (coefx + coefy);
+
+if (abs(norm-referenceNorm) > epsilon) {
+  writeln("ERROR: L1 norm = ", norm, ", Reference L1 norm = ", referenceNorm);
+  exit(1);
+} else {
+  writeln("Solution validates");
+  if debug {
+    writeln("L1 norm = ", norm, ", Reference L1 norm = ", referenceNorm);
+  }
+}
+
+var flops = (2*stencilSize + 1) * activePoints;
+var avgTime = stencilTime / iterations;
+writeln("Rate (MFlops/s): ", 1.0E-06 * flops/avgTime,
+        " Avg time (s): ", avgTime);
diff --git a/CHAPEL/Synch_p2p/Makefile b/CHAPEL/Synch_p2p/Makefile
new file mode 100644
index 000000000..24a3e018a
--- /dev/null
+++ b/CHAPEL/Synch_p2p/Makefile
@@ -0,0 +1,25 @@
+# Chapel Makefile
+
+# TODO Find a way to make this fit into the existing Makefile system
+
+OPTFLAGS = --fast -sassertNoSlicing
+MODULEFLAGS = --module-dir ../../include/
+
+# Debugging
+DEBUGFLAGS =
+ifdef DEBUG
+DEBUGFLAGS = --savec c
+endif
+
+CHPL = chpl
+
+SOURCES = p2p-serial.chpl
+EXECUTABLE = $(SOURCES:.chpl=)
+
+all: $(EXECUTABLE)
+
+%: %.chpl
+	$(CHPL) $(OPTFLAGS) $(MODULEFLAGS) $(DEBUGFLAGS) $*.chpl -o $*
+
+clean:
+	rm -f $(EXECUTABLE)
diff --git a/CHAPEL/Synch_p2p/p2p-serial.chpl b/CHAPEL/Synch_p2p/p2p-serial.chpl
new file mode 100644
index 000000000..2c912347c
--- /dev/null
+++ b/CHAPEL/Synch_p2p/p2p-serial.chpl
@@ -0,0 +1,78 @@
+// Chapel's serial implementation of synch_p2p
+
+use PRK;
+
+config const iterations = 1000;
+config const m = 1000;
+config const n = 1000;
+config const debug: bool = false;
+
+var timer: Timer;  // must be a variable: start()/stop() are called on it below
+
+if (iterations < 1) {
+  writeln("ERROR: iterations must be >= 1 : ", iterations);
+  exit(1);
+}
+
+if (m < 1 || n < 1) {
+  writeln("ERROR: grid dimensions must be positive:", m, ", ", n);
+  exit(1);
+}
+
+// Initialize and zero out vector
+const mrange = 0 .. # m,
+      nrange = 0 .. # n,
+      Dom = {mrange, nrange},
+      Dom1 = {1 ..(m-1), 1 .. (n-1)};  // every point except the top row and left column
+
+var vector : [Dom] real = 0.0;
+
+writeln("Parallel Research Kernels version ", PRKVERSION);
+writeln("Serial pipeline execution on 2D grid");
+writeln("Grid sizes = ", m, ", ", n);
+writeln("Number of iterations = ", iterations);
+
+// Set boundary values (top and left side of grid)
+serial do [j in nrange] vector[0,j] = j;
+serial do [i in mrange] vector[i,0] = i;
+
+for iteration in 0 .. iterations {
+
+  // Start timer after warmup iteration
+  if (iteration == 1) then timer.start();
+
+  for (i,j) in Dom1 {
+    vector[i, j] = vector[i-1, j] + vector[i, j-1] - vector[i-1, j-1];
+  }
+
+  // Copy bottom right corner value to top left, creating dependency
+  vector[0, 0] = -vector[m-1, n-1];
+}
+
+timer.stop();
+
+var pipelineTime = timer.elapsed();
+
+// Analyze and output results
+
+// Error threshold
+const epsilon = 1.e-8;
+
+// Verify correctness, using bottom right value of array
+var cornerValue : real = (iterations + 1)*(n + m - 2);
+if (abs(vector[m-1, n-1] - cornerValue) / cornerValue > epsilon) {
+  writeln("ERROR: checksum ", vector[m-1, n-1],
+          " does not match verification value ", cornerValue);
+  exit(1);
+}
+
+if (debug) {
+  writeln("Solution validates; verification value = ", cornerValue);
+} else {
+  writeln("Solution validates");
+}
+
+var avgTime = pipelineTime / iterations;
+
+writeln("Rate (MFlops/s): ", 1.0e-6 * 2 * ((m-1)*(n-1)) / avgTime,
+        " Avg time (s): ", avgTime);
diff --git a/CHAPEL/Transpose/Makefile b/CHAPEL/Transpose/Makefile
new file mode 100644
index 000000000..a4c41faf0
--- /dev/null
+++ b/CHAPEL/Transpose/Makefile
@@ -0,0 +1,25 @@
+# Chapel Makefile
+
+# TODO Find a way to make this fit into the existing Makefile system
+
+OPTFLAGS = --fast -sassertNoSlicing
+MODULEFLAGS = --module-dir ../../include/
+
+# Debugging
+DEBUGFLAGS =
+ifdef DEBUG
+DEBUGFLAGS = --savec c
+endif
+
+CHPL = chpl
+
+SOURCES = transpose-serial.chpl transpose-shared.chpl
+EXECUTABLE = $(SOURCES:.chpl=)
+
+all: $(EXECUTABLE)
+
+%: %.chpl
+	$(CHPL) $(OPTFLAGS) $(MODULEFLAGS) $(DEBUGFLAGS) $*.chpl -o $*
+
+clean:
+	rm -f $(EXECUTABLE)
diff --git a/CHAPEL/Transpose/transpose-serial.chpl b/CHAPEL/Transpose/transpose-serial.chpl
new file mode 100644
index 000000000..e5caf9b44
--- /dev/null
+++ b/CHAPEL/Transpose/transpose-serial.chpl
@@ -0,0 +1,96 @@
+// Chapel's serial implementation of transpose
+
+use PRK;
+
+config const iterations : int = 100;
+config const order : int = 100;
+config const tileSize : int = 32;
+config const debug: bool = false;
+
+// Timer
+var timer: Timer;
+
+var tiled : bool;
+var bytes = 2.0 * numBytes(real) * order * order;
+
+const Dom = {0.. # order, 0.. # order},
+      tiledDom = {0.. # order by tileSize, 0.. # order by tileSize};  // top-left corner of each tile
+var A, B : [Dom] real;
+
+
+writeln("Parallel Research Kernels version ", PRKVERSION);
+writeln("Serial Matrix transpose: B = A^T");
+writeln("Matrix order = ", order);
+if (tileSize < order && tileSize > 0) {
+  writeln("Tile size = ", tileSize);
+  tiled = true;
+} else {
+  writeln("Untiled");
+  tiled = false;
+}
+writeln("Number of iterations = ", iterations);
+
+
+// Fill original column matrix
+for (i,j) in Dom {
+  A[i,j] = order*j + i;
+}
+
+// Set transpose matrix to known garbage value
+B = -1.0;
+
+var work = 0;  // element copies performed; reported only with --debug
+// Main loop
+for iteration in 0..iterations {
+  // Start timer after a warmup lap
+  if (iteration == 1) then timer.start();
+
+  if (tiled) {
+    for (i,j) in tiledDom {
+      for it in i .. # min(order - i, tileSize) {
+        for jt in j .. # min(order - j, tileSize) {
+          B[jt,it] = A[it,jt];
+          work += 1;
+        }
+      }
+    }
+  }
+  else {
+    for (i,j) in Dom {
+      B[j,i] = A[i,j];
+      work += 1;
+    }
+  }
+
+} // end of main loop
+
+timer.stop();
+
+// Analyze and output results
+
+var transposeTime = timer.elapsed();
+
+
+var abserr = 0.0;
+for (i,j) in Dom {
+  abserr += abs(B[i,j] - (order*i + j));
+}
+
+if (debug) {
+  writeln("transposeTime = ", transposeTime);
+  writeln("work = ", work);
+  writeln("Sum of absolute differences: ", abserr);
+}
+
+// Error tolerance
+const epsilon = 1.e-8;
+
+if (abserr < epsilon) {
+  writeln("Solution validates\n");
+  var avgtime = transposeTime / iterations;
+  writeln("Rate (MB/s): ", 1.0E-06 * bytes / avgtime,
+          " Avg time (s): ", avgtime);
+} else {
+  writeln("ERROR: Aggregate squared error ", abserr,
+          " exceeds threshold ", epsilon);
+}
diff --git a/CHAPEL/Transpose/transpose-shared.chpl b/CHAPEL/Transpose/transpose-shared.chpl
new file mode 100644
index 000000000..4228f7da7
--- /dev/null
+++ b/CHAPEL/Transpose/transpose-shared.chpl
@@ -0,0 +1,90 @@
+// Chapel's shared-memory parallel implementation of transpose
+
+use PRK;
+
+config const iterations : int = 100;
+config const order : int = 100;
+config const tileSize : int = 32;
+config const debug: bool = false;
+
+// Timer
+var timer: Timer;
+
+var tiled : bool;
+var bytes = 2.0 * numBytes(real) * order * order;
+
+const Dom = {0.. # order, 0.. # order},
+      tiledDom = {0.. # order by tileSize, 0.. # order by tileSize};  // top-left corner of each tile
+var A, B : [Dom] real;
+
+
+writeln("Parallel Research Kernels version ", PRKVERSION);
+writeln("Shared-memory parallel Matrix transpose: B = A^T");
+writeln("Matrix order = ", order);
+if (tileSize < order && tileSize > 0) {
+  writeln("Tile size = ", tileSize);
+  tiled = true;
+} else {
+  writeln("Untiled");
+  tiled = false;
+}
+writeln("Number of iterations = ", iterations);
+
+
+// Fill original column matrix
+[(i, j) in Dom] A[i,j] = order*j + i;
+
+// Set transpose matrix to known garbage value
+B = -1.0;
+
+// Main loop
+for iteration in 0..iterations {
+  // Start timer after a warmup lap
+  if (iteration == 1) then timer.start();
+
+  if (tiled) {
+    forall (i,j) in tiledDom {
+      for it in i .. # min(order - i, tileSize) {
+        for jt in j .. # min(order - j, tileSize) {
+          B[jt,it] = A[it,jt];
+        }
+      }
+    }
+  }
+  else {
+    forall (i,j) in Dom {
+      B[j,i] = A[i,j];
+    }
+  }
+
+} // end of main loop
+
+timer.stop();
+
+// Analyze and output results
+
+var transposeTime = timer.elapsed();
+
+
+var abserr = 0.0;
+for (i,j) in Dom {
+  abserr += abs(B[i,j] - (order*i + j));
+}
+
+if (debug) {
+  writeln("transposeTime = ", transposeTime);
+  writeln("Sum of absolute differences: ", abserr);
+}
+
+// Error tolerance
+const epsilon = 1.e-8;
+
+if (abserr < epsilon) {
+  writeln("Solution validates\n");
+  var avgtime = transposeTime / iterations;
+  writeln("Rate (MB/s): ", 1.0E-06 * bytes / avgtime,
+          " Avg time (s): ", avgtime);
+} else {
+  writeln("ERROR: Aggregate squared error ", abserr,
+          " exceeds threshold ", epsilon);
+}
diff --git a/include/PRK.chpl b/include/PRK.chpl
new file mode 100644
index 000000000..3cc01b651
--- /dev/null
+++ b/include/PRK.chpl
@@ -0,0 +1,5 @@
+// Common modules, definitions, and constants across Chapel implementations
+
+use Time;
+
+param PRKVERSION = "2.15";

From 90bd80077fb8f195b71a84f0a0c6667f977c5e4f Mon Sep 17 00:00:00 2001
From: Jeff Hammond
Date: Sat, 17 Jun 2017 10:43:49 -0700
Subject: [PATCH 2/3] fix printout [ci skip]

---
 travis/install-mpi.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/travis/install-mpi.sh b/travis/install-mpi.sh
index afe39d850..5e022e28a 100755
--- a/travis/install-mpi.sh
+++ b/travis/install-mpi.sh
@@ -55,7 +55,7 @@ case "$os" in
                 if [ -f "`which clang$clangversion`" ]; then
                     export PRK_CC="clang$clangversion"
                     export PRK_CXX="clang++$clangversion"
-                    echo "Found GCC: $PRK_CC"
+                    echo "Found Clang: $PRK_CC"
                     break
                 fi
             done

From d4fed6e972d4511e6cc56668674248af7f30c333 Mon Sep 17 00:00:00 2001
From: Jeff Hammond
Date: Tue, 18 Feb 2020 17:04:16 -0800
Subject: [PATCH 3/3] reduce PR delta

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 41f4962ec..0b5905f43 100644
--- a/.gitignore
+++ b/.gitignore
@@ -281,6 +281,7 @@ FORTRAN/transpose-coarray
 FORTRAN/transpose-openmp
 FORTRAN/transpose-openmp-target
 FORTRAN/transpose-pretty
+FORTRAN/transpose-ornlacc
 FORTRAN/transpose-taskloop-openmp
 FORTRAN/transpose-tasks-openmp
 FORTRAN/transpose-ornlacc