From 47a902f0e1bc2a6631984dc05b062d644b39806d Mon Sep 17 00:00:00 2001
From: Ben Albrecht <benalbrecht@pitt.edu>
Date: Tue, 12 Jan 2016 17:12:29 -0800
Subject: [PATCH 1/3] Serial implementation for Stencil, Synch_p2p, and
 Transpose

Shared implementation for Stencil and Transpose as well
---
 .gitignore                             |   1 -
 CHAPEL/Stencil/Makefile                |  25 ++++++
 CHAPEL/Stencil/stencil-serial.chpl     | 102 ++++++++++++++++++++++++
 CHAPEL/Stencil/stencil-shared.chpl     | 105 +++++++++++++++++++++++++
 CHAPEL/Synch_p2p/Makefile              |  25 ++++++
 CHAPEL/Synch_p2p/p2p-serial.chpl       |  78 ++++++++++++++++++
 CHAPEL/Transpose/Makefile              |  25 ++++++
 CHAPEL/Transpose/transpose-serial.chpl |  96 ++++++++++++++++++++++
 CHAPEL/Transpose/transpose-shared.chpl |  90 +++++++++++++++++++++
 include/PRK.chpl                       |   5 ++
 10 files changed, 551 insertions(+), 1 deletion(-)
 create mode 100644 CHAPEL/Stencil/Makefile
 create mode 100644 CHAPEL/Stencil/stencil-serial.chpl
 create mode 100644 CHAPEL/Stencil/stencil-shared.chpl
 create mode 100644 CHAPEL/Synch_p2p/Makefile
 create mode 100644 CHAPEL/Synch_p2p/p2p-serial.chpl
 create mode 100644 CHAPEL/Transpose/Makefile
 create mode 100644 CHAPEL/Transpose/transpose-serial.chpl
 create mode 100644 CHAPEL/Transpose/transpose-shared.chpl
 create mode 100644 include/PRK.chpl

diff --git a/.gitignore b/.gitignore
index 0670c4db5..cdec027f9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -164,4 +164,3 @@ FORTRAN/transpose-coarray
 FORTRAN/transpose-openmp
 FORTRAN/transpose-openmp-target
 FORTRAN/transpose-pretty
-
diff --git a/CHAPEL/Stencil/Makefile b/CHAPEL/Stencil/Makefile
new file mode 100644
index 000000000..433097617
--- /dev/null
+++ b/CHAPEL/Stencil/Makefile
@@ -0,0 +1,25 @@
+# Chapel Makefile
+
+# TODO Find a way to make this fit into the existing Makefile system
+
+OPTFLAGS = --fast -sassertNoSlicing
+MODULEFLAGS = --module-dir ../../include/
+
+# Debugging
+DEBUGFLAGS =
+ifdef DEBUG
+DEBUGFLAGS = --savec c
+endif
+
+CHPL = chpl
+
+SOURCES = stencil-serial.chpl stencil-shared.chpl
+EXECUTABLE = $(SOURCES:.chpl=)
+
+all: $(EXECUTABLE)
+
+%: %.chpl
+	$(CHPL) $(OPTFLAGS) $(MODULEFLAGS) $(DEBUGFLAGS) $*.chpl -o $*
+
+clean:
+	rm -f $(EXECUTABLE)
diff --git a/CHAPEL/Stencil/stencil-serial.chpl b/CHAPEL/Stencil/stencil-serial.chpl
new file mode 100644
index 000000000..167a11061
--- /dev/null
+++ b/CHAPEL/Stencil/stencil-serial.chpl
@@ -0,0 +1,102 @@
+// Chapel's serial stencil implementation
+use PRK;
+
+// Note: Defaulting to STAR stencil (defines weight)
+// Configurable runtime constants
+config const n: int = 100;
+config const iterations: int = 100;
+config const debug: bool = false;
+
+// Compile constants
+param R = 2;
+param coefx = 1.0;
+param coefy = 1.0;
+param epsilon = 1.e-8;
+
+// Runtime constants
+const activePoints = (n-2*R)*(n-2*R);
+const stencilSize = 4*R + 1;
+
+// Timer
+var timer: Timer;
+
+// Domains
+const Dom = {0.. # n, 0.. # n},
+      InnerDom = Dom.expand(-R),
+       W = {-R..R, -R..R};
+
+// Arrays
+var input, output: [Dom] real = 0.0;
+var weight: [W] real = 0.0;
+
+for i in 1..R do {
+  const element = 1.0 / (2.0*i*R);
+  weight[0, i]  =  element;
+  weight[i, 0]  =  element;
+  weight[-i, 0] = -element;
+  weight[0, -i] = -element;
+}
+
+// Initialize the input and output arrays
+serial do [(i, j) in Dom] input[i,j] = coefx*i+coefy*j;
+
+// Print information before main loop
+writeln("Parallel Research Kernels Version ", PRKVERSION);
+writeln("Serial stencil execution on 2D grid");
+writeln("Grid size            = ", n);
+writeln("Radius of stencil    = ", R);
+writeln("Type of stencil      = star"); // Temporarily hard-coded
+writeln("Data type            = double precision");
+writeln("Untiled");                     // Temporarily hard-coded
+writeln("Number of iterations = ", iterations);
+
+for iteration in 0..iterations do {
+  // Start timer after warmup iteration
+  if (iteration == 1) {
+    timer.start();
+  }
+
+  for (i,j) in InnerDom {
+    for jj in -R..R  do output[i, j] += weight[0, jj] * input[i, j+jj];
+    for ii in -R..-1 do output[i, j] += weight[ii, 0] * input[i+ii, j];
+    for ii in 1..R   do output[i, j] += weight[ii, 0] * input[i+ii, j];
+  }
+
+
+  // Add constant to solution to force refresh of neighbor data, if any
+  for (i,j) in Dom do input[i,j] += 1.0;
+
+} // end of iterations
+
+timer.stop();
+
+// Timings
+var stencilTime = timer.elapsed();
+writeln("stencil_time: ", stencilTime);
+
+// Compute L1 norm in parallel
+var norm = + reduce abs(output);
+
+norm /= activePoints;
+
+/*******************************************************************************
+** Analyze and output results.
+********************************************************************************/
+
+// Verify correctness
+var referenceNorm = (iterations + 1) * (coefx + coefy);
+
+if abs(norm-referenceNorm) > epsilon then {
+  writeln("ERROR: L1 norm = ", norm, ", Reference L1 norm = ", referenceNorm);
+  exit(1);
+} else {
+  writeln("Solution validates");
+  if debug then {
+    writeln("L1 norm = ", norm, ", Reference L1 norm = ", referenceNorm);
+  }
+}
+
+var flops = (2*stencilSize + 1) * activePoints;
+var avgTime = stencilTime / iterations;
+writeln("Rate (MFlops/s): ", 1.0E-06 * flops/avgTime,
+        "  Avg time (s): ", avgTime);
diff --git a/CHAPEL/Stencil/stencil-shared.chpl b/CHAPEL/Stencil/stencil-shared.chpl
new file mode 100644
index 000000000..41e86df3e
--- /dev/null
+++ b/CHAPEL/Stencil/stencil-shared.chpl
@@ -0,0 +1,105 @@
+// Chapel's shared-memory parallel stencil
+use PRK;
+
+// Note: Defaulting to STAR stencil (defines weight)
+// Configurable runtime constants
+config const n: int = 100;
+config const iterations: int = 100;
+config const debug: bool = false;
+
+// Compile constants
+param R = 2;
+param coefx = 1.0;
+param coefy = 1.0;
+param epsilon = 1.e-8;
+
+// Runtime constants
+const activePoints = (n-2*R)*(n-2*R);
+const stencilSize = 4*R + 1;
+
+// Timer
+var timer: Timer;
+
+// Domains
+const    Dom = {0.. # n, 0.. # n},
+ProblemSpace = Dom,
+    InnerDom = ProblemSpace.expand(-R),
+           W = {-R..R, -R..R};
+
+// Arrays (initialized to zeros)
+var input, output:  [ProblemSpace] real = 0.0;
+var weight: [W] real = 0.0;
+
+forall i in 1..R do {
+  const element = 1.0 / (2.0*i*R);
+  weight[0, i]  =  element;
+  weight[i, 0]  =  element;
+  weight[-i, 0] = -element;
+  weight[0, -i] = -element;
+}
+
+// Initialize the input and output arrays
+[(i, j) in ProblemSpace] input[i,j] = coefx*i+coefy*j;
+
+// Print information before main loop
+writeln("Parallel Research Kernels Version ", PRKVERSION);
+writeln("Shared-memory parallel stencil execution on 2D grid");
+writeln("Grid size            = ", n);
+writeln("Radius of stencil    = ", R);
+writeln("Type of stencil      = star"); // Temporarily hard-coded
+writeln("Data type            = double precision");
+writeln("Untiled");                     // Temporarily hard-coded
+writeln("Number of iterations = ", iterations);
+
+for iteration in 0..iterations {
+
+  // Start timer after warmup iteration
+  if (iteration == 1) {
+    timer.start();
+  }
+
+  forall (i,j) in InnerDom { // interior only: the stencil reads up to R cells away
+    for jj in -R..R  do output[i, j] += weight[0, jj] * input[i, j+jj];
+    for ii in -R..-1 do output[i, j] += weight[ii, 0] * input[i+ii, j];
+    for ii in 1..R   do output[i, j] += weight[ii, 0] * input[i+ii, j];
+  }
+
+  // Add constant to solution to force refresh of neighbor data, if any
+  forall (i,j) in ProblemSpace {
+    input[i, j] += 1.0;
+  }
+
+} // end of iterations
+
+timer.stop();
+
+// Timings
+var stencilTime = timer.elapsed();
+writeln("stencil_time: ", stencilTime);
+
+// Compute L1 norm in parallel
+var norm = + reduce abs(output);
+
+norm /= activePoints;
+
+/*******************************************************************************
+** Analyze and output results.
+********************************************************************************/
+
+// Verify correctness
+var referenceNorm = (iterations + 1) * (coefx + coefy);
+
+if (abs(norm-referenceNorm) > epsilon) {
+  writeln("ERROR: L1 norm = ", norm, ", Reference L1 norm = ", referenceNorm);
+  exit(1);
+} else {
+  writeln("Solution validates");
+  if debug {
+    writeln("L1 norm = ", norm, ", Reference L1 norm = ", referenceNorm);
+  }
+}
+
+var flops = (2*stencilSize + 1) * activePoints;
+var avgTime = stencilTime / iterations;
+writeln("Rate (MFlops/s): ", 1.0E-06 * flops/avgTime,
+        "  Avg time (s): ", avgTime);
diff --git a/CHAPEL/Synch_p2p/Makefile b/CHAPEL/Synch_p2p/Makefile
new file mode 100644
index 000000000..24a3e018a
--- /dev/null
+++ b/CHAPEL/Synch_p2p/Makefile
@@ -0,0 +1,25 @@
+# Chapel Makefile
+
+# TODO Find a way to make this fit into the existing Makefile system
+
+OPTFLAGS = --fast -sassertNoSlicing
+MODULEFLAGS = --module-dir ../../include/
+
+# Debugging
+DEBUGFLAGS =
+ifdef DEBUG
+DEBUGFLAGS = --savec c
+endif
+
+CHPL = chpl
+
+SOURCES = p2p-serial.chpl
+EXECUTABLE = $(SOURCES:.chpl=)
+
+all: $(EXECUTABLE)
+
+%: %.chpl
+	$(CHPL) $(OPTFLAGS) $(MODULEFLAGS) $(DEBUGFLAGS) $*.chpl -o $*
+
+clean:
+	rm -f $(EXECUTABLE)
diff --git a/CHAPEL/Synch_p2p/p2p-serial.chpl b/CHAPEL/Synch_p2p/p2p-serial.chpl
new file mode 100644
index 000000000..2c912347c
--- /dev/null
+++ b/CHAPEL/Synch_p2p/p2p-serial.chpl
@@ -0,0 +1,78 @@
+// Chapel's serial implementation of synch_p2p
+
+use PRK;
+
+config const iterations = 1000;
+config const m = 1000;
+config const n = 1000;
+config const debug: bool = false;
+
+var timer: Timer; // must be a mutable variable: start()/stop() update its state
+
+if (iterations < 1) {
+  writeln("ERROR: iterations must be >= 1 : ", iterations);
+  exit(1);
+}
+
+if (m < 1 || n < 1) {
+  writeln("ERROR: grid dimensions must be positive:", m, ", ", n);
+  exit(1);
+}
+
+// Initialize and zero out vector
+const mrange = 0 .. # m,
+      nrange = 0 .. # n,
+      Dom = {mrange, nrange},
+      Dom1 = {1 ..(m-1), 1 .. (n-1)};
+
+var vector : [Dom] real = 0.0;
+
+writeln("Parallel Research Kernels version ", PRKVERSION);
+writeln("Serial pipeline execution on 2D grid");
+writeln("Grid sizes                = ", m, ", ", n);
+writeln("Number of iterations      = ", iterations);
+
+// Set boundary values (top and left side of grid)
+serial do [j in nrange] vector[0,j] = j;
+serial do [i in mrange] vector[i,0] = i;
+
+for iteration in 0 .. iterations {
+
+  // Start timer after warmup iteration
+  if (iteration == 1) then timer.start();
+
+  for (i,j) in Dom1 {
+    vector[i, j] = vector[i-1, j] + vector[i, j-1] - vector[i-1, j-1];
+  }
+
+  // Copy bottom right corner value to top left, creating dependency
+  vector[0, 0] = -vector[m-1, n-1];
+}
+
+timer.stop();
+
+var pipelineTime = timer.elapsed();
+
+// Analyze and output results
+
+// Error threshold
+const epsilon = 1.e-8;
+
+// Verify correctness, using bottom right value of array
+var cornerValue : real = (iterations + 1)*(n + m - 2);
+if (abs(vector[m-1, n-1] - cornerValue) / cornerValue > epsilon) {
+  writeln("ERROR: checksum ", vector[m-1, n-1],
+          " does not match verification value ", cornerValue);
+  exit(1);
+}
+
+if (debug) {
+  writeln("Solution validates; verification value = ", cornerValue);
+} else {
+  writeln("Solution validates");
+}
+
+var avgTime = pipelineTime / iterations;
+
+writeln("Rate (MFlops/s): ", 1.0e-6 * 2 * ((m-1)*(n-1)) / avgTime,
+        " Avg time (s): ", avgTime);
diff --git a/CHAPEL/Transpose/Makefile b/CHAPEL/Transpose/Makefile
new file mode 100644
index 000000000..a4c41faf0
--- /dev/null
+++ b/CHAPEL/Transpose/Makefile
@@ -0,0 +1,25 @@
+# Chapel Makefile
+
+# TODO Find a way to make this fit into the existing Makefile system
+
+OPTFLAGS = --fast -sassertNoSlicing
+MODULEFLAGS = --module-dir ../../include/
+
+# Debugging
+DEBUGFLAGS =
+ifdef DEBUG
+DEBUGFLAGS = --savec c
+endif
+
+CHPL = chpl
+
+SOURCES = transpose-serial.chpl transpose-shared.chpl
+EXECUTABLE = $(SOURCES:.chpl=)
+
+all: $(EXECUTABLE)
+
+%: %.chpl
+	$(CHPL) $(OPTFLAGS) $(MODULEFLAGS) $(DEBUGFLAGS) $*.chpl -o $*
+
+clean:
+	rm -f $(EXECUTABLE)
diff --git a/CHAPEL/Transpose/transpose-serial.chpl b/CHAPEL/Transpose/transpose-serial.chpl
new file mode 100644
index 000000000..e5caf9b44
--- /dev/null
+++ b/CHAPEL/Transpose/transpose-serial.chpl
@@ -0,0 +1,96 @@
+// Chapel's serial implementation of transpose
+
+use PRK;
+
+config const iterations : int = 100;
+config const order : int = 100;
+config const tileSize : int = 32;
+config const debug: bool = false;
+
+// Timer
+var timer: Timer;
+
+var tiled : bool;
+var bytes = 2.0 * numBytes(real) * order * order;
+
+const Dom = {0.. # order, 0.. # order},
+ tiledDom = {0.. # order by tileSize, 0.. # order by tileSize};
+var A, B : [Dom] real;
+
+
+writeln("Parallel Research Kernels version ", PRKVERSION);
+writeln("Serial Matrix transpose: B = A^T");
+writeln("Matrix order          = ", order);
+if (tileSize < order && tileSize > 0) {
+  writeln("Tile size              = ", tileSize);
+  tiled = true;
+} else {
+  writeln("Untiled");
+  tiled = false;
+}
+writeln("Number of iterations = ", iterations);
+
+
+// Fill original column matrix
+for (i,j) in Dom {
+  A[i,j] = order*j + i;
+}
+
+// Set transpose matrix to known garbage value
+B = -1.0;
+
+var work = 0;
+// Main loop
+for iteration in 0..iterations {
+  // Start timer after a warmup lap
+  if (iteration == 1) then timer.start();
+
+  if (tiled) {
+    for (i,j) in tiledDom {
+      for it in i .. # min(order - i, tileSize) {
+        for jt in j .. # min(order - j, tileSize) {
+          B[jt,it] = A[it,jt];
+          work += 1;
+        }
+      }
+    }
+  }
+  else {
+    for (i,j) in Dom {
+      B[j,i] = A[i,j];
+      work += 1;
+    }
+  }
+
+} // end of main loop
+
+timer.stop();
+
+// Analyze and output results
+
+var transposeTime = timer.elapsed();
+
+
+var abserr = 0.0;
+for (i,j) in Dom {
+  abserr += abs(B[i,j] - (order*i + j));
+}
+
+if (debug) {
+  writeln("transposeTime = ", transposeTime);
+  writeln("work = ", work);
+  writeln("Sum of absolute differences: ", abserr);
+}
+
+// Error tolerance
+const epsilon = 1.e-8;
+
+if (abserr < epsilon) {
+  writeln("Solution validates\n");
+  var avgtime = transposeTime / iterations;
+  writeln("Rate (MB/s): ", 1.0E-06 * bytes / avgtime,
+          " Avg time (s): ", avgtime);
+} else {
+  writeln("ERROR: Aggregate squared error ", abserr,
+          " exceeds threshold ", epsilon);
+}
diff --git a/CHAPEL/Transpose/transpose-shared.chpl b/CHAPEL/Transpose/transpose-shared.chpl
new file mode 100644
index 000000000..4228f7da7
--- /dev/null
+++ b/CHAPEL/Transpose/transpose-shared.chpl
@@ -0,0 +1,90 @@
+// Chapel's shared-memory parallel implementation of transpose
+
+use PRK;
+
+config const iterations : int = 100;
+config const order : int = 100;
+config const tileSize : int = 32;
+config const debug: bool = false;
+
+// Timer
+var timer: Timer;
+
+var tiled : bool;
+var bytes = 2.0 * numBytes(real) * order * order;
+
+const Dom = {0.. # order, 0.. # order},
+ tiledDom = {0.. # order by tileSize, 0.. # order by tileSize};
+var A, B : [Dom] real;
+
+
+writeln("Parallel Research Kernels version ", PRKVERSION);
+writeln("Shared-memory parallel Matrix transpose: B = A^T");
+writeln("Matrix order          = ", order);
+if (tileSize < order && tileSize > 0) {
+  writeln("Tile size              = ", tileSize);
+  tiled = true;
+} else {
+  writeln("Untiled");
+  tiled = false;
+}
+writeln("Number of iterations = ", iterations);
+
+
+// Fill original column matrix
+[(i, j) in Dom] A[i,j] = order*j + i;
+
+// Set transpose matrix to known garbage value
+B = -1.0;
+
+// Main loop
+for iteration in 0..iterations {
+  // Start timer after a warmup lap
+  if (iteration == 1) then timer.start();
+
+  if (tiled) {
+    forall (i,j) in tiledDom {
+      for it in i .. # min(order - i, tileSize) {
+        for jt in j .. # min(order - j, tileSize) {
+          B[jt,it] = A[it,jt];
+        }
+      }
+    }
+  }
+  else {
+    forall (i,j) in Dom {
+      B[j,i] = A[i,j];
+    }
+  }
+
+} // end of main loop
+
+timer.stop();
+
+// Analyze and output results
+
+var transposeTime = timer.elapsed();
+
+
+var abserr = 0.0;
+for (i,j) in Dom {
+  abserr += abs(B[i,j] - (order*i + j));
+}
+
+if (debug) {
+  writeln("transposeTime = ", transposeTime);
+  writeln("Sum of absolute differences: ", abserr);
+}
+
+// Error tolerance
+const epsilon = 1.e-8;
+
+if (abserr < epsilon) {
+  writeln("Solution validates\n");
+  var avgtime = transposeTime / iterations;
+  writeln("Rate (MB/s): ", 1.0E-06 * bytes / avgtime,
+          " Avg time (s): ", avgtime);
+} else {
+  writeln("ERROR: Aggregate squared error ", abserr,
+          " exceeds threshold ", epsilon);
+}
diff --git a/include/PRK.chpl b/include/PRK.chpl
new file mode 100644
index 000000000..3cc01b651
--- /dev/null
+++ b/include/PRK.chpl
@@ -0,0 +1,5 @@
+// Common modules, definitions, and constants across Chapel implementations
+
+use Time;
+
+param PRKVERSION = "2.15";

From 90bd80077fb8f195b71a84f0a0c6667f977c5e4f Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.science@gmail.com>
Date: Sat, 17 Jun 2017 10:43:49 -0700
Subject: [PATCH 2/3] fix printout [ci skip]

---
 travis/install-mpi.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/travis/install-mpi.sh b/travis/install-mpi.sh
index afe39d850..5e022e28a 100755
--- a/travis/install-mpi.sh
+++ b/travis/install-mpi.sh
@@ -55,7 +55,7 @@ case "$os" in
                     if [ -f "`which clang$clangversion`" ]; then
                         export PRK_CC="clang$clangversion"
                         export PRK_CXX="clang++$clangversion"
-                        echo "Found GCC: $PRK_CC"
+                        echo "Found Clang: $PRK_CC"
                         break
                     fi
                 done

From d4fed6e972d4511e6cc56668674248af7f30c333 Mon Sep 17 00:00:00 2001
From: Jeff Hammond <jeff.r.hammond@intel.com>
Date: Tue, 18 Feb 2020 17:04:16 -0800
Subject: [PATCH 3/3] reduce PR delta

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 41f4962ec..0b5905f43 100644
--- a/.gitignore
+++ b/.gitignore
@@ -281,6 +281,7 @@ FORTRAN/transpose-coarray
 FORTRAN/transpose-openmp
 FORTRAN/transpose-openmp-target
 FORTRAN/transpose-pretty
+FORTRAN/transpose-ornlacc
 FORTRAN/transpose-taskloop-openmp
 FORTRAN/transpose-tasks-openmp
 FORTRAN/transpose-ornlacc