From 2cd616222b9212069dd1bcfa4f4ad5dafb00b2f2 Mon Sep 17 00:00:00 2001
From: jeremiah-corrado <62707311+jeremiah-corrado@users.noreply.github.com>
Date: Wed, 10 Apr 2024 10:33:55 -0600
Subject: [PATCH] Array API Manipulation Function Improvements (#3056)

* refactor some array manipulation commands to reduce unneeded comm with locale 0

Signed-off-by: Jeremiah Corrado <jeremiah.corrado@hpe.com>

* add implementation of tile, unstack, and repeat to manipulation-functions module. Implement rank-reducing indexing for ND arrays. Bug fixes and performance improvements in manipulation functions.

Signed-off-by: Jeremiah Corrado <jeremiah.corrado@hpe.com>

* fix flake8, mypy and python compat errors

Signed-off-by: Jeremiah Corrado <jeremiah.corrado@hpe.com>

* add error handling to expandDims and stack for cases where max array rank would be exceeded

Signed-off-by: Jeremiah Corrado <jeremiah.corrado@hpe.com>

* add special cases for flatten and unflatten to reshapeMsg

Signed-off-by: Jeremiah Corrado <jeremiah.corrado@hpe.com>

* replace rank reducing slices in flatten/unflatten with explicit put/get operations

Signed-off-by: Jeremiah Corrado <jeremiah.corrado@hpe.com>

* use AryUtil flatten/unflatten in SetMsg

Signed-off-by: Jeremiah Corrado <jeremiah.corrado@hpe.com>

---------

Signed-off-by: Jeremiah Corrado <jeremiah.corrado@hpe.com>
---
 arkouda/array_api/__init__.py                |   8 +
 arkouda/array_api/_manipulation_functions.py | 124 ++--
 arkouda/pdarrayclass.py                      |  49 +-
 src/AryUtil.chpl                             | 148 +++++
 src/ManipulationMsg.chpl                     | 626 +++++++++++++++----
 src/ReductionMsg.chpl                        |   6 +-
 src/SetMsg.chpl                              |  65 --
 src/compat/e-132/ArkoudaAryUtilCompat.chpl   |   4 +-
 src/compat/eq-131/ArkoudaAryUtilCompat.chpl  |   4 +-
 src/compat/eq-133/ArkoudaAryUtilCompat.chpl  |   4 +-
 src/compat/eq-134/ArkoudaAryUtilCompat.chpl  |   4 +-
 src/compat/ge-20/ArkoudaAryUtilCompat.chpl   |   6 +-
 tests/array_api/array_manipulation.py        | 118 +++-
 13 files changed, 860 insertions(+), 306 deletions(-)

diff --git a/arkouda/array_api/__init__.py b/arkouda/array_api/__init__.py
index c353f6fda5..2eee5a4aea 100644
--- a/arkouda/array_api/__init__.py
+++ b/arkouda/array_api/__init__.py
@@ -118,11 +118,15 @@
     concat,
     expand_dims,
     flip,
+    moveaxis,
     permute_dims,
+    repeat,
     reshape,
     roll,
     squeeze,
     stack,
+    tile,
+    unstack,
 )
 
 from ._searching_functions import argmax, argmin, nonzero, where
@@ -255,11 +259,15 @@
     "concat",
     "expand_dims",
     "flip",
+    "moveaxis",
     "permute_dims",
+    "repeat",
     "reshape",
     "roll",
     "squeeze",
     "stack",
+    "tile",
+    "unstack",
 ]
 
 __all__ += ["argmax", "argmin", "nonzero", "where"]
diff --git a/arkouda/array_api/_manipulation_functions.py b/arkouda/array_api/_manipulation_functions.py
index dc5248e813..1880b41d19 100644
--- a/arkouda/array_api/_manipulation_functions.py
+++ b/arkouda/array_api/_manipulation_functions.py
@@ -5,18 +5,13 @@
 from typing import List, Optional, Tuple, Union, cast
 from arkouda.client import generic_msg
 from arkouda.pdarrayclass import create_pdarray
+from arkouda.pdarraycreation import scalar_array
 from arkouda.util import broadcast_dims
 
 import numpy as np
 
 
 def broadcast_arrays(*arrays: Array) -> List[Array]:
-    """
-    Array API compatible wrapper for :py:func:`np.broadcast_arrays <numpy.broadcast_arrays>`.
-
-    See its docstring for more information.
-    """
-
     shapes = [a.shape for a in arrays]
     bcShape = shapes[0]
     for shape in shapes[1:]:
@@ -50,15 +45,9 @@ def broadcast_to(x: Array, /, shape: Tuple[int, ...]) -> Array:
         raise ValueError(f"Failed to broadcast array: {e}")
 
 
-# Note: the function name is different here
 def concat(
     arrays: Union[Tuple[Array, ...], List[Array]], /, *, axis: Optional[int] = 0
 ) -> Array:
-    """
-    Array API compatible wrapper for :py:func:`np.concatenate <numpy.concatenate>`.
-
-    See its docstring for more information.
-    """
     # TODO: type promotion across input arrays
 
     return Array._new(
@@ -81,11 +70,6 @@ def concat(
 
 
 def expand_dims(x: Array, /, *, axis: int) -> Array:
-    """
-    Array API compatible wrapper for :py:func:`np.expand_dims <numpy.expand_dims>`.
-
-    See its docstring for more information.
-    """
     try:
         return Array._new(
             create_pdarray(
@@ -106,11 +90,6 @@ def expand_dims(x: Array, /, *, axis: int) -> Array:
 
 
 def flip(x: Array, /, *, axis: Optional[Union[int, Tuple[int, ...]]] = None) -> Array:
-    """
-    Array API compatible wrapper for :py:func:`np.flip <numpy.flip>`.
-
-    See its docstring for more information.
-    """
     axisList = []
     if axis is not None:
         axisList = list(axis) if isinstance(axis, tuple) else [axis]
@@ -137,15 +116,22 @@ def flip(x: Array, /, *, axis: Optional[Union[int, Tuple[int, ...]]] = None) ->
 def moveaxis(
     x: Array, source: Union[int, Tuple[int, ...]], destination: Union[int, Tuple[int, ...]], /
 ) -> Array:
-    raise NotImplementedError("moveaxis is not yet implemented")
+    """Move axes ``source`` to positions ``destination``; other axes keep their relative order."""
+    src = source if isinstance(source, tuple) else (source,)
+    dst = destination if isinstance(destination, tuple) else (destination,)
+    if isinstance(source, tuple) != isinstance(destination, tuple) or len(src) != len(dst):
+        raise ValueError("source and destination must both be integers or equal-length tuples")
+    if any(not -x.ndim <= a < x.ndim for a in src + dst):
+        raise ValueError(f"axis out of range for array with {x.ndim} dimensions")
+    src, dst = tuple(s % x.ndim for s in src), tuple(d % x.ndim for d in dst)
+    perm = [ax for ax in range(x.ndim) if ax not in src]
+    for d, s in sorted(zip(dst, src)):  # ascending, so earlier inserts don't shift later ones
+        perm.insert(d, s)
 
+    return permute_dims(x, axes=tuple(perm))
 
-def permute_dims(x: Array, /, axes: Tuple[int, ...]) -> Array:
-    """
-    Array API compatible wrapper for :py:func:`np.transpose <numpy.transpose>`.
 
-    See its docstring for more information.
-    """
+def permute_dims(x: Array, /, axes: Tuple[int, ...]) -> Array:
     try:
         return Array._new(
             create_pdarray(
@@ -166,17 +152,33 @@ def permute_dims(x: Array, /, axes: Tuple[int, ...]) -> Array:
 
 
 def repeat(x: Array, repeats: Union[int, Array], /, *, axis: Optional[int] = None) -> Array:
-    raise NotImplementedError("repeat is not yet implemented")
+    """
+    Array API ``repeat``, limited to the flattened (``axis=None``) case.
+
+    An integer ``repeats`` is wrapped with ``scalar_array`` before the
+    server's ``repeatFlat<N>D`` command is issued.
+
+    Raises ``NotImplementedError`` when ``axis`` is not ``None``.
+    """
+    reps = Array._new(scalar_array(repeats)) if isinstance(repeats, int) else repeats
+
+    # guard clause: only the flattened form is implemented
+    if axis is not None:
+        raise NotImplementedError("repeat with 'axis' argument is not yet implemented")
+
+    rep_msg = generic_msg(
+        cmd=f"repeatFlat{x.ndim}D",
+        args={
+            "name": x._array,
+            "repeats": reps._array,
+        },
+    )
+    return Array._new(create_pdarray(cast(str, rep_msg)))
 
 
 def reshape(
     x: Array, /, shape: Tuple[int, ...], *, copy: Optional[bool] = None
 ) -> Array:
-    """
-    Array API compatible wrapper for :py:func:`np.reshape <numpy.reshape>`.
-
-    See its docstring for more information.
-    """
 
     # TODO: figure out copying semantics (currently always creates a copy)
     try:
@@ -205,11 +207,6 @@ def roll(
     *,
     axis: Optional[Union[int, Tuple[int, ...]]] = None,
 ) -> Array:
-    """
-    Array API compatible wrapper for :py:func:`np.roll <numpy.roll>`.
-
-    See its docstring for more information.
-    """
     axisList = []
     if axis is not None:
         axisList = list(axis) if isinstance(axis, tuple) else [axis]
@@ -240,11 +237,6 @@ def roll(
 
 
 def squeeze(x: Array, /, axis: Union[int, Tuple[int, ...]]) -> Array:
-    """
-    Array API compatible wrapper for :py:func:`np.squeeze <numpy.squeeze>`.
-
-    See its docstring for more information.
-    """
     nAxes = len(axis) if isinstance(axis, tuple) else 1
     try:
         return Array._new(
@@ -267,11 +259,6 @@ def squeeze(x: Array, /, axis: Union[int, Tuple[int, ...]]) -> Array:
 
 
 def stack(arrays: Union[Tuple[Array, ...], List[Array]], /, *, axis: int = 0) -> Array:
-    """
-    Array API compatible wrapper for :py:func:`np.stack <numpy.stack>`.
-
-    See its docstring for more information.
-    """
     # TODO: type promotion across input arrays
     return Array._new(
         create_pdarray(
@@ -291,8 +278,43 @@ def stack(arrays: Union[Tuple[Array, ...], List[Array]], /, *, axis: int = 0) ->
 
 
 def tile(x: Array, repetitions: Tuple[int, ...], /) -> Array:
-    raise NotImplementedError("tile is not yet implemented")
+    """
+    Tile ``x`` using the server's ``tile<N>D`` command.
+
+    Whichever of ``x.shape`` and ``repetitions`` is shorter is left-padded
+    with ones so that both have the same length before the command is sent.
+    """
+    # positive gap: more repetition counts than array dimensions
+    rank_gap = len(repetitions) - x.ndim
+    if rank_gap > 0:
+        xr = reshape(x, (1,) * rank_gap + x.shape)
+        reps = repetitions
+    else:
+        xr = x
+        # (1,) * 0 is empty, so this also covers the equal-length case
+        reps = (1,) * (-rank_gap) + repetitions
+
+    rep_msg = generic_msg(
+        cmd=f"tile{xr.ndim}D",
+        args={
+            "name": xr._array,
+            "reps": reps,
+        },
+    )
+    return Array._new(create_pdarray(cast(str, rep_msg)))
 
 
 def unstack(x: Array, /, *, axis: int = 0) -> Tuple[Array, ...]:
-    raise NotImplementedError("unstack is not yet implemented")
+    """
+    Split ``x`` along ``axis`` via the server's ``unstack<N>D`` command.
+    """
+    rep_msg = generic_msg(
+        cmd=f"unstack{x.ndim}D",
+        args={
+            "name": x._array,
+            "axis": axis,
+            "numReturnArrays": x.shape[axis],
+        },
+    )
+    # the reply is split on "+"; each piece is handed to create_pdarray
+    return tuple(Array._new(create_pdarray(part)) for part in cast(str, rep_msg).split("+"))
diff --git a/arkouda/pdarrayclass.py b/arkouda/pdarrayclass.py
index 72bee0a1d6..4ac5f51f34 100755
--- a/arkouda/pdarrayclass.py
+++ b/arkouda/pdarrayclass.py
@@ -657,18 +657,41 @@ def __getitem__(self, key):
             return create_pdarray(repMsg)
 
         if isinstance(key, tuple):
-            allScalar = True
+            if len(key) > self.ndim:
+                raise IndexError(f"too many indices ({len(key)}) for array with {self.ndim} dimensions")
+
+            # replace '...' with the appropriate number of ':'
+            elipsis_axis_idx = -1
+            for dim, k in enumerate(key):
+                if isinstance(k, type(Ellipsis)):
+                    if elipsis_axis_idx != -1:
+                        raise IndexError("array index can only have one ellipsis")
+                    else:
+                        elipsis_axis_idx = dim
+
+            if elipsis_axis_idx != -1:
+                key = tuple(
+                        key[:elipsis_axis_idx] +
+                        (slice(None),) * (self.ndim - len(key) + 1) +
+                        key[(elipsis_axis_idx+1):]
+                    )
+
+            # parse the key tuple
+            num_scalar = 0
+            scalar_axes = []
             starts = []
             stops = []
             strides = []
             for dim, k in enumerate(key):
                 if isinstance(k, slice):
-                    allScalar = False
                     (start, stop, stride) = k.indices(self.shape[dim])
                     starts.append(start)
                     stops.append(stop)
                     strides.append(stride)
                 elif np.isscalar(k) and (resolve_scalar_dtype(k) in ["int64", "uint64"]):
+                    num_scalar += 1
+                    scalar_axes.append(dim)
+
                     if k < 0:
                         # Interpret negative key as offset from end of array
                         k += int(self.shape[dim])
@@ -678,15 +701,14 @@ def __getitem__(self, key):
                         )
                     else:
                         # treat this as a single element slice
-                        # TODO: implement rank-reducing slices
                         starts.append(k)
                         stops.append(k + 1)
                         strides.append(1)
                 else:
                     raise IndexError(f"Unhandled key type: {k} ({type(k)})")
 
-            if allScalar:
-                # use simpler indexing (and return a scalar) if we got a tuple of only scalars
+            if num_scalar == len(key):
+                # all scalars: use simpler indexing (and return a scalar)
                 repMsg = generic_msg(
                     cmd=f"[int]{self.ndim}D",
                     args={
@@ -706,7 +728,22 @@ def __getitem__(self, key):
                         "strides": tuple(strides),
                     },
                 )
-                return create_pdarray(repMsg)
+                maybe_degen_arr = create_pdarray(repMsg)
+
+                if num_scalar > 0:
+                    # reduce the array rank if there are any scalar indices
+                    # note: squeeze requires the non-default ManipulationMsg server module
+                    repMsg = generic_msg(
+                        cmd=f"squeeze{maybe_degen_arr.ndim}Dx{maybe_degen_arr.ndim - num_scalar}D",
+                        args={
+                            "name": maybe_degen_arr,
+                            "nAxes": num_scalar,
+                            "axes": scalar_axes,
+                        },
+                    )
+                    return create_pdarray(repMsg)
+                else:
+                    return maybe_degen_arr
 
         if isinstance(key, pdarray) and self.ndim == 1:
             kind, _ = translate_np_dtype(key.dtype)
diff --git a/src/AryUtil.chpl b/src/AryUtil.chpl
index db2eca7f26..7e10cb4fe6 100644
--- a/src/AryUtil.chpl
+++ b/src/AryUtil.chpl
@@ -10,6 +10,7 @@ module AryUtil
     use BitOps;
     use GenSymIO;
     use PrivateDist;
+    use Communication;
 
     use ArkoudaPOSIXCompat;
     use ArkoudaCTypesCompat;
@@ -731,4 +732,151 @@ module AryUtil
       }
       return s;
     }
+
+    /*
+      unflatten a 1D array into a multi-dimensional array of the given shape (for N == 1 this is a plain copy)
+    */
+    proc unflatten(const ref a: [?d] ?t, shape: ?N*int): [] t throws {
+      var unflat = makeDistArray((...shape), t);
+
+      if N == 1 {
+        unflat = a;
+        return unflat;
+      }
+
+      // ranges of flat indices owned by each locale
+      const flatLocRanges = [loc in Locales] d.localSubdomain(loc).dim(0);
+
+      coforall loc in Locales do on loc {
+        const lduf = unflat.domain.localSubdomain(), // this locale's block of the output
+              lastRank = lduf.dim(N-1); // local extent along the last dimension
+
+        // iterate over each slice of contiguous memory in the local subdomain
+        forall idx in domOffAxis(lduf, N-1) with (
+            const ord = new orderer(shape), // per-task index -> flat-order helper
+            const dufc = unflat.domain,
+            in flatLocRanges // per-task copy of the locale ranges
+        ) {
+          var idxTup: (N-1)*int;
+          for i in 0..<(N-1) do idxTup[i] = idx[i]; // keep the first N-1 coordinates
+
+          const low = ((...idxTup), lastRank.low),
+                high = ((...idxTup), lastRank.high),
+                flatSlice = ord.indexToOrder(low)..ord.indexToOrder(high); // flat positions covered by this run
+
+          // compute which locales in the input array this slice corresponds to
+          var locInStart, locInStop = 0;
+          for (flr, locID) in zip(flatLocRanges, 0..<numLocales) {
+            if flr.contains(flatSlice.low) then locInStart = locID;
+            if flr.contains(flatSlice.high) then locInStop = locID;
+          }
+
+          if locInStart == locInStop {
+            // flat region sits within a single locale, do a single get
+            get(
+              c_ptrTo(unflat[low]),
+              c_ptrToConst(a[flatSlice.low]):c_ptr(t),
+              locInStart,
+              c_sizeof(t) * flatSlice.size // transfer size in bytes
+            );
+          } else {
+            // flat region is spread across multiple locales, do a get for each source locale
+            for locInID in locInStart..locInStop {
+              const flatSubSlice = flatSlice[flatLocRanges[locInID]]; // part of the run held by locInID
+
+              get(
+                c_ptrTo(unflat[dufc.orderToIndex(flatSubSlice.low)]),
+                c_ptrToConst(a[flatSubSlice.low]):c_ptr(t),
+                locInID,
+                c_sizeof(t) * flatSubSlice.size // transfer size in bytes
+              );
+            }
+          }
+        }
+      }
+
+      return unflat;
+    }
+
+    /*
+      flatten a multi-dimensional array into a 1D array (this overload only applies when a.rank > 1)
+    */
+    proc flatten(const ref a: [?d] ?t): [] t throws
+      where a.rank > 1
+    {
+      var flat = makeDistArray(d.size, t);
+
+      // ranges of flat indices owned by each locale
+      const flatLocRanges = [loc in Locales] flat.domain.localSubdomain(loc).dim(0);
+
+      coforall loc in Locales do on loc {
+        const ld = d.localSubdomain(), // this locale's block of the input
+              lastRank = ld.dim(d.rank-1); // local extent along the last dimension
+
+        // iterate over each slice of contiguous memory in the local subdomain
+        forall idx in domOffAxis(ld, d.rank-1) with (
+            const ord = new orderer(d.shape), // per-task index -> flat-order helper
+            const dc = d,
+            in flatLocRanges // per-task copy of the locale ranges
+        ) {
+          var idxTup: (d.rank-1)*int;
+          for i in 0..<(d.rank-1) do idxTup[i] = idx[i]; // keep the first rank-1 coordinates
+
+          const low = ((...idxTup), lastRank.low),
+                high = ((...idxTup), lastRank.high),
+                flatSlice = ord.indexToOrder(low)..ord.indexToOrder(high); // flat positions covered by this run
+
+          // compute which locales in the output array this slice corresponds to
+          var locOutStart, locOutStop = 0;
+          for (flr, locID) in zip(flatLocRanges, 0..<numLocales) {
+            if flr.contains(flatSlice.low) then locOutStart = locID;
+            if flr.contains(flatSlice.high) then locOutStop = locID;
+          }
+
+          if locOutStart == locOutStop {
+            // flat region sits within a single locale, do a single put
+            put(
+                c_ptrTo(flat[flatSlice.low]),
+                c_ptrToConst(a[low]):c_ptr(t),
+                locOutStart,
+                c_sizeof(t) * flatSlice.size // transfer size in bytes
+            );
+          } else {
+            // flat region is spread across multiple locales, do a put for each destination locale
+            for locOutID in locOutStart..locOutStop {
+              const flatSubSlice = flatSlice[flatLocRanges[locOutID]]; // part of the run owned by locOutID
+
+              put(
+                c_ptrTo(flat[flatSubSlice.low]),
+                c_ptrToConst(a[dc.orderToIndex(flatSubSlice.low)]):c_ptr(t),
+                locOutID,
+                c_sizeof(t) * flatSubSlice.size // transfer size in bytes
+              );
+            }
+          }
+        }
+      }
+
+      return flat;
+    }
+
+    // helper for computing an array element's order (flat position) from its index
+    record orderer {
+      param rank: int; // number of dimensions, taken from the shape tuple's size
+      const accumRankSizes: [0..<rank] int; // per-dimension multipliers, stored last dimension first
+
+      proc init(shape: ?N*int) {
+        this.rank = N;
+        const sizesRev = [i in 0..<N] shape[N - i - 1]; // dimension sizes in reverse order
+        this.accumRankSizes = * scan sizesRev / sizesRev; // NOTE(review): divides by each dimension size; confirm zero-length dims cannot reach here
+      }
+
+      // index -> order for the input array's indices
+      // e.g., order = k + (nz * j) + (nz * ny * i)
+      inline proc indexToOrder(idx: rank*int): int {
+        var order = 0;
+        for param i in 0..<rank do order += idx[i] * accumRankSizes[rank - i - 1]; // multipliers are stored reversed, hence rank - i - 1
+        return order;
+      }
+    }
 }
diff --git a/src/ManipulationMsg.chpl b/src/ManipulationMsg.chpl
index 03bd649c7b..2c7f6cba96 100644
--- a/src/ManipulationMsg.chpl
+++ b/src/ManipulationMsg.chpl
@@ -8,9 +8,10 @@ module ManipulationMsg {
   use Logging;
   use ServerErrorStrings;
   use CommAggregation;
+  use AryUtil;
+  use ArkoudaAryUtilCompat;
 
   use Reflection;
-  use BigInteger;
 
   private config const logLevel = ServerConfig.logLevel;
   private config const logChannel = ServerConfig.logChannel;
@@ -42,7 +43,6 @@ module ManipulationMsg {
     aware that promotion of singleton dimensions may be necessary. E.g.,
     make matrix multiplication aware that it can treat a singleton
     value as a vector of the appropriate length during multiplication.
-
     (this may require a modification of SymEntry to keep track of
     which dimensions are explicitly singletons)
 
@@ -86,18 +86,41 @@ module ManipulationMsg {
           mLogger.error(getModuleName(),getRoutineName(),getLineNumber(),errorMsg);
           return new MsgTuple(errorMsg, MsgType.ERROR);
         } else {
+          // use List;
+          // var bcDimsList = new list(int);
+          // for i in 0..<ndIn do if bcDims[i] then bcDimsList.pushBack(i);
+
+          // // iterate over each slice of the output array corresponding to one
+          // // copy of the input array and perform the copy
+          // /* Example:
+          //   broadcast 5x1x3 array into 5x4x3 array:
+          //   - the 5x1x3 array is copied into the 5x4x3 array 4 times
+          //   - domOffAxis => {0..0, 0..<4, 0..0}
+          //   - for 'nonBCIndex' = (0, 0, 0), outSliceIdx = (0..<5, 0..0, 0..<3)
+          //   - for 'nonBCIndex' = (0, 1, 0), outSliceIdx = (0..<5, 1..1, 0..<3)
+          //   - etc.
+          // */
+          // forall nonBCIndex in domOffAxis(eOut.a.domain, bcDimsList.toArray()) {
+          //   const nbcT = if ndOut == 1 then (nonBCIndex,) else nonBCIndex;
+
+
+          //   var outSliceIdx: ndOut*range;
+          //   for i in 0..<ndOut do outSliceIdx[i] = 0..<shapeOut[i];
+          //   for i in 0..<ndIn do if bcDims[i] then outSliceIdx[i] = nbcT[i];
+
+          //   eOut.a[(...outSliceIdx)] = eIn.a; // !!! Doesn't work because of rank mismatch !!!
+          // }
+
           // define a mapping from the output array's indices to the input array's indices
-          inline proc imap(idx: int ...ndOut): ndIn*int {
+          inline proc imap(idx: ndOut*int, bc: ndIn*int): ndIn*int {
             var ret: ndIn*int;
-            for param i in 0..<ndIn do
-              ret[i] = if bcDims[i] then 0 else idx[i];
+            for param i in 0..<ndIn do ret[i] = if bc[i] then 0 else idx[i];
             return ret;
           }
 
-          forall idx in eOut.a.domain with (var agg = newSrcAggregator(t)) do
-            if ndOut == 1
-              then eOut.a[idx] = eIn.a[imap(idx)];
-              else eOut.a[idx] = eIn.a[imap((...idx))];
+          // copy values from the input array into the output array
+          forall idx in eOut.a.domain with (var agg = newSrcAggregator(t), in bcDims) do
+            agg.copy(eOut.a[idx], eIn.a[imap(if ndOut==1 then (idx,) else idx, bcDims)]);
         }
       }
 
@@ -126,7 +149,7 @@ module ManipulationMsg {
 
     for param iIn in 0..<Nf {
       param iOut = Nt - Nf + iIn;
-      if from[iIn] == 1 && to[iOut] != 1 {
+      if from[iIn] == 1 {
         dimsToBroadcast[iIn] = true;
       } else if from[iIn] != to[iOut] {
         return (false, dimsToBroadcast);
@@ -169,23 +192,14 @@ module ManipulationMsg {
       } else {
         var eOut = st.addEntry(rname, (...shapeOut), t);
 
-        // mapping between the input and output array indices
-        inline proc imap(arrIdx: int, idx: nd*int): nd*int
-          where nd > 1
-        {
-          var ret = idx;
-          ret[axis] += startOffsets[arrIdx];
-          return ret;
-        }
-
-        inline proc imap(arrIdx: int, idx: int): int
-          where nd == 1
-            do return idx + startOffsets[arrIdx];
-
         // copy the data from the input arrays to the output array
-        for (arrIdx, arr) in zip(eIns.domain, eIns) do
-          forall idx in arr.a.domain with (var agg = newDstAggregator(t)) do
-            agg.copy(eOut.a[imap(arrIdx, idx)], arr.a[idx]);
+        forall (arrIdx, arr) in zip(eIns.domain, eIns) with (in startOffsets) {
+          forall idx in arr.a.domain with (var agg = newDstAggregator(t)) {
+            var outIdx = if nd == 1 then (idx,) else idx;
+            outIdx[axis] += startOffsets[arrIdx];
+            agg.copy(eOut.a[outIdx], arr.a[idx]);
+          }
+        }
 
         const repMsg = "created " + st.attrib(rname);
         mLogger.info(getModuleName(),pn,getLineNumber(),repMsg);
@@ -198,7 +212,6 @@ module ManipulationMsg {
       when DType.UInt64 do return doConcat(uint);
       when DType.Float64 do return doConcat(real);
       when DType.Bool do return doConcat(bool);
-      when DType.BigInt do return doConcat(bigint);
       otherwise {
         var errorMsg = notImplementedError(pn,dtype2str(dt));
         mLogger.error(getModuleName(),pn,getLineNumber(),errorMsg);
@@ -257,10 +270,9 @@ module ManipulationMsg {
       var eOut = st.addEntry(rname, + reduce sizes, t);
 
       // copy the data from the input arrays to the output array
-      for arrIdx in 0..<nArrays {
-        const a = flatten(eIns[arrIdx].a);
-        forall idx in a.domain with (var agg = newSrcAggregator(t)) do
-          agg.copy(eOut.a[idx + starts[arrIdx]], a[idx]);
+      forall arrIdx in 0..<nArrays {
+        const a = if nd == 1 then eIns[arrIdx].a else flatten(eIns[arrIdx].a);
+        eOut.a[starts[arrIdx]..#sizes[arrIdx]] = a;
       }
 
       const repMsg = "created " + st.attrib(rname);
@@ -273,7 +285,6 @@ module ManipulationMsg {
       when DType.UInt64 do return doFlatConcat(uint);
       when DType.Float64 do return doFlatConcat(real);
       when DType.Bool do return doFlatConcat(bool);
-      when DType.BigInt do return doFlatConcat(bigint);
       otherwise {
         var errorMsg = notImplementedError(pn,dtype2str(dt));
         mLogger.error(getModuleName(),pn,getLineNumber(),errorMsg);
@@ -282,18 +293,20 @@ module ManipulationMsg {
     }
   }
 
-  private proc flatten(const ref a: [?d] ?t): [] t throws {
-    var flat = makeDistArray({0..<d.size}, t);
-    forall idx in flat.domain with (var agg = newSrcAggregator(t)) do
-      agg.copy(flat[idx], a[d.orderToIndex(idx)]);
-    return flat;
-  }
-
   // https://data-apis.org/array-api/latest/API_specification/generated/array_api.expand_dims.html#array_api.expand_dims
+  // insert a new singleton dimension at the given axis
   @arkouda.registerND
   proc expandDimsMsg(cmd: string, msgArgs: borrowed MessageArgs, st: borrowed SymTab, param nd: int): MsgTuple throws {
     param pn = Reflection.getRoutineName();
-    // TODO: add a check and error handling if nd+1 exceeds the maximum supported array rank
+
+    if nd == MaxArrayDims {
+      const errMsg = "Cannot expand arrays with rank %i, as this would result an an array with rank %i".doFormat(nd, nd+1) +
+                     ", exceeding the server's configured maximum of %i. ".doFormat(MaxArrayDims) +
+                     "Please update the configuration and recompile to support higher-dimensional arrays.";
+      mLogger.error(getModuleName(),pn,getLineNumber(),errMsg);
+      return new MsgTuple(errMsg,MsgType.ERROR);
+    }
+
     const name = msgArgs.getValueOf("name"),
           axis = msgArgs.get("axis").getPositiveIntValue(nd+1),
           rname = st.nextName();
@@ -307,7 +320,7 @@ module ManipulationMsg {
       var eOut = st.addEntry(rname, (...shapeOut), t);
 
       // mapping between the input and output array indices
-      inline proc imap(idx: (nd+1)*int): nd*int {
+      inline proc imap(idx: (nd+1)*int, axis: int): nd*int {
         var ret: nd*int, ii = 0;
         for param io in 0..nd {
           if io != axis {
@@ -320,7 +333,7 @@ module ManipulationMsg {
 
       // copy the data from the input array to the output array
       forall idx in eOut.a.domain with (var agg = newSrcAggregator(t)) do
-        agg.copy(eOut.a[idx], eIn.a[imap(idx)]);
+        agg.copy(eOut.a[idx], eIn.a[imap(idx, axis)]);
 
       const repMsg = "created " + st.attrib(rname);
       mLogger.info(getModuleName(),pn,getLineNumber(),repMsg);
@@ -332,7 +345,6 @@ module ManipulationMsg {
       when DType.UInt64 do return expandDims(uint);
       when DType.Float64 do return expandDims(real);
       when DType.Bool do return expandDims(bool);
-      when DType.BigInt do return expandDims(bigint);
       otherwise {
         var errorMsg = notImplementedError(pn,dtype2str(gEnt.dtype));
         mLogger.error(getModuleName(),pn,getLineNumber(),errorMsg);
@@ -376,18 +388,15 @@ module ManipulationMsg {
         mLogger.error(getModuleName(),pn,getLineNumber(),errMsg);
         return new MsgTuple(errMsg,MsgType.ERROR);
       } else {
-        // mapping between the input and output array indices with the specified axes flipped
-        inline proc imap(idx: nd*int): nd*int {
-          var ret = idx;
-          for axis in axes do
-            ret[axis] = eIn.tupShape[axis] - idx[axis] - 1;
-          return ret;
-        }
-
         // copy the data from the input array to the output array
-        // while flipping along the specified axis
-        forall idx in eOut.a.domain with (var agg = newSrcAggregator(t)) do
-          agg.copy(eOut.a[idx], eIn.a[imap(if nd == 1 then (idx,) else idx)]);
+        // while flipping along the specified axes
+        forall idx in eOut.a.domain with (
+          var agg = newSrcAggregator(t),
+          const imap = new indexFlip(eIn.tupShape, axes)
+        ) {
+          const inIdx = imap(if nd == 1 then (idx,) else idx);
+          agg.copy(eOut.a[idx], eIn.a[inIdx]);
+        }
 
         const repMsg = "created " + st.attrib(rname);
         mLogger.info(getModuleName(),pn,getLineNumber(),repMsg);
@@ -400,7 +409,6 @@ module ManipulationMsg {
       when DType.UInt64 do return doFlip(uint);
       when DType.Float64 do return doFlip(real);
       when DType.Bool do return doFlip(bool);
-      when DType.BigInt do return doFlip(bigint);
       otherwise {
         var errorMsg = notImplementedError(pn,dtype2str(gEnt.dtype));
         mLogger.error(getModuleName(),pn,getLineNumber(),errorMsg);
@@ -409,6 +417,27 @@ module ManipulationMsg {
     }
   }
 
+  record indexFlip { // callable index map: mirrors an index along the stored axes
+    param nd; // length of the index tuples handled
+    const shape: nd*int; // extent of each dimension
+    const d: domain(rank=1, idxType=int, strides=strideKind.one); // domain of the 'axes' array
+    const axes: [d] int; // axes to mirror
+
+    proc init(shape: ?nd*int, in axes: [?d] int) {
+      this.nd = nd;
+      this.shape = shape;
+      this.d = d;
+      this.axes = axes;
+    }
+
+    proc this(idx: nd*int): nd*int { // invoked as imap(idx)
+      var ret = idx; // axes not listed pass through unchanged
+      for axis in axes do
+        ret[axis] = shape[axis] - idx[axis] - 1; // mirror: 0 <-> shape[axis] - 1
+      return ret;
+    }
+  }
+
   private proc validateAxes(axes: [?d] int, param nd: int): (bool, [d] int) {
     var ret: [d] int;
     if axes.size > nd then return (false, ret);
@@ -437,19 +466,14 @@ module ManipulationMsg {
       const eIn = toSymEntry(gEnt, t, nd);
       var eOut = st.addEntry(rname, (...eIn.tupShape), t);
 
-      // mapping between the input and output array indices with all axes flipped
-      inline proc imap(idx: nd*int): nd*int {
-        var ret: nd*int;
-        for param i in 0..<nd do
-          ret[i] = eIn.tupShape[i] - idx[i] - 1;
-        return ret;
+      forall idx in eOut.a.domain with (
+        var agg = newSrcAggregator(t),
+        const imap = new allIndexFlip(nd, eIn.tupShape)
+      ) {
+        const inIdx = imap(if nd == 1 then (idx,) else idx);
+        agg.copy(eOut.a[idx], eIn.a[inIdx]);
       }
 
-      // copy the data from the input array to the output array
-      // while flipping along each axis
-      forall idx in eOut.a.domain with (var agg = newSrcAggregator(t)) do
-        agg.copy(eOut.a[idx], eIn.a[imap(if nd == 1 then (idx, ) else idx)]);
-
       const repMsg = "created " + st.attrib(rname);
       mLogger.info(getModuleName(),pn,getLineNumber(),repMsg);
       return new MsgTuple(repMsg, MsgType.NORMAL);
@@ -460,7 +484,6 @@ module ManipulationMsg {
       when DType.UInt64 do return doFlip(uint);
       when DType.Float64 do return doFlip(real);
       when DType.Bool do return doFlip(bool);
-      when DType.BigInt do return doFlip(bigint);
       otherwise {
         var errorMsg = notImplementedError(pn,dtype2str(gEnt.dtype));
         mLogger.error(getModuleName(),pn,getLineNumber(),errorMsg);
@@ -469,6 +492,17 @@ module ManipulationMsg {
     }
   }
 
+  record allIndexFlip {
+    param nd;
+    const shape: nd*int;
+    proc this(idx: nd*int): nd*int {
+      var flipped: nd*int;
+      // mirror every axis: coordinate k maps to (extent - 1) - k
+      for param ax in 0..<nd do flipped[ax] = (shape[ax] - 1) - idx[ax];
+      return flipped;
+    }
+  }
+
   // https://data-apis.org/array-api/latest/API_specification/generated/array_api.permute_dims.html#array_api.permute_dims
   @arkouda.registerND
   proc permuteDims(cmd: string, msgArgs: borrowed MessageArgs, st: borrowed SymTab, param nd: int): MsgTuple throws {
@@ -506,7 +540,6 @@ module ManipulationMsg {
       when DType.UInt64 do return doPermutation(uint);
       when DType.Float64 do return doPermutation(real);
       when DType.Bool do return doPermutation(bool);
-      when DType.BigInt do return doPermutation(bigint);
       otherwise {
         var errorMsg = notImplementedError(pn,dtype2str(gEnt.dtype));
         mLogger.error(getModuleName(),pn,getLineNumber(),errorMsg);
@@ -521,6 +554,9 @@ module ManipulationMsg {
     return ret;
   }
 
+  // ensure all axis indices are in the range [-N, N-1]
+  // convert negative indices to positive indices
+  // return false if any axis index is out of range
   private proc validateAxes(axes: ?N*int): (bool, N*int) {
     var ret: N*int;
     for param i in 0..<N {
@@ -551,30 +587,34 @@ module ManipulationMsg {
 
     proc doReshape(type t): MsgTuple throws {
       const eIn = toSymEntry(gEnt, t, ndIn),
-            (valid, shape) = validateShape(rawShape, eIn.a.size);
+            (valid, outShape) = validateShape(rawShape, eIn.a.size);
 
       if !valid {
         const errMsg = "Cannot reshape array of shape %? into shape %?. The total number of elements must match".doFormat(eIn.tupShape, rawShape);
         mLogger.error(getModuleName(),pn,getLineNumber(),errMsg);
         return new MsgTuple(errMsg,MsgType.ERROR);
       } else {
-        var eOut = st.addEntry(rname, (...shape), t);
-
-        const sizes = [i in 0..<ndOut by -1] eOut.tupShape[i],
-              accumSizes = * scan sizes / sizes;
-
-        // index -> order for the output array's indices
-        // e.g., order = k + (nz * j) + (nz * ny * i)
-        inline proc indexToOrder(idx: ndOut*int): int {
-          var order = 0;
-          for param i in 0..<ndOut do order += idx[i] * accumSizes[i];
-          return order;
-        }
-
-        // copy the data from the input array to the output array while reshaping
-        forall idx in eOut.a.domain with (var agg = newSrcAggregator(t)) {
-          const inIdx = eIn.a.domain.orderToIndex(indexToOrder(if ndOut==1 then (idx,) else idx));
-          agg.copy(eOut.a[idx], eIn.a[inIdx]);
+        if ndIn == 1 && ndOut == 1 {
+          st.addEntry(rname, createSymEntry(eIn.a));
+        } else if ndIn == 1 {
+          // special case: unflatten a 1D array into a higher-dimensional array
+          st.addEntry(rname, createSymEntry(unflatten(eIn.a, outShape)));
+        } else if ndOut == 1 {
+          // special case: flatten an array into a 1D array
+          st.addEntry(rname, createSymEntry(flatten(eIn.a)));
+        } else {
+          // general case
+          var eOut = st.addEntry(rname, (...outShape), t);
+
+          // copy the data from the input array to the output array while reshaping
+          forall idx in eIn.a.domain with (
+            var agg = newDstAggregator(t),
+            const output = eOut.a.domain,
+            const input = new orderer(eIn.tupShape)
+          ) {
+            const outIdx = output.orderToIndex(input.indexToOrder(if ndIn == 1 then (idx,) else idx));
+            agg.copy(eOut.a[outIdx], eIn.a[idx]);
+          }
         }
 
         const repMsg = "created " + st.attrib(rname);
@@ -588,7 +628,6 @@ module ManipulationMsg {
       when DType.UInt64 do return doReshape(uint);
       when DType.Float64 do return doReshape(real);
       when DType.Bool do return doReshape(bool);
-      when DType.BigInt do return doReshape(bigint);
       otherwise {
         var errorMsg = notImplementedError(pn,dtype2str(gEnt.dtype));
         mLogger.error(getModuleName(),pn,getLineNumber(),errorMsg);
@@ -667,17 +706,13 @@ module ManipulationMsg {
       } else {
         var eOut = st.addEntry(rname, (...eIn.tupShape), t);
 
-        // mapping between a starting index and the rolled index
-        inline proc rollIdx(idx: nd*int): nd*int {
-          var ret = idx;
-          for i in 0..<nAxes do
-            ret[axes[i]] = (idx[axes[i]] + shifts[i] + eIn.tupShape[axes[i]]) % eIn.tupShape[axes[i]];
-          return ret;
-        }
-
         // copy the data from the input array to the output array while rolling along the specified axes
-        forall idx in eIn.a.domain with (var agg = newDstAggregator(t)) do
-          agg.copy(eOut.a[rollIdx(if nd == 1 then (idx, ) else idx)], eIn.a[idx]);
+        forall idx in eIn.a.domain with (
+          var agg = newDstAggregator(t),
+          const imap = new rollIdxMapper(eIn.tupShape, axes, shifts)
+        ) {
+          agg.copy(eOut.a[imap(if nd == 1 then (idx, ) else idx)], eIn.a[idx]);
+        }
 
         const repMsg = "created " + st.attrib(rname);
         mLogger.info(getModuleName(),pn,getLineNumber(),repMsg);
@@ -690,7 +725,6 @@ module ManipulationMsg {
       when DType.UInt64 do return doRoll(uint);
       when DType.Float64 do return doRoll(real);
       when DType.Bool do return doRoll(bool);
-      when DType.BigInt do return doRoll(bigint);
       otherwise {
         var errorMsg = notImplementedError(pn,dtype2str(gEnt.dtype));
         mLogger.error(getModuleName(),pn,getLineNumber(),errorMsg);
@@ -699,6 +733,29 @@ module ManipulationMsg {
     }
   }
 
+  record rollIdxMapper {
+    param nd;
+    const shape: nd*int;
+    const nAxes: int;
+    const axes: [0..<nAxes] int;
+    const shifts: [0..<nAxes] int;
+    // 'axes' and 'shifts' are taken with 'in' intent, so the record stores its own copies
+    proc init(shape: ?nd*int, in axes: [?d] int, in shifts: [d] int) {
+      this.nd = nd;
+      this.shape = shape;
+      this.nAxes = d.size;
+      this.axes = axes;
+      this.shifts = shifts;
+    }
+    // map 'idx' to its rolled position; the double '%' keeps the result in [0, extent) for any shift
+    proc this(idx: nd*int): nd*int {
+      var ret = idx;
+      for i in 0..<nAxes do
+        ret[axes[i]] = ((idx[axes[i]] + shifts[i]) % shape[axes[i]] + shape[axes[i]]) % shape[axes[i]];
+      return ret;
+    }
+  }
+
   // alternative to 'rollMsg' to be used when the axis argument is 'None'
   @arkouda.registerND
   proc rollFlattenedMsg(cmd: string, msgArgs: borrowed MessageArgs, st: borrowed SymTab, param nd: int): MsgTuple throws {
@@ -711,12 +768,10 @@ module ManipulationMsg {
 
     proc doRoll(type t): MsgTuple throws {
       const eIn = toSymEntry(gEnt, t, nd),
-            eOut = st.addEntry(rname, (...eIn.tupShape), t),
-            inFlatRolled = rollBy(shift, flatten(eIn.a));
+            inFlat = if nd == 1 then eIn.a else flatten(eIn.a),
+            rolled = unflatten(rollBy(shift, inFlat), eIn.tupShape);
 
-      // copy the flattened/rolled array into the output array while unflattening
-      forall idx in inFlatRolled.domain with (var agg = newDstAggregator(t)) do
-        agg.copy(eOut.a[eOut.a.domain.orderToIndex(idx)], inFlatRolled[idx]);
+      st.addEntry(rname, createSymEntry(rolled));
 
       const repMsg = "created " + st.attrib(rname);
       mLogger.info(getModuleName(),pn,getLineNumber(),repMsg);
@@ -728,7 +783,6 @@ module ManipulationMsg {
       when DType.UInt64 do return doRoll(uint);
       when DType.Float64 do return doRoll(real);
       when DType.Bool do return doRoll(bool);
-      when DType.BigInt do return doRoll(bigint);
       otherwise {
         var errorMsg = notImplementedError(pn,dtype2str(gEnt.dtype));
         mLogger.error(getModuleName(),pn,getLineNumber(),errorMsg);
@@ -769,16 +823,11 @@ module ManipulationMsg {
       } else {
         var eOut = st.addEntry(rname, (...shape), t);
 
-        // mapping between the input and output array indices
-        inline proc imap(idx: ndOut*int): ndIn*int {
-          var ret: ndIn*int;
-          for param ii in 0..<ndIn do ret[ii] = eIn.a.domain.dim(ii).low;
-          for param io in 0..<ndOut do ret[mapping[io]] = idx[io];
-          return ret;
-        }
-
         // copy the data from the input array to the output array
-        forall idx in eOut.a.domain with (var agg = newSrcAggregator(t)) do
+        forall idx in eOut.a.domain with (
+          var agg = newSrcAggregator(t),
+          const imap = new squeezeIndexMapper(ndIn, ndOut, mapping)
+        ) do
           agg.copy(eOut.a[idx], eIn.a[imap(if ndOut==1 then (idx,) else idx)]);
 
         const repMsg = "created " + st.attrib(rname);
@@ -792,7 +841,6 @@ module ManipulationMsg {
       when DType.UInt64 do return doSqueeze(uint);
       when DType.Float64 do return doSqueeze(real);
       when DType.Bool do return doSqueeze(bool);
-      when DType.BigInt do return doSqueeze(bigint);
       otherwise {
         var errorMsg = notImplementedError(pn,dtype2str(gEnt.dtype));
         mLogger.error(getModuleName(),pn,getLineNumber(),errorMsg);
@@ -801,6 +849,18 @@ module ManipulationMsg {
     }
   }
 
+  record squeezeIndexMapper {
+    param ndIn: int;
+    param ndOut: int;
+    const mapping: ndOut*int;
+    // input axes that are not named in 'mapping' are never written and keep the default value 0
+    proc this(idx: ndOut*int): ndIn*int {
+      var inIdx: ndIn*int;
+      for param outAx in 0..<ndOut do inIdx[mapping[outAx]] = idx[outAx];
+      return inIdx;
+    }
+  }
+
   private proc validateSqueeze(shape: ?NIn*int, axes: [?d], param NOut: int): (bool, NOut*int, NOut*int) {
     var shapeOut: NOut*int,
         mapping: NOut*int;
@@ -832,43 +892,47 @@ module ManipulationMsg {
   @arkouda.registerND
   proc stackMsg(cmd: string, msgArgs: borrowed MessageArgs, st: borrowed SymTab, param nd: int): MsgTuple throws {
     param pn = Reflection.getRoutineName();
+    // stacking adds one dimension, so inputs already at the server's maximum rank are rejected
+    if nd == MaxArrayDims {
+      const errMsg = "Cannot stack arrays with rank %i, as this would result in an array with rank %i".doFormat(nd, nd+1) +
+                     ", exceeding the server's configured maximum of %i. ".doFormat(MaxArrayDims) +
+                     "Please update the configuration and recompile to support higher-dimensional arrays.";
+      mLogger.error(getModuleName(),pn,getLineNumber(),errMsg);
+      return new MsgTuple(errMsg,MsgType.ERROR);
+    }
+
     const nArrays = msgArgs.get("n").getIntValue(),
           names = msgArgs.get("names").getList(nArrays),
-          axis = msgArgs.get("axis").getPositiveIntValue(nd),
+          axis = msgArgs.get("axis").getPositiveIntValue(nd+1),
           rname = st.nextName();
 
-    var gEnts: [0..<nArrays] borrowed GenSymEntry = getGenericEntries(names, st);
+    var gEnts = for n in names do getGenericTypedArrayEntry(n, st);
 
     // confirm that all arrays have the same dtype and shape
     // (type promotion needs to be completed before calling 'stack')
-    const dt = gEnts[0].dtype,
-          sh = gEnts[0].shape;
-    for i in 1..#nArrays do if gEnts[i].dtype != dt || gEnts[i].shape != sh {
-      const errMsg = "All arrays must have the same dtype and shape to stack";
-      mLogger.error(getModuleName(),pn,getLineNumber(),errMsg);
-      return new MsgTuple(errMsg,MsgType.ERROR);
+    const dt = gEnts[0]!.dtype,
+          sh = gEnts[0]!.shape;
+    for i in 1..<nArrays do if gEnts[i]!.dtype != dt || gEnts[i]!.shape != sh {
+        const errMsg = "All arrays must have the same dtype and shape to stack";
+        mLogger.error(getModuleName(),pn,getLineNumber(),errMsg);
+        return new MsgTuple(errMsg,MsgType.ERROR);
     }
 
     proc doStack(type t): MsgTuple throws {
-      const eIns = [i in 0..#nArrays] toSymEntry(gEnts[i], t, nd),
+      const eIns = [i in 0..#nArrays] toSymEntry(gEnts[i]!, t, nd),
             (shapeOut, mapping) = stackedShape(eIns[0].tupShape, axis, nArrays);
       var eOut = st.addEntry(rname, (...shapeOut), t);
 
-      // mapping between the input and output array indices
-      inline proc imap(arrIdx: int, idx: nd*int): (nd+1)*int {
-        var ret: (nd+1)*int;
-        for i in 0..nd {
-          if i == axis
-            then ret[i] = arrIdx;
-            else ret[i] = idx[mapping[i]];
-        }
-        return ret;
-      }
-
       // copy the data from the input arrays to the output array
-      forall (arrIdx, arr) in zip(eIns.domain, eIns) do
-        forall idx in arr.a.domain with (var agg = newDstAggregator(t)) do
-          agg.copy(eOut.a[imap(arrIdx, if nd==1 then (idx,) else idx)], arr.a[idx]);
+      // TODO: does a nested forall with aggregators use too much memory for agg buffers?
+      //       (maybe make outer loop be a 'for' or switch inner/outer loops?)
+      forall (arrIdx, arr) in zip(eIns.domain, eIns) {
+        forall idx in arr.a.domain with (
+          var agg = newDstAggregator(t),
+          const imap = new stackIndexMapper(nd+1, axis, arrIdx, mapping)
+        ) do
+          agg.copy(eOut.a[imap(if nd==1 then (idx,) else idx)], arr.a[idx]);
+      }
 
       const repMsg = "created " + st.attrib(rname);
       mLogger.info(getModuleName(),pn,getLineNumber(),repMsg);
@@ -880,7 +944,6 @@ module ManipulationMsg {
       when DType.UInt64 do return doStack(uint);
       when DType.Float64 do return doStack(real);
       when DType.Bool do return doStack(bool);
-      when DType.BigInt do return doStack(bigint);
       otherwise {
         var errorMsg = notImplementedError(pn,dtype2str(dt));
         mLogger.error(getModuleName(),pn,getLineNumber(),errorMsg);
@@ -889,6 +952,20 @@ module ManipulationMsg {
     }
   }
 
+  record stackIndexMapper {
+    param ndOut: int;
+    const axis: int;
+    const arrIdx: int;
+    const mapping: ndOut*int;
+    // 'mapping[axis]' is never read, so whatever placeholder it holds cannot index 'idx' out of bounds
+    proc this(idx: (ndOut-1)*int): ndOut*int {
+      var ret: ndOut*int;
+      for param i in 0..<ndOut do
+        if i == axis then ret[i] = arrIdx; else ret[i] = idx[mapping[i]];
+      return ret;
+    }
+  }
+
   private proc stackedShape(shape: ?N*int, axis: int, nArrays: int): ((N+1)*int, (N+1)*int) {
     var shapeOut: (N+1)*int,
         mapping: (N+1)*int,
@@ -906,6 +983,279 @@ module ManipulationMsg {
     return (shapeOut, mapping);
   }
 
+
+  // https://data-apis.org/array-api/latest/API_specification/generated/array_api.tile.html#array_api.tile
+  // assumes that 'reps' is the same length as the array's shape
+  // this is achieved on the client side by either:
+  //  * reshaping the array to add singleton dimensions (if reps is longer than the array's shape)
+  //  * prepending 1's to reps (if reps is shorter than the array's shape)
+  @arkouda.registerND
+  proc tileMsg(cmd: string, msgArgs: borrowed MessageArgs, st: borrowed SymTab, param nd: int): MsgTuple throws {
+    param pn = Reflection.getRoutineName();
+    const name = msgArgs.getValueOf("name"),
+          reps = msgArgs.get("reps").getTuple(nd),
+          rname = st.nextName();
+    // fetch the input array's generic entry; its runtime dtype selects the instantiation below
+    var gEnt: borrowed GenSymEntry = getGenericTypedArrayEntry(name, st);
+    // tile the input for a concrete element type 't' and register the result under 'rname'
+    proc doTile(type t): MsgTuple throws {
+      const eIn = toSymEntry(gEnt, t, nd),
+            shapeOut = tiledShape(eIn.tupShape, reps);
+      var eOut = st.addEntry(rname, (...shapeOut), t);
+
+      // copy the data from the input array to the output array while tiling:
+      forall idx in eOut.a.domain with (
+        var agg = newSrcAggregator(t),
+        const imap = new tileIndexMapper(nd, eIn.tupShape)
+      ) {
+        const inIdx = imap(if nd == 1 then (idx,) else idx);
+        agg.copy(eOut.a[idx], eIn.a[inIdx]);
+      }
+
+      const repMsg = "created " + st.attrib(rname);
+      mLogger.info(getModuleName(),pn,getLineNumber(),repMsg);
+      return new MsgTuple(repMsg, MsgType.NORMAL);
+    }
+    // dispatch on the runtime dtype; dtypes not listed here are reported as not implemented
+    select gEnt.dtype {
+      when DType.Int64 do return doTile(int);
+      when DType.UInt64 do return doTile(uint);
+      when DType.Float64 do return doTile(real);
+      when DType.Bool do return doTile(bool);
+      otherwise {
+        var errorMsg = notImplementedError(pn,dtype2str(gEnt.dtype));
+        mLogger.error(getModuleName(),pn,getLineNumber(),errorMsg);
+        return new MsgTuple(errorMsg,MsgType.ERROR);
+      }
+    }
+  }
+
+  record tileIndexMapper {
+    param nd: int;
+    const shapeIn: nd*int;
+    // wrap each output coordinate back into the input array's extent along that axis
+    proc this(idx: nd*int): nd*int {
+      var src = idx;
+      for param ax in 0..<nd do src[ax] %= shapeIn[ax];
+      return src;
+    }
+  }
+
+
+  proc tiledShape(shape: ?N*int, reps: N*int): N*int {
+    var tiled = shape;  // start from the input extents, then scale each axis by its repetition count
+    for param ax in 0..<N do tiled[ax] *= reps[ax];
+    return tiled;
+  }
+
+  // https://data-apis.org/array-api/latest/API_specification/generated/array_api.unstack.html
+  // unstack an array into multiple arrays along a specified axis
+  @arkouda.registerND
+  proc unstackMsg(cmd: string, msgArgs: borrowed MessageArgs, st: borrowed SymTab, param nd: int): MsgTuple throws {
+    param pn = Reflection.getRoutineName();
+
+    if nd == 1 {
+      const errMsg = "Cannot unstack a 1D array";
+      mLogger.error(getModuleName(),pn,getLineNumber(),errMsg);
+      return new MsgTuple(errMsg,MsgType.ERROR);
+    }
+    // NOTE: 'rnames' uses a serial 'for' expression so st.nextName() is never called from concurrent tasks
+    const name = msgArgs.getValueOf("name"),
+          axis = msgArgs.get("axis").getPositiveIntValue(nd),
+          numReturnArrays = msgArgs.get("numReturnArrays").getIntValue(),
+          rnames = for i in 0..<numReturnArrays do st.nextName();
+
+    var gEnt: borrowed GenSymEntry = getGenericTypedArrayEntry(name, st);
+    // unstack for a concrete element type 't'; one (nd-1)-dimensional output is created per name in 'rnames'
+    proc doUnstack(type t): MsgTuple throws {
+      const eIn = toSymEntry(gEnt, t, nd),
+            (valid, shapeOut) = unstackedShape(eIn.tupShape, axis, numReturnArrays);
+
+      if !valid {
+        const errMsg = "Unable to unstack array with shape %? along axis %? into %? arrays".doFormat(eIn.tupShape, axis, numReturnArrays);
+        mLogger.error(getModuleName(),pn,getLineNumber(),errMsg);
+        return new MsgTuple(errMsg,MsgType.ERROR);
+      } else {
+        var eOuts = for rn in rnames do (try st.addEntry(rn, (...shapeOut), t));
+
+        // copy the data from the input array to the output arrays while unstacking
+        for arrIdx in 0..<numReturnArrays {
+          forall idx in eOuts[arrIdx].a.domain with (
+            var agg = newSrcAggregator(t),
+            const imap = new unstackIdxMapper(nd, arrIdx, axis)
+          ) {
+            const inIdx = imap(if nd == 2 then (idx,) else idx);
+            agg.copy(eOuts[arrIdx].a[idx], eIn.a[inIdx]);
+          }
+        }
+
+        const repMsg = try! '+'.join([rn in rnames] "created " + st.attrib(rn));
+        mLogger.info(getModuleName(),pn,getLineNumber(),repMsg);
+        return new MsgTuple(repMsg, MsgType.NORMAL);
+      }
+    }
+
+    select gEnt.dtype {
+      when DType.Int64 do return doUnstack(int);
+      when DType.UInt64 do return doUnstack(uint);
+      when DType.Float64 do return doUnstack(real);
+      when DType.Bool do return doUnstack(bool);
+      otherwise {
+        var errorMsg = notImplementedError(pn,dtype2str(gEnt.dtype));
+        mLogger.error(getModuleName(),pn,getLineNumber(),errorMsg);
+        return new MsgTuple(errorMsg,MsgType.ERROR);
+      }
+    }
+  }
+
+  record unstackIdxMapper {
+    param ndIn: int;
+    const arrIdx: int;
+    const axis: int;
+    // rebuild a full input index: 'arrIdx' goes at 'axis', the coordinates of 'idx' fill the rest in order
+    proc this(idx: (ndIn-1)*int): ndIn*int {
+      var full: ndIn*int;
+      var next = 0;
+      for param ax in 0..<ndIn {
+        if ax != axis {
+          full[ax] = idx[next];
+          next += 1;
+        } else {
+          full[ax] = arrIdx;
+        }
+      }
+      return full;
+    }
+  }
+
+  // TODO: should this reduce the array rank by 1, or introduce a singleton dimension for axis?
+  // (the array-api docs are unclear on this point)
+  proc unstackedShape(shape: ?N*int, axis: int, numReturnArrays: int): (bool, (N-1)*int)
+    where N > 1
+  {
+    var shapeOut: (N-1)*int;
+    // one output array is produced per index along 'axis', so the count must equal that extent
+    if numReturnArrays != shape[axis] {
+      return (false, shapeOut);
+    } else {
+      var i = 0;
+      // visit exactly the N input axes (exclusive upper bound), copying every extent except 'axis'
+      for ii in 0..<N {
+        if ii != axis {
+          shapeOut[i] = shape[ii];
+          i += 1;
+        }
+      }
+      return (true, shapeOut);
+    }
+  }
+
+
+  // see: https://numpy.org/doc/stable/reference/generated/numpy.repeat.html#numpy.repeat
+  // flattens the input array and repeats each element 'repeats' times
+  // if 'repeats' is an array, it must have the same number of elements as the input array
+  @arkouda.registerND
+  proc repeatFlatMsg(cmd: string, msgArgs: borrowed MessageArgs, st: borrowed SymTab, param nd: int): MsgTuple throws {
+    param pn = Reflection.getRoutineName();
+    const name = msgArgs.getValueOf("name"),
+          repeats = msgArgs.getValueOf("repeats"),
+          rname = st.nextName();
+
+    var gEnt: borrowed GenSymEntry = getGenericTypedArrayEntry(name, st),
+        gEntRepeats: borrowed GenSymEntry = getGenericTypedArrayEntry(repeats, st);
+
+    proc doRepeatFlat(type t): MsgTuple throws {
+      const eIn = toSymEntry(gEnt, t, nd),
+            eRepeats = toSymEntry(gEntRepeats, int, 1),
+            aFlat = if nd == 1 then eIn.a else flatten(eIn.a);
+
+      if eRepeats.a.size == 1 {
+        const rep = eRepeats.a[0],
+              eOut = st.addEntry(rname, aFlat.size * rep, t);
+
+        // simple case: repeat each element of the input array 'rep' times
+        forall i in aFlat.domain do eOut.a[i*rep..#rep] = aFlat[i];
+
+      } else if eRepeats.a.size == aFlat.size {
+        // repeat each element of the input array by the corresponding element of 'repeats'
+
+        // // serial algorithm:
+        // var start = 0;
+        // for idx in aFlat.domain {
+        //   eOut.a[start..#eRepeats.a[idx]] = aFlat[idx];
+        //   start += eRepeats.a[idx];
+        // }
+
+        // compute the number of repeated elements in the output array owned by each task
+        const nTasksPerLoc = here.maxTaskPar;
+        var nRepsPerTask: [0..<numLocales] [0..<nTasksPerLoc] int;
+        coforall loc in Locales with (ref nRepsPerTask) do on loc {
+          const lsd = aFlat.localSubdomain(),
+                indicesPerTask = lsd.size / nTasksPerLoc;
+          coforall tid in 0..<nTasksPerLoc with (ref nRepsPerTask) {
+            const startIdx = lsd.low + tid * indicesPerTask,
+                  stopIdx = lsd.low + (if tid == nTasksPerLoc - 1 then lsd.size else (tid + 1) * indicesPerTask);
+            // startIdx/stopIdx are global indices: each task's chunk is offset by this locale's lsd.low
+            var sum = 0;
+            for i in startIdx..<stopIdx do
+              sum += eRepeats.a[i];
+            nRepsPerTask[loc.id][tid] = sum;
+          }
+        }
+
+        // compute the output array's size, and where in the output array each locale should start
+        // depositing its repeated elements
+        const nRepsPerLoc = [nt in nRepsPerTask] + reduce nt,
+              locStarts = (+ scan nRepsPerLoc) - nRepsPerLoc,
+              nTotal = + reduce nRepsPerLoc;
+        var eOut = st.addEntry(rname, nTotal, t);
+
+        // copy the repeated elements into the output array
+        coforall loc in Locales with (const ref nRepsPerTask, const ref locStarts) do on loc {
+          const lsd = aFlat.localSubdomain(),
+                indicesPerTask = lsd.size / nTasksPerLoc;
+
+          // compute where in the output array each of this locale's tasks should start depositing
+          // its repeated elements
+          const taskStarts = ((+ scan nRepsPerTask[loc.id]) - nRepsPerTask[loc.id]) + locStarts[loc.id];
+          coforall tid in 0..<nTasksPerLoc {
+            const startIdx = lsd.low + tid * indicesPerTask,
+                  stopIdx = lsd.low + (if tid == nTasksPerLoc - 1 then lsd.size else (tid + 1) * indicesPerTask);
+
+            // copy this task's repeated elements into the output array (same global index range as the counting pass)
+            var outStart = taskStarts[tid];
+
+            for i in startIdx..<stopIdx {
+              eOut.a[outStart..#eRepeats.a[i]] = aFlat[i];
+              outStart += eRepeats.a[i];
+            }
+          }
+        }
+      } else {
+        const errMsg = "Unable to repeat array with shape %? using repeats %?. ".doFormat(eIn.tupShape, eRepeats.tupShape) +
+                       "Repeats must be a scalar or have the same number of elements as the input array";
+        mLogger.error(getModuleName(),pn,getLineNumber(),errMsg);
+        return new MsgTuple(errMsg,MsgType.ERROR);
+      }
+
+      const repMsg = "created " + st.attrib(rname);
+      mLogger.info(getModuleName(),pn,getLineNumber(),repMsg);
+      return new MsgTuple(repMsg, MsgType.NORMAL);
+    }
+
+    select gEnt.dtype {
+      when DType.Int64 do return doRepeatFlat(int);
+      when DType.UInt64 do return doRepeatFlat(uint);
+      when DType.Float64 do return doRepeatFlat(real);
+      when DType.Bool do return doRepeatFlat(bool);
+      otherwise {
+        var errorMsg = notImplementedError(pn,dtype2str(gEnt.dtype));
+        mLogger.error(getModuleName(),pn,getLineNumber(),errorMsg);
+        return new MsgTuple(errorMsg,MsgType.ERROR);
+      }
+    }
+  }
+
   proc getGenericEntries(names: [?d] string, st: borrowed SymTab): [] borrowed GenSymEntry throws {
     var gEnts: [d] borrowed GenSymEntry?;
     for (i, name) in zip(d, names) do gEnts[i] = getGenericTypedArrayEntry(name, st);
diff --git a/src/ReductionMsg.chpl b/src/ReductionMsg.chpl
index 13f1dc90b4..6dfec621f8 100644
--- a/src/ReductionMsg.chpl
+++ b/src/ReductionMsg.chpl
@@ -344,7 +344,7 @@ module ReductionMsg
         var nnzPerTask: [0..<numLocales] [0..<nTasks] int;
         coforall loc in Locales with (ref nnzPerTask) do on loc {
           const locDom = eIn.a.localSubdomain();
-          coforall tid in 0..<nTasks with (ref nnzPerTask) do {
+          coforall tid in 0..<nTasks with (ref nnzPerTask) {
             var nnzTask = 0;
             // TODO: evaluate whether 'subDomChunk' chunking along the largest dimension
             // is the best choice. Perhaps it would be better to always chunk along the
@@ -366,8 +366,8 @@ module ReductionMsg
 
         // populate the arrays with the indices of the non-zero elements
         // TODO: refactor to use aggregation or bulk assignment to avoid fine-grained communication
-        coforall loc in Locales do on loc {
-          const taskStarts = (+ scan nnzPerTask[loc.id]) - nnzPerTask[loc.id] + locStarts[loc.id],
+        coforall loc in Locales with (const ref nnzPerTask, const ref locStarts) do on loc {
+          const taskStarts = ((+ scan nnzPerTask[loc.id]) - nnzPerTask[loc.id]) + locStarts[loc.id],
                 locDom = eIn.a.localSubdomain();
           coforall tid in 0..<nTasks {
             var i = taskStarts[tid];
diff --git a/src/SetMsg.chpl b/src/SetMsg.chpl
index 17c942e62d..8b84a7b1e1 100644
--- a/src/SetMsg.chpl
+++ b/src/SetMsg.chpl
@@ -165,69 +165,4 @@ module SetMsg {
       }
     }
   }
-
-  // TODO: put this in AryUtil or some other common module after merging with #3056
-  private proc unflatten(const ref aFlat: [?d] ?t, shape: ?N*int): [] t throws {
-    var unflat = makeDistArray((...shape), t);
-    const lastRank = unflat.domain.dim(N-1);
-
-    // iterate over each slice of the output array along the last dimension
-    // and copy the data from the corresponding slice of the flat array
-    forall idx in domOffAxis(unflat.domain, N-1) with (const ord = new orderer(unflat.domain.shape)) {
-      var idxTup: (N-1)*int;
-      for i in 0..<(N-1) do idxTup[i] = idx[i];
-      const rrSlice = ((...idxTup), lastRank);
-
-      const low = ((...idxTup), lastRank.low),
-            high = ((...idxTup), lastRank.high),
-            flatSlice = ord.indexToOrder(low)..ord.indexToOrder(high);
-
-      unflat[(...rrSlice)] = aFlat[flatSlice];
-    }
-
-    return unflat;
-  }
-
-  // TODO: put this in AryUtil or some other common module after merging with #3056
-  private proc flatten(const ref a: [?d] ?t): [] t throws
-    where a.rank > 1
-  {
-    var flat = makeDistArray({0..<d.size}, t);
-    const rankLast = d.dim(d.rank-1);
-
-    // iterate over each slice of the input array along the last dimension
-    // and copy the data into the corresponding slice of the flat array
-    forall idx in domOffAxis(d, d.rank-1) with (const ord = new orderer(d.shape)) {
-      var idxTup: (d.rank-1)*int;
-      for i in 0..<(d.rank-1) do idxTup[i] = idx[i];
-      const rrSlice = ((...idxTup), rankLast);
-
-      const low = ((...idxTup), rankLast.low),
-            high = ((...idxTup), rankLast.high),
-            flatSlice = ord.indexToOrder(low)..ord.indexToOrder(high);
-
-      flat[flatSlice] = a[(...rrSlice)];
-    }
-
-    return flat;
-  }
-
-  record orderer {
-    param rank: int;
-    const accumRankSizes: [0..<rank] int;
-
-    proc init(shape: ?N*int) {
-      this.rank = N;
-      const sizesRev = [i in 0..<N] shape[N - i - 1];
-      this.accumRankSizes = * scan sizesRev / sizesRev;
-    }
-
-    // index -> order for the input array's indices
-    // e.g., order = k + (nz * j) + (nz * ny * i)
-    inline proc indexToOrder(idx: rank*int): int {
-      var order = 0;
-      for param i in 0..<rank do order += idx[i] * accumRankSizes[rank - i - 1];
-      return order;
-    }
-  }
 }
diff --git a/src/compat/e-132/ArkoudaAryUtilCompat.chpl b/src/compat/e-132/ArkoudaAryUtilCompat.chpl
index 1ef78b8387..166a43aa2d 100644
--- a/src/compat/e-132/ArkoudaAryUtilCompat.chpl
+++ b/src/compat/e-132/ArkoudaAryUtilCompat.chpl
@@ -60,7 +60,7 @@ module ArkoudaAryUtilCompat {
     label ranks for i in 0..<D.rank {
       for param j in 0..<NA {
         if i == axes[j] {
-          outDims[i] = 0..0;
+          outDims[i] = D.dim(i).low..D.dim(i).low;
           continue ranks;
         }
       }
@@ -77,7 +77,7 @@ module ArkoudaAryUtilCompat {
     label ranks for i in 0..<D.rank {
       for j in 0..<axes.size {
         if i == axes[j] {
-          outDims[i] = 0..0;
+          outDims[i] = D.dim(i).low..D.dim(i).low;
           continue ranks;
         }
       }
diff --git a/src/compat/eq-131/ArkoudaAryUtilCompat.chpl b/src/compat/eq-131/ArkoudaAryUtilCompat.chpl
index 1fca8f4ce7..1075f1c395 100644
--- a/src/compat/eq-131/ArkoudaAryUtilCompat.chpl
+++ b/src/compat/eq-131/ArkoudaAryUtilCompat.chpl
@@ -60,7 +60,7 @@ module ArkoudaAryUtilCompat {
     label ranks for i in 0..<D.rank {
       for param j in 0..<NA {
         if i == axes[j] {
-          outDims[i] = 0..0;
+          outDims[i] = D.dim(i).low..D.dim(i).low;
           continue ranks;
         }
       }
@@ -77,7 +77,7 @@ module ArkoudaAryUtilCompat {
     label ranks for i in 0..<D.rank {
       for j in 0..<axes.size {
         if i == axes[j] {
-          outDims[i] = 0..0;
+          outDims[i] = D.dim(i).low..D.dim(i).low;
           continue ranks;
         }
       }
diff --git a/src/compat/eq-133/ArkoudaAryUtilCompat.chpl b/src/compat/eq-133/ArkoudaAryUtilCompat.chpl
index 884f44339b..f4cd2f18a4 100644
--- a/src/compat/eq-133/ArkoudaAryUtilCompat.chpl
+++ b/src/compat/eq-133/ArkoudaAryUtilCompat.chpl
@@ -60,7 +60,7 @@ module ArkoudaAryUtilCompat {
     label ranks for i in 0..<D.rank {
       for param j in 0..<NA {
         if i == axes[j] {
-          outDims[i] = 0..0;
+          outDims[i] = D.dim(i).low..D.dim(i).low;
           continue ranks;
         }
       }
@@ -77,7 +77,7 @@ module ArkoudaAryUtilCompat {
     label ranks for i in 0..<D.rank {
       for j in 0..<axes.size {
         if i == axes[j] {
-          outDims[i] = 0..0;
+          outDims[i] = D.dim(i).low..D.dim(i).low;
           continue ranks;
         }
       }
diff --git a/src/compat/eq-134/ArkoudaAryUtilCompat.chpl b/src/compat/eq-134/ArkoudaAryUtilCompat.chpl
index 884f44339b..f4cd2f18a4 100644
--- a/src/compat/eq-134/ArkoudaAryUtilCompat.chpl
+++ b/src/compat/eq-134/ArkoudaAryUtilCompat.chpl
@@ -60,7 +60,7 @@ module ArkoudaAryUtilCompat {
     label ranks for i in 0..<D.rank {
       for param j in 0..<NA {
         if i == axes[j] {
-          outDims[i] = 0..0;
+          outDims[i] = D.dim(i).low..D.dim(i).low;
           continue ranks;
         }
       }
@@ -77,7 +77,7 @@ module ArkoudaAryUtilCompat {
     label ranks for i in 0..<D.rank {
       for j in 0..<axes.size {
         if i == axes[j] {
-          outDims[i] = 0..0;
+          outDims[i] = D.dim(i).low..D.dim(i).low;
           continue ranks;
         }
       }
diff --git a/src/compat/ge-20/ArkoudaAryUtilCompat.chpl b/src/compat/ge-20/ArkoudaAryUtilCompat.chpl
index 884f44339b..cb6da77dee 100644
--- a/src/compat/ge-20/ArkoudaAryUtilCompat.chpl
+++ b/src/compat/ge-20/ArkoudaAryUtilCompat.chpl
@@ -60,7 +60,7 @@ module ArkoudaAryUtilCompat {
     label ranks for i in 0..<D.rank {
       for param j in 0..<NA {
         if i == axes[j] {
-          outDims[i] = 0..0;
+          outDims[i] = D.dim(i).low..D.dim(i).low;
           continue ranks;
         }
       }
@@ -77,7 +77,7 @@ module ArkoudaAryUtilCompat {
     label ranks for i in 0..<D.rank {
       for j in 0..<axes.size {
         if i == axes[j] {
-          outDims[i] = 0..0;
+          outDims[i] = D.dim(i).low..D.dim(i).low;
           continue ranks;
         }
       }
@@ -95,7 +95,7 @@ module ArkoudaAryUtilCompat {
 
     (if 'nChunks' is greater than the size of the largest dimension, the
     first 'nChunks-1' chunks will be empty, and the last chunk will contain
-    the entire domain)
+    the entire set of indices along that dimension)
   */
   proc subDomChunk(dom: domain(?), chunkIdx: int, nChunks: int): domain(?) {
     const dimSizes = [i in 0..<dom.rank] dom.dim(i).size,
diff --git a/tests/array_api/array_manipulation.py b/tests/array_api/array_manipulation.py
index 2ef336fc0f..05f21ae9f3 100644
--- a/tests/array_api/array_manipulation.py
+++ b/tests/array_api/array_manipulation.py
@@ -6,8 +6,14 @@
 import numpy as np
 
 SEED = 12345
+s = SEED
+
+
+def randArr(shape):  # build a random int64 Array of the given shape, seeded from `s`
+    global s
+    s += 2  # bump the module-level seed so every call passes a different seed value
+    return Array.asarray(ak.randint(0, 100, shape, dtype=ak.int64, seed=s))
 
-# requires the server to be built with 3D array support
 
 class ManipulationTests(ArkoudaTest):
     def test_broadcast(self):
@@ -22,64 +28,64 @@ def test_broadcast(self):
         self.assertEqual(abc[2].shape, (5, 6, 10))
 
     def test_concat(self):
-        a = Array.ones((5, 3, 10))
-        b = Array.ones((5, 3, 2))
-        c = Array.ones((5, 3, 17))
+        a = randArr((5, 3, 10))
+        b = randArr((5, 3, 2))
+        c = randArr((5, 3, 17))
 
         abcConcat = Array.concat([a, b, c], axis=2)
+        abcNP = np.concatenate([a.to_ndarray(), b.to_ndarray(), c.to_ndarray()], axis=2)
         self.assertEqual(abcConcat.shape, (5, 3, 29))
-        self.assertTrue(Array.all(abcConcat))
+        self.assertEqual(abcConcat.tolist(), abcNP.tolist())
 
-        d = Array.ones((10, 8))
-        e = Array.ones((11, 8))
-        f = Array.ones((12, 8))
+        d = randArr((10, 8))
+        e = randArr((11, 8))
+        f = randArr((12, 8))
 
         defConcat = Array.concat([d, e, f])
+        defNP = np.concatenate([d.to_ndarray(), e.to_ndarray(), f.to_ndarray()])
         self.assertEqual(defConcat.shape, (33, 8))
-        self.assertTrue(Array.all(defConcat))
+        self.assertEqual(defConcat.tolist(), defNP.tolist())
 
         defConcatNeg = Array.concat((d, e, f), axis=-2)
         self.assertEqual(defConcatNeg.shape, (33, 8))
-        self.assertTrue(Array.all(defConcatNeg))
+        self.assertEqual(defConcatNeg.tolist(), defNP.tolist())
 
-        h = Array.ones((1, 2, 3))
-        i = Array.ones((1, 2, 3))
-        j = Array.ones((1, 2, 3))
+        h = randArr((1, 2, 3))
+        i = randArr((1, 2, 3))
+        j = randArr((1, 2, 3))
 
         hijConcat = Array.concat((h, i, j), axis=None)
+        hijNP = np.concatenate([h.to_ndarray(), i.to_ndarray(), j.to_ndarray()], axis=None)
         self.assertEqual(hijConcat.shape, (18,))
-        self.assertTrue(Array.all(hijConcat))
+        self.assertEqual(hijConcat.tolist(), hijNP.tolist())
 
     def test_expand_dims(self):
-        a = Array.asarray(ak.randint(0, 100, (5, 3), dtype=ak.int64, seed=SEED))
+        a = randArr((5, 3))
         alist = a.tolist()
 
-        # TODO: once rank reducing slices are implemented,
-        # the squeeze operations can be removed below:
-
         a0 = Array.expand_dims(a, axis=0)
         self.assertEqual(a0.shape, (1, 5, 3))
-        self.assertEqual(Array.squeeze(a0[0, :, :], axis=0).tolist(), alist)
+        self.assertEqual(a0[0, ...].tolist(), alist)
 
         a1 = Array.expand_dims(a, axis=1)
         self.assertEqual(a1.shape, (5, 1, 3))
-        self.assertEqual(Array.squeeze(a1[:, 0, :], axis=1).tolist(), alist)
+        self.assertEqual(a1[:, 0, :].tolist(), alist)
 
         a2 = Array.expand_dims(a, axis=2)
         self.assertEqual(a2.shape, (5, 3, 1))
-        self.assertEqual(Array.squeeze(a2[:, :, 0], axis=2).tolist(), alist)
+        self.assertEqual(a2[..., 0].tolist(), alist)
 
         aNeg1 = Array.expand_dims(a, axis=-1)
         self.assertEqual(aNeg1.shape, (5, 3, 1))
-        self.assertEqual(Array.squeeze(aNeg1[:, :, 0], axis=2).tolist(), alist)
+        self.assertEqual(aNeg1[:, :, 0].tolist(), alist)
 
         aNeg2 = Array.expand_dims(a, axis=-2)
         self.assertEqual(aNeg2.shape, (5, 1, 3))
-        self.assertEqual(Array.squeeze(aNeg2[:, 0, :], axis=1).tolist(), alist)
+        self.assertEqual(aNeg2[:, 0, :].tolist(), alist)
 
         aNeg3 = Array.expand_dims(a, axis=-3)
         self.assertEqual(aNeg3.shape, (1, 5, 3))
-        self.assertEqual(Array.squeeze(aNeg3[0, :, :], axis=0).tolist(), alist)
+        self.assertEqual(aNeg3[0, :, :].tolist(), alist)
 
         with self.assertRaises(IndexError):
             Array.expand_dims(a, axis=3)
@@ -100,7 +106,7 @@ def test_flip(self):
         r = Array.asarray(ak.randint(0, 100, (7, 8, 9), dtype=ak.int64, seed=SEED))
         rn = np.asarray(r.tolist())
 
-        f1 = Array.flip(r) # flip all axes
+        f1 = Array.flip(r)  # flip all axes
         f2 = Array.flip(r, axis=0)
         f3 = Array.flip(r, axis=1)
         f4 = Array.flip(r, axis=(0, 2))
@@ -127,7 +133,7 @@ def test_flip(self):
             Array.flip(r, axis=-4)
 
     def test_permute_dims(self):
-        r = Array.asarray(ak.randint(0, 100, (7, 8, 9), dtype=ak.int64, seed=SEED))
+        r = randArr((7, 8, 9))
 
         p1 = Array.permute_dims(r, (0, 1, 2))
         p2 = Array.permute_dims(r, (2, 1, 0))
@@ -153,7 +159,7 @@ def test_permute_dims(self):
             Array.permute_dims(r, (0, 1, -4))
 
     def test_reshape(self):
-        r = Array.asarray(ak.randint(0, 100, (2, 6, 12), dtype=ak.int64, seed=SEED))
+        r = randArr((2, 6, 12))
         nr = np.asarray(r.tolist())
 
         for shape in [(12, 12), (3, 12, 4), (2, 72), (6, 2, 12), (144,)]:
@@ -190,7 +196,7 @@ def test_roll(self):
         self.assertEqual(b2.tolist(), [3, 4, 5, 6, 7, 8, 9, 0, 1, 2])
 
         # ND case
-        r = Array.asarray(ak.randint(0, 100, (7, 8, 9), dtype=ak.int64, seed=SEED))
+        r = randArr((7, 8, 9))
 
         f1 = Array.roll(r, 3)
         f2 = Array.roll(r, -3)
@@ -223,10 +229,10 @@ def test_roll(self):
             Array.roll(r, 3, axis=-4)
 
     def test_squeeze(self):
-        r1 = Array.asarray(ak.randint(0, 100, (1, 2, 3), dtype=ak.int64, seed=SEED))
-        r2 = Array.asarray(ak.randint(0, 100, (2, 1, 3), dtype=ak.int64, seed=SEED))
-        r3 = Array.asarray(ak.randint(0, 100, (2, 3, 1), dtype=ak.int64, seed=SEED))
-        r4 = Array.asarray(ak.randint(0, 100, (1, 3, 1), dtype=ak.int64, seed=SEED))
+        r1 = randArr((1, 2, 3))
+        r2 = randArr((2, 1, 3))
+        r3 = randArr((2, 3, 1))
+        r4 = randArr((1, 3, 1))
 
         s1 = Array.squeeze(r1, axis=0)
         s2 = Array.squeeze(r2, axis=1)
@@ -253,3 +259,51 @@ def test_squeeze(self):
 
         with self.assertRaises(ValueError):
             Array.squeeze(r4, axis=1)
+
+    def test_stack_unstack(self):
+        """
+        Check Array.stack against np.stack and that Array.unstack inverts it.
+
+        Both the leading axis (0) and the trailing axis (-1) are exercised.
+        """
+        # three random (5, 4) inputs, each drawn with its own seed
+        arrays = [randArr((5, 4)) for _ in range(3)]
+        npArrays = [arr.to_ndarray() for arr in arrays]
+
+        # (stacking axis, expected shape of the stacked result)
+        cases = [(0, (3, 5, 4)), (-1, (5, 4, 3))]
+
+        for axis, expectedShape in cases:
+            stacked = Array.stack(arrays, axis=axis)
+            npStacked = np.stack(npArrays, axis=axis)
+            self.assertEqual(stacked.shape, expectedShape)
+            self.assertEqual(stacked.tolist(), npStacked.tolist())
+
+            # unstacking along the same axis must give back the three inputs in order
+            first, second, third = Array.unstack(stacked, axis=axis)
+            recovered = (first, second, third)
+            for got, want in zip(recovered, arrays):
+                self.assertEqual(got.tolist(), want.tolist())
+
+    def test_tile(self):
+        """Check Array.tile against np.tile for several repetition tuples."""
+        a = randArr((2, 3))
+
+        # reps with fewer, as many, and more entries than the 2D input has dimensions
+        for reps in [(2, 1), (1, 2), (2, 2), (1, 1, 3), (3,)]:
+            at = Array.tile(a, reps)
+            npat = np.tile(np.asarray(a), reps)
+            self.assertEqual(at.shape, npat.shape)
+            self.assertEqual(at.tolist(), npat.tolist())
+
+    def test_repeat(self):
+        """Compare Array.repeat with np.repeat for a scalar count and an array of counts."""
+        values = randArr((5, 10))
+        counts = randArr((50,))  # 50 counts, matching the 5 * 10 elements of `values`
+        npValues = np.asarray(values)
+
+        self.assertEqual(Array.repeat(values, 2).tolist(), np.repeat(npValues, 2).tolist())
+
+        repeated = Array.repeat(values, counts)
+        npRepeated = np.repeat(npValues, np.asarray(counts))
+        self.assertEqual(repeated.tolist(), npRepeated.tolist())