diff --git a/stumpy/aamp.py b/stumpy/aamp.py index 7443db774..ca253c9af 100644 --- a/stumpy/aamp.py +++ b/stumpy/aamp.py @@ -255,7 +255,8 @@ def _aamp( return np.power(P[0, :, :], 1.0 / p), I[0, :, :] -def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0): +def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): + # function needs to be changed to return top-k matrix profile """ Compute the non-normalized (i.e., without z-normalization) matrix profile @@ -282,6 +283,11 @@ def aamp(T_A, m, T_B=None, ignore_trivial=True, p=2.0): p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Returns ------- out : numpy.ndarray diff --git a/stumpy/aamped.py b/stumpy/aamped.py index ed7251a48..3dd346275 100644 --- a/stumpy/aamped.py +++ b/stumpy/aamped.py @@ -12,7 +12,8 @@ logger = logging.getLogger(__name__) -def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0): +def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0, k=1): + # function needs to be revised to return top-k matrix profile """ Compute the non-normalized (i.e., without z-normalization) matrix profile @@ -46,6 +47,11 @@ def aamped(dask_client, T_A, m, T_B=None, ignore_trivial=True, p=2.0): p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Returns ------- out : numpy.ndarray diff --git a/stumpy/aampi.py b/stumpy/aampi.py index be3e7bd6d..1953aa61f 100644 --- a/stumpy/aampi.py +++ b/stumpy/aampi.py @@ -8,6 +8,7 @@ class aampi: + # needs to be enhanced to support top-k matrix profile """ Compute an incremental non-normalized (i.e., without z-normalization) matrix profile for streaming data @@ -28,6 +29,11 @@ class aampi: p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Attributes ---------- P_ : numpy.ndarray @@ -62,7 +68,7 @@ class aampi: Note that we have extended this algorithm for AB-joins as well. """ - def __init__(self, T, m, egress=True, p=2.0): + def __init__(self, T, m, egress=True, p=2.0, k=1): """ Initialize the `stumpi` object @@ -81,6 +87,12 @@ def __init__(self, T, m, egress=True, p=2.0): p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + + + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix + profile. Note that this will increase the total computational time and + memory usage when k > 1. """ self._T = core._preprocess(T) core.check_window_size(m, max_size=self._T.shape[-1]) diff --git a/stumpy/core.py b/stumpy/core.py index 3f05c60c0..ef99a0746 100644 --- a/stumpy/core.py +++ b/stumpy/core.py @@ -209,6 +209,20 @@ def _gpu_aamp_stimp_driver_not_found(*args, **kwargs): # pragma: no cover driver_not_found() +def _gpu_searchsorted_left_driver_not_found(*args, **kwargs): # pragma: no cover + """ + Dummy function to raise CudaSupportError driver not found error. 
+ """ + driver_not_found() + + +def _gpu_searchsorted_right_driver_not_found(*args, **kwargs): # pragma: no cover + """ + Dummy function to raise CudaSupportError driver not found error. + """ + driver_not_found() + + def get_pkg_name(): # pragma: no cover """ Return package name. @@ -240,7 +254,7 @@ def rolling_window(a, window): return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) -def z_norm(a, axis=0): +def z_norm(a, axis=0, threshold=config.STUMPY_STDDEV_THRESHOLD): """ Calculate the z-normalized input array `a` by subtracting the mean and dividing by the standard deviation along a given axis. @@ -253,13 +267,16 @@ def z_norm(a, axis=0): axis : int, default 0 NumPy array axis + threshold : float, default to config.STUMPY_STDDEV_THRESHOLD + A non-nan std value being less than `threshold` will be replaced with 1.0 + Returns ------- output : numpy.ndarray An array with z-normalized values computed along a specified axis. """ std = np.std(a, axis, keepdims=True) - std[std == 0] = 1 + std[np.less(std, threshold, where=~np.isnan(std))] = 1.0 return (a - np.mean(a, axis, keepdims=True)) / std @@ -2559,6 +2576,201 @@ def _select_P_ABBA_value(P_ABBA, k, custom_func=None): return MPdist +@njit +def _merge_topk_PI(PA, PB, IA, IB): + """ + Merge two top-k matrix profiles `PA` and `PB`, and update `PA` (in place). + When the inputs are 1D arrays, PA[i] is updated if it is greater than PB[i] and + IA[i] != IB[i]. In such case, PA[i] and IA[i] are replaced with PB[i] and IB[i], + respectively. (Note that it might happen that IA[i]==IB[i] but PA[i] != PB[i]. + This situation can occur if there is slight imprecision in numerical calculations. + In that case, we do not update PA[i] and IA[i]. While updating PA[i] and IA[i] + is harmless in this case, we avoid doing that so to be consistent with the merging + process when the inputs are 2D arrays) + When the inputs are 2D arrays, we always prioritize the values of `PA` over the + values of `PB` in case of ties. (i.e., values from `PB` are always inserted to + the right of values from `PA`). Also, update `IA` accordingly. In case of + overlapping values between two arrays IA[i] and IB[i], the ones in IB[i] (and + their corresponding values in PB[i]) are ignored throughout the updating process + of IA[i] (and PA[i]). + + Unlike `_merge_topk_ρI`, where `top-k` largest values are kept, this function + keeps `top-k` smallest values. + + Parameters + ---------- + PA : numpy.ndarray + A (top-k) matrix profile where values in each row are sorted in ascending + order. `PA` must be 1- or 2-dimensional. + + PB : numpy.ndarray + A (top-k) matrix profile where values in each row are sorted in ascending + order. `PB` must have the same shape as `PA`. + + IA : numpy.ndarray + A (top-k) matrix profile indices corresponding to `PA` + + IB : numpy.ndarray + A (top-k) matrix profile indices corresponding to `PB` + + Returns + ------- + None + """ + if PA.ndim == 1: + mask = (PB < PA) & (IB != IA) + PA[mask] = PB[mask] + IA[mask] = IB[mask] + else: + k = PA.shape[1] + tmp_P = np.empty(k, dtype=np.float64) + tmp_I = np.empty(k, dtype=np.int64) + for i in range(PA.shape[0]): + overlap = set(IB[i]).intersection(set(IA[i])) + aj, bj = 0, 0 + idx = 0 + # 2 * k iterations are required to traverse both A and B if needed. 
+ for _ in range(2 * k):
+ if idx >= k:
+ break
+ if bj < k and PB[i, bj] < PA[i, aj]:
+ if IB[i, bj] not in overlap:
+ tmp_P[idx] = PB[i, bj]
+ tmp_I[idx] = IB[i, bj]
+ idx += 1
+ bj += 1
+ else:
+ tmp_P[idx] = PA[i, aj]
+ tmp_I[idx] = IA[i, aj]
+ idx += 1
+ aj += 1
+
+ PA[i] = tmp_P
+ IA[i] = tmp_I
+
+
+@njit
+def _merge_topk_ρI(ρA, ρB, IA, IB):
+ """
+ Merge two top-k Pearson profiles `ρA` and `ρB`, and update `ρA` (in place).
+ When the inputs are 1D arrays, ρA[i] is updated if it is less than ρB[i] and
+ IA[i] != IB[i]. In such a case, ρA[i] and IA[i] are replaced with ρB[i] and IB[i],
+ respectively. (Note that it might happen that IA[i]==IB[i] but ρA[i] != ρB[i].
+ This situation can occur if there is slight imprecision in numerical calculations.
+ In that case, we do not update ρA[i] and IA[i]. While updating ρA[i] and IA[i]
+ is harmless in this case, we avoid doing that so as to be consistent with the
+ merging process when the inputs are 2D arrays.)
+ When the inputs are 2D arrays, we always prioritize the values of `ρA` over
+ the values of `ρB` in case of ties (i.e., values from `ρB` are always inserted
+ to the left of values from `ρA`). Also, update `IA` accordingly. In case of
+ overlapping values between the two arrays IA[i] and IB[i], the ones in IB[i] (and
+ their corresponding values in ρB[i]) are ignored throughout the updating process
+ of IA[i] (and ρA[i]).
+
+ Unlike `_merge_topk_PI`, where the `top-k` smallest values are kept, this function
+ keeps the `top-k` largest values.
+
+ Parameters
+ ----------
+ ρA : numpy.ndarray
+ A (top-k) Pearson profile where values in each row are sorted in ascending
+ order. `ρA` must be 1- or 2-dimensional.
+
+ ρB : numpy.ndarray
+ A (top-k) Pearson profile, where values in each row are sorted in ascending
+ order. `ρB` must have the same shape as `ρA`.
+
+ IA : numpy.ndarray
+ The (top-k) matrix profile indices corresponding to `ρA`
+
+ IB : numpy.ndarray
+ The (top-k) matrix profile indices corresponding to `ρB`
+
+ Returns
+ -------
+ None
+ """
+ if ρA.ndim == 1:
+ mask = (ρB > ρA) & (IB != IA)
+ ρA[mask] = ρB[mask]
+ IA[mask] = IB[mask]
+ else:
+ k = ρA.shape[1]
+ tmp_ρ = np.empty(k, dtype=np.float64)
+ tmp_I = np.empty(k, dtype=np.int64)
+ last_idx = k - 1
+ for i in range(len(ρA)):
+ overlap = set(IB[i]).intersection(set(IA[i]))
+ aj, bj = last_idx, last_idx
+ idx = last_idx
+ # 2 * k iterations are required to traverse both A and B if needed.
+ for _ in range(2 * k):
+ if idx < 0:
+ break
+ if bj >= 0 and ρB[i, bj] > ρA[i, aj]:
+ if IB[i, bj] not in overlap:
+ tmp_ρ[idx] = ρB[i, bj]
+ tmp_I[idx] = IB[i, bj]
+ idx -= 1
+ bj -= 1
+ else:
+ tmp_ρ[idx] = ρA[i, aj]
+ tmp_I[idx] = IA[i, aj]
+ idx -= 1
+ aj -= 1
+
+ ρA[i] = tmp_ρ
+ IA[i] = tmp_I
+
+
+@njit
+def _shift_insert_at_index(a, idx, v, shift="right"):
+ """
+ If `shift=right` (default), all elements in `a[idx:]` are shifted to the right by
+ one element, `v` is inserted at index `idx`, and the last element is discarded.
+ If `shift=left`, all elements in `a[:idx]` are shifted to the left by one element,
+ `v` is inserted at index `idx-1`, and the first element is discarded. In both cases,
+ `a` is updated in place and its length remains unchanged.
+
+ Note that all unrecognized `shift` inputs will default to `shift=right`.
+
+ Parameters
+ ----------
+ a : numpy.ndarray
+ A 1d array
+
+ idx : int
+ The index at which the value `v` should be inserted. This can be any
+ integer number from `0` to `len(a)`. When `idx=len(a)` and `shift="right"`,
+ OR when `idx=0` and `shift="left"`, then no change will occur on
+ the input array `a`.
+
+ v : float
+ The value that should be inserted into array `a` at index `idx`
+
+ shift : str, default "right"
+ Indicates whether the shifting of elements should be towards
+ the right or left. If `shift="right"` (default), all elements in `a[idx:]`
+ are shifted to the right by one element. If `shift="left"`, all elements
+ in `a[:idx]` are shifted to the left by one element.
+
+ Returns
+ -------
+ None
+ """
+ if shift == "left":
+ if 0 < idx <= len(a):
+ a[: idx - 1] = a[1:idx]
+ # elements were shifted to the left, thus the insertion index becomes
+ # `idx-1`
+ a[idx - 1] = v
+ else:
+ if 0 <= idx < len(a):
+ a[idx + 1 :] = a[idx:-1]
+ a[idx] = v
+
+
 def _check_P(P, threshold=1e-6):
 """
 Check if the 1-dimensional matrix profile values are too small and
diff --git a/stumpy/gpu_aamp.py b/stumpy/gpu_aamp.py
index b4508a473..7bf783525 100644
--- a/stumpy/gpu_aamp.py
+++ b/stumpy/gpu_aamp.py
@@ -339,7 +339,9 @@ def _gpu_aamp(
 return profile_fname, indices_fname
-def gpu_aamp(T_A, m, T_B=None, ignore_trivial=True, device_id=0, p=2.0):
+def gpu_aamp(T_A, m, T_B=None, ignore_trivial=True, device_id=0, p=2.0, k=1):
+ # function needs to be revised to return (top-k) matrix profile and
+ # matrix profile indices
 """
 Compute the non-normalized (i.e., without z-normalization) matrix profile
 with one or more GPU devices
@@ -375,6 +377,11 @@ def gpu_aamp(T_A, m, T_B=None, ignore_trivial=True, device_id=0, p=2.0):
 p : float, default 2.0
 The p-norm to apply for computing the Minkowski distance.
+ k : int, default 1
+ The number of top `k` smallest distances used to construct the matrix profile.
+ Note that this will increase the total computational time and memory usage
+ when k > 1.
+
 Returns
 -------
 out : numpy.ndarray
diff --git a/stumpy/gpu_stump.py b/stumpy/gpu_stump.py
index e50546cca..d0890cceb 100644
--- a/stumpy/gpu_stump.py
+++ b/stumpy/gpu_stump.py
@@ -15,9 +15,100 @@
 logger = logging.getLogger(__name__)
+@cuda.jit(device=True)
+def _gpu_searchsorted_left(a, v, bfs, nlevel):
+ """
+ A device function, equivalent to numpy.searchsorted(a, v, side='left')
+
+ Parameters
+ ----------
+ a : numpy.ndarray
+ 1-dim array sorted in ascending order.
+
+ v : float
+ Value to insert into array `a`
+
+ bfs : numpy.ndarray
+ The breadth-first-search indices where the missing leaves of its corresponding
+ binary search tree are filled with -1.
+
+ nlevel : int
+ The number of levels in the binary search tree from which the array
+ `bfs` is obtained.
+
+ Returns
+ -------
+ idx : int
+ The index of the insertion point
+ """
+ n = a.shape[0]
+ idx = 0
+ for level in range(nlevel):
+ if v <= a[bfs[idx]]:
+ next_idx = 2 * idx + 1
+ else:
+ next_idx = 2 * idx + 2
+
+ if level == nlevel - 1 or bfs[next_idx] < 0:
+ if v <= a[bfs[idx]]:
+ idx = max(bfs[idx], 0)
+ else:
+ idx = min(bfs[idx] + 1, n)
+ break
+ idx = next_idx
+
+ return idx
+
+
+@cuda.jit(device=True)
+def _gpu_searchsorted_right(a, v, bfs, nlevel):
+ """
+ A device function, equivalent to numpy.searchsorted(a, v, side='right')
+
+ Parameters
+ ----------
+ a : numpy.ndarray
+ 1-dim array sorted in ascending order.
+
+ v : float
+ Value to insert into array `a`
+
+ bfs : numpy.ndarray
+ The breadth-first-search indices where the missing leaves of its corresponding
+ binary search tree are filled with -1.
+ + nlevel : int + The number of levels in the binary search tree from which the array + `bfs` is obtained. + + Returns + ------- + idx : int + The index of the insertion point + """ + n = a.shape[0] + idx = 0 + for level in range(nlevel): + if v < a[bfs[idx]]: + next_idx = 2 * idx + 1 + else: + next_idx = 2 * idx + 2 + + if level == nlevel - 1 or bfs[next_idx] < 0: + if v < a[bfs[idx]]: + idx = max(bfs[idx], 0) + else: + idx = min(bfs[idx] + 1, n) + break + idx = next_idx + + return idx + + @cuda.jit( "(i8, f8[:], f8[:], i8, f8[:], f8[:], f8[:], f8[:], f8[:]," - "f8[:], f8[:], i8, b1, i8, f8[:, :], i8[:, :], b1)" + "f8[:], f8[:], i8, b1, i8, f8[:, :], f8[:], f8[:], i8[:, :], i8[:], i8[:]," + "b1, i8[:], i8, i2)" ) def _compute_and_update_PI_kernel( i, @@ -31,12 +122,19 @@ def _compute_and_update_PI_kernel( Σ_T, μ_Q, σ_Q, - k, + w, ignore_trivial, excl_zone, profile, + profile_L, + profile_R, indices, + indices_L, + indices_R, compute_QT, + bfs, + nlevel, + k, ): """ A Numba CUDA kernel to update the matrix profile and matrix profile indices @@ -44,7 +142,7 @@ def _compute_and_update_PI_kernel( Parameters ---------- i : int - sliding window `i` + Sliding window `i` T_A : numpy.ndarray The time series or sequence for which to compute the dot product @@ -79,7 +177,7 @@ def _compute_and_update_PI_kernel( σ_Q : numpy.ndarray Standard deviation of the query sequence, `Q` - k : int + w : int The total number of sliding windows to iterate over ignore_trivial : bool @@ -91,18 +189,39 @@ def _compute_and_update_PI_kernel( sliding window profile : numpy.ndarray - Matrix profile. The first column consists of the global matrix profile, - the second column consists of the left matrix profile, and the third - column consists of the right matrix profile. + The (top-k) matrix profile, sorted in ascending order per row + + profile_L : numpy.ndarray + The (top-1) left matrix profile + + profile_R : numpy.ndarray + The (top-1) right matrix profile indices : numpy.ndarray - The first column consists of the matrix profile indices, the second - column consists of the left matrix profile indices, and the third - column consists of the right matrix profile indices. + The (top-k) matrix profile indices + + indices_L : numpy.ndarray + The (top-1) left matrix profile indices + + indices_R : numpy.ndarray + The (top-1) right matrix profile indices compute_QT : bool A boolean flag for whether or not to compute QT + bfs : numpy.ndarray + The breadth-first-search indices where the missing leaves of its corresponding + binary search tree are filled with -1. + + nlevel : int + The number of levels in the binary search tree from which the array + `bfs` is obtained. + + k : int + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. 
+ Returns ------- None @@ -126,7 +245,7 @@ def _compute_and_update_PI_kernel( for j in range(start, QT_out.shape[0], stride): zone_start = max(0, j - excl_zone) - zone_stop = min(k, j + excl_zone) + zone_stop = min(w, j + excl_zone) if compute_QT: QT_out[j] = ( @@ -157,16 +276,21 @@ def _compute_and_update_PI_kernel( if ignore_trivial: if i <= zone_stop and i >= zone_start: p_norm = np.inf - if p_norm < profile[j, 1] and i < j: - profile[j, 1] = p_norm - indices[j, 1] = i - if p_norm < profile[j, 2] and i > j: - profile[j, 2] = p_norm - indices[j, 2] = i + if p_norm < profile_L[j] and i < j: + profile_L[j] = p_norm + indices_L[j] = i + if p_norm < profile_R[j] and i > j: + profile_R[j] = p_norm + indices_R[j] = i - if p_norm < profile[j, 0]: - profile[j, 0] = p_norm - indices[j, 0] = i + if p_norm < profile[j, -1]: + idx = _gpu_searchsorted_right(profile[j], p_norm, bfs, nlevel) + for g in range(k - 1, idx, -1): + profile[j, g] = profile[j, g - 1] + indices[j, g] = indices[j, g - 1] + + profile[j, idx] = p_norm + indices[j, idx] = i def _gpu_stump( @@ -181,10 +305,11 @@ def _gpu_stump( QT_first_fname, μ_Q_fname, σ_Q_fname, - k, + w, ignore_trivial=True, range_start=1, device_id=0, + k=1, ): """ A Numba CUDA version of STOMP for parallel computation of the @@ -235,7 +360,7 @@ def _gpu_stump( The file name for the standard deviation of the query sequence, `Q`, relative to the current sliding window - k : int + w : int The total number of sliding windows to iterate over ignore_trivial : bool @@ -249,6 +374,11 @@ def _gpu_stump( device_id : int The (GPU) device number to use. The default value is `0`. + k : int + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Returns ------- profile_fname : str @@ -289,7 +419,7 @@ def _gpu_stump( Note that left and right matrix profiles are only available for self-joins. """ threads_per_block = config.STUMPY_THREADS_PER_BLOCK - blocks_per_grid = math.ceil(k / threads_per_block) + blocks_per_grid = math.ceil(w / threads_per_block) T_A = np.load(T_A_fname, allow_pickle=False) T_B = np.load(T_B_fname, allow_pickle=False) @@ -300,6 +430,10 @@ def _gpu_stump( μ_Q = np.load(μ_Q_fname, allow_pickle=False) σ_Q = np.load(σ_Q_fname, allow_pickle=False) + device_bfs = cuda.to_device(core._bfs_indices(k, fill_value=-1)) + nlevel = np.floor(np.log2(k) + 1).astype(np.int64) + # number of levels in binary search tree from which `bfs` is constructed. 
+ with cuda.gpus[device_id]: device_T_A = cuda.to_device(T_A) device_QT_odd = cuda.to_device(QT) @@ -316,11 +450,22 @@ def _gpu_stump( device_M_T = cuda.to_device(M_T) device_Σ_T = cuda.to_device(Σ_T) - profile = np.full((k, 3), np.inf, dtype=np.float64) - indices = np.full((k, 3), -1, dtype=np.int64) + profile = np.full((w, k), np.inf, dtype=np.float64) + indices = np.full((w, k), -1, dtype=np.int64) + + profile_L = np.full(w, np.inf, dtype=np.float64) + indices_L = np.full(w, -1, dtype=np.int64) + + profile_R = np.full(w, np.inf, dtype=np.float64) + indices_R = np.full(w, -1, dtype=np.int64) device_profile = cuda.to_device(profile) + device_profile_L = cuda.to_device(profile_L) + device_profile_R = cuda.to_device(profile_R) device_indices = cuda.to_device(indices) + device_indices_L = cuda.to_device(indices_L) + device_indices_R = cuda.to_device(indices_R) + _compute_and_update_PI_kernel[blocks_per_grid, threads_per_block]( range_start - 1, device_T_A, @@ -333,12 +478,19 @@ def _gpu_stump( device_Σ_T, device_μ_Q, device_σ_Q, - k, + w, ignore_trivial, excl_zone, device_profile, + device_profile_L, + device_profile_R, device_indices, + device_indices_L, + device_indices_R, False, + device_bfs, + nlevel, + k, ) for i in range(range_start, range_stop): @@ -354,27 +506,52 @@ def _gpu_stump( device_Σ_T, device_μ_Q, device_σ_Q, - k, + w, ignore_trivial, excl_zone, device_profile, + device_profile_L, + device_profile_R, device_indices, + device_indices_L, + device_indices_R, True, + device_bfs, + nlevel, + k, ) profile = device_profile.copy_to_host() + profile_L = device_profile_L.copy_to_host() + profile_R = device_profile_R.copy_to_host() indices = device_indices.copy_to_host() - profile = np.sqrt(profile) + indices_L = device_indices_L.copy_to_host() + indices_R = device_indices_R.copy_to_host() + + profile[:, :] = np.sqrt(profile) + profile_L[:] = np.sqrt(profile_L) + profile_R[:] = np.sqrt(profile_R) profile_fname = core.array_to_temp_file(profile) + profile_L_fname = core.array_to_temp_file(profile_L) + profile_R_fname = core.array_to_temp_file(profile_R) indices_fname = core.array_to_temp_file(indices) + indices_L_fname = core.array_to_temp_file(indices_L) + indices_R_fname = core.array_to_temp_file(indices_R) - return profile_fname, indices_fname + return ( + profile_fname, + profile_L_fname, + profile_R_fname, + indices_fname, + indices_L_fname, + indices_R_fname, + ) @core.non_normalized(gpu_aamp) def gpu_stump( - T_A, m, T_B=None, ignore_trivial=True, device_id=0, normalize=True, p=2.0 + T_A, m, T_B=None, ignore_trivial=True, device_id=0, normalize=True, p=2.0, k=1 ): """ Compute the z-normalized matrix profile with one or more GPU devices @@ -417,13 +594,24 @@ def gpu_stump( The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Returns ------- out : numpy.ndarray - The first column consists of the matrix profile, the second column - consists of the matrix profile indices, the third column consists of - the left matrix profile indices, and the fourth column consists of - the right matrix profile indices. 
+ When k = 1 (default), the first column consists of the matrix profile, + the second column consists of the matrix profile indices, the third column + consists of the left matrix profile indices, and the fourth column consists + of the right matrix profile indices. However, when k > 1, the output array + will contain exactly 2 * k + 2 columns. The first k columns (i.e., out[:, :k]) + consists of the top-k matrix profile, the next set of k columns + (i.e., out[:, k:2k]) consists of the corresponding top-k matrix profile + indices, and the last two columns (i.e., out[:, 2k] and out[:, 2k+1] or, + equivalently, out[:, -2] and out[:, -1]) correspond to the top-1 left + matrix profile indices and the top-1 right matrix profile indices, respectively. See Also -------- @@ -505,7 +693,7 @@ def gpu_stump( logger.warning("Try setting `ignore_trivial = False`.") n = T_B.shape[0] - k = T_A.shape[0] - m + 1 + w = T_A.shape[0] - m + 1 l = n - m + 1 excl_zone = int( np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM) @@ -518,8 +706,6 @@ def gpu_stump( μ_Q_fname = core.array_to_temp_file(μ_Q) σ_Q_fname = core.array_to_temp_file(σ_Q) - out = np.empty((k, 4), dtype=object) - if isinstance(device_id, int): device_ids = [device_id] else: @@ -528,6 +714,12 @@ def gpu_stump( profile = [None] * len(device_ids) indices = [None] * len(device_ids) + profile_L = [None] * len(device_ids) + indices_L = [None] * len(device_ids) + + profile_R = [None] * len(device_ids) + indices_R = [None] * len(device_ids) + for _id in device_ids: with cuda.gpus[_id]: if ( @@ -571,16 +763,24 @@ def gpu_stump( QT_first_fname, μ_Q_fname, σ_Q_fname, - k, + w, ignore_trivial, start + 1, device_ids[idx], + k, ), ) else: # Execute last chunk in parent process # Only parent process is executed when a single GPU is requested - profile[idx], indices[idx] = _gpu_stump( + ( + profile[idx], + profile_L[idx], + profile_R[idx], + indices[idx], + indices_L[idx], + indices_R[idx], + ) = _gpu_stump( T_A_fname, T_B_fname, m, @@ -592,10 +792,11 @@ def gpu_stump( QT_first_fname, μ_Q_fname, σ_Q_fname, - k, + w, ignore_trivial, start + 1, device_ids[idx], + k, ) # Clean up process pool for multi-GPU request @@ -606,7 +807,14 @@ def gpu_stump( # Collect results from spawned child processes if they exist for idx, result in enumerate(results): if result is not None: - profile[idx], indices[idx] = result.get() + ( + profile[idx], + profile_L[idx], + profile_R[idx], + indices[idx], + indices_L[idx], + indices_R[idx], + ) = result.get() os.remove(T_A_fname) os.remove(T_B_fname) @@ -621,22 +829,44 @@ def gpu_stump( for idx in range(len(device_ids)): profile_fname = profile[idx] + profile_L_fname = profile_L[idx] + profile_R_fname = profile_R[idx] indices_fname = indices[idx] + indices_L_fname = indices_L[idx] + indices_R_fname = indices_R[idx] + profile[idx] = np.load(profile_fname, allow_pickle=False) + profile_L[idx] = np.load(profile_L_fname, allow_pickle=False) + profile_R[idx] = np.load(profile_R_fname, allow_pickle=False) indices[idx] = np.load(indices_fname, allow_pickle=False) + indices_L[idx] = np.load(indices_L_fname, allow_pickle=False) + indices_R[idx] = np.load(indices_R_fname, allow_pickle=False) + os.remove(profile_fname) + os.remove(profile_L_fname) + os.remove(profile_R_fname) os.remove(indices_fname) - - for i in range(1, len(device_ids)): - # Update all matrix profiles and matrix profile indices - # (global, left, right) and store in profile[0] and indices[0] - for col in range(profile[0].shape[1]): # pragma: no cover - cond = profile[0][:, col] < 
profile[i][:, col] - profile[0][:, col] = np.where(cond, profile[0][:, col], profile[i][:, col]) - indices[0][:, col] = np.where(cond, indices[0][:, col], indices[i][:, col]) - - out[:, 0] = profile[0][:, 0] - out[:, 1:4] = indices[0][:, :] + os.remove(indices_L_fname) + os.remove(indices_R_fname) + + for i in range(1, len(device_ids)): # pragma: no cover + # Update (top-k) matrix profile and matrix profile indices + core._merge_topk_PI(profile[0], profile[i], indices[0], indices[i]) + + # Update (top-1) left matrix profile and matrix profile indices + mask = profile_L[0] < profile_L[i] + profile_L[0][mask] = profile_L[i][mask] + indices_L[0][mask] = indices_L[i][mask] + + # Update (top-1) right matrix profile and matrix profile indices + mask = profile_R[0] < profile_R[i] + profile_R[0][mask] = profile_R[i][mask] + indices_R[0][mask] = indices_R[i][mask] + + out = np.empty((w, 2 * k + 2), dtype=object) # last two columns are to store + # (top-1) left/right matrix profile indices + out[:, :k] = profile[0] + out[:, k:] = np.column_stack((indices[0], indices_L[0], indices_R[0])) core._check_P(out[:, 0]) diff --git a/stumpy/scraamp.py b/stumpy/scraamp.py index 0f0fc41f2..e81c267c3 100644 --- a/stumpy/scraamp.py +++ b/stumpy/scraamp.py @@ -14,6 +14,79 @@ logger = logging.getLogger(__name__) +def _preprocess_prescraamp(T_A, m, T_B=None, s=None): + """ + Performs several preprocessings and returns outputs that are needed for the + non-normalized preSCRIMP algorithm. + + Parameters + ---------- + T_A : numpy.ndarray + The time series or sequence for which to compute the matrix profile + + m : int + Window size + + T_B : numpy.ndarray, default None + The time series or sequence that will be used to annotate T_A. For every + subsequence in T_A, its nearest neighbor in T_B will be recorded. + + s : int, default None + The sampling interval that defaults to + `int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))` + + Returns + ------- + T_A : numpy.ndarray + A copy of the time series input `T_A`, where all NaN and inf values + are replaced with zero. + + T_B : numpy.ndarray + A copy of the time series input `T_B`, where all NaN and inf values + are replaced with zero. If the input `T_B` is not provided (default), + this array is just a copy of `T_A`. 
+ + T_A_subseq_isfinite : numpy.ndarray + A boolean array that indicates whether a subsequence in `T_A` contains a + `np.nan`/`np.inf` value (False) + + T_B_subseq_isfinite : numpy.ndarray + A boolean array that indicates whether a subsequence in `T_B` contains a + `np.nan`/`np.inf` value (False) + + indices : numpy.ndarray + The subsequence indices to compute `prescrump` for + + s : int + The sampling interval that defaults to + `int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))` + + excl_zone : int + The half width for the exclusion zone + """ + if T_B is None: + T_B = T_A + excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) + else: + excl_zone = None + + T_A, T_A_subseq_isfinite = core.preprocess_non_normalized(T_A, m) + T_B, T_B_subseq_isfinite = core.preprocess_non_normalized(T_B, m) + + n_A = T_A.shape[0] + l = n_A - m + 1 + + if s is None: # pragma: no cover + if excl_zone is not None: # self-join + s = excl_zone + else: # AB-join + s = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) + + indices = np.random.permutation(range(0, l, s)).astype(np.int64) + + return (T_A, T_B, T_A_subseq_isfinite, T_B_subseq_isfinite, indices, s, excl_zone) + + @njit(fastmath=True) def _compute_PI( T_A, @@ -272,7 +345,8 @@ def _prescraamp( return np.power(P_NORM[0], 1.0 / p), I[0] -def prescraamp(T_A, m, T_B=None, s=None, p=2.0): +def prescraamp(T_A, m, T_B=None, s=None, p=2.0, k=1): + # this function should be modified so that it can return top-k matrix profile """ A convenience wrapper around the Numba JIT-compiled parallelized `_prescraamp` function which computes the approximate matrix profile according to the @@ -297,6 +371,11 @@ def prescraamp(T_A, m, T_B=None, s=None, p=2.0): p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Returns ------- P : numpy.ndarray @@ -312,25 +391,16 @@ def prescraamp(T_A, m, T_B=None, s=None, p=2.0): See Algorithm 2 """ - if T_B is None: - T_B = T_A - excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - else: - excl_zone = None - - T_A, T_A_subseq_isfinite = core.preprocess_non_normalized(T_A, m) - T_B, T_B_subseq_isfinite = core.preprocess_non_normalized(T_B, m) - - n_A = T_A.shape[0] - l = n_A - m + 1 - - if s is None: # pragma: no cover - if excl_zone is not None: # self-join - s = excl_zone - else: # AB-join - s = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) + ( + T_A, + T_B, + T_A_subseq_isfinite, + T_B_subseq_isfinite, + indices, + s, + excl_zone, + ) = _preprocess_prescraamp(T_A, m, T_B=T_B, s=s) - indices = np.random.permutation(range(0, l, s)).astype(np.int64) P, I = _prescraamp( T_A, T_B, @@ -387,6 +457,11 @@ class scraamp: p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Attributes ---------- P_ : numpy.ndarray @@ -422,6 +497,7 @@ def __init__( pre_scraamp=False, s=None, p=2.0, + k=1, # this function needs to be modified for top-k ): """ Initialize the `scraamp` object @@ -458,6 +534,11 @@ def __init__( p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. + + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix + profile. 
Note that this will increase the total computational time and
+ memory usage when k > 1.
 """
 self._ignore_trivial = ignore_trivial
 self._p = p
@@ -520,9 +601,38 @@ def __init__(
 if pre_scraamp:
 if self._ignore_trivial:
- P, I = prescraamp(T_A, m, s=s, p=p)
+ (
+ T_A,
+ T_B,
+ T_A_subseq_isfinite,
+ T_B_subseq_isfinite,
+ indices,
+ s,
+ excl_zone,
+ ) = _preprocess_prescraamp(T_A, m, s=s)
 else:
- P, I = prescraamp(T_A, m, T_B=T_B, s=s, p=p)
+ (
+ T_A,
+ T_B,
+ T_A_subseq_isfinite,
+ T_B_subseq_isfinite,
+ indices,
+ s,
+ excl_zone,
+ ) = _preprocess_prescraamp(T_A, m, T_B=T_B, s=s)
+
+ P, I = _prescraamp(
+ T_A,
+ T_B,
+ m,
+ T_A_subseq_isfinite,
+ T_B_subseq_isfinite,
+ p,
+ indices,
+ s,
+ excl_zone,
+ )
+
 for i in range(P.shape[0]):
 if self._P[i, 0] > P[i]:
 self._P[i, 0] = P[i]
diff --git a/stumpy/scrump.py b/stumpy/scrump.py
index 3ec55ef7c..f874966e2 100644
--- a/stumpy/scrump.py
+++ b/stumpy/scrump.py
@@ -14,6 +14,83 @@
 logger = logging.getLogger(__name__)
+def _preprocess_prescrump(T_A, m, T_B=None, s=None):
+ """
+ Performs several preprocessing steps and returns outputs that are needed for
+ the prescrump algorithm.
+
+ Parameters
+ ----------
+ T_A : numpy.ndarray
+ The time series or sequence for which to compute the matrix profile
+
+ m : int
+ Window size
+
+ T_B : numpy.ndarray, default None
+ The time series or sequence that will be used to annotate T_A. For every
+ subsequence in T_A, its nearest neighbor in T_B will be recorded.
+
+ s : int, default None
+ The sampling interval that defaults to
+ `int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))`
+
+ Returns
+ -------
+ T_A : numpy.ndarray
+ A copy of the time series input `T_A`, where all NaN and inf values
+ are replaced with zero.
+
+ T_B : numpy.ndarray
+ A copy of the time series input `T_B`, where all NaN and inf values
+ are replaced with zero. If the input `T_B` is not provided (default),
+ this array is just a copy of `T_A`.
+
+ μ_Q : numpy.ndarray
+ Sliding window mean for `T_A`
+
+ σ_Q : numpy.ndarray
+ Sliding window standard deviation for `T_A`
+
+ M_T : numpy.ndarray
+ Sliding window mean for `T_B`
+
+ Σ_T : numpy.ndarray
+ Sliding window standard deviation for `T_B`
+
+ indices : numpy.ndarray
+ The subsequence indices to compute `prescrump` for
+
+ s : int
+ The sampling interval that defaults to
+ `int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))`
+
+ excl_zone : int
+ The half width for the exclusion zone
+ """
+ if T_B is None:
+ T_B = T_A
+ excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))
+ else:
+ excl_zone = None
+
+ T_A, μ_Q, σ_Q = core.preprocess(T_A, m)
+ T_B, M_T, Σ_T = core.preprocess(T_B, m)
+
+ n_A = T_A.shape[0]
+ l = n_A - m + 1
+
+ if s is None: # pragma: no cover
+ if excl_zone is not None: # self-join
+ s = excl_zone
+ else: # AB-join
+ s = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))
+
+ indices = np.random.permutation(range(0, l, s)).astype(np.int64)
+
+ return (T_A, T_B, μ_Q, σ_Q, M_T, Σ_T, indices, s, excl_zone)
+
+
 @njit(fastmath=True)
 def _compute_PI(
 T_A,
@@ -31,10 +108,11 @@ def _compute_PI(
 P_squared,
 I,
 excl_zone=None,
+ k=1,
 ):
 """
- Compute (Numba JIT-compiled) and update the squared matrix profile distance
- and matrix profile indces according to the preSCRIMP algorithm
+ Compute (Numba JIT-compiled) and update the squared (top-k) matrix profile
+ distance and matrix profile indices according to the preSCRIMP algorithm.
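+
+ (Each row of `P_squared` and its companion `I` is kept sorted in ascending
+ order of distance via `core._shift_insert_at_index`; see `core.py` above.)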
Parameters ---------- @@ -49,16 +127,16 @@ def _compute_PI( Window size μ_Q : numpy.ndarray - Sliding window mean for T_A + Sliding window mean for `T_A` σ_Q : numpy.ndarray - Sliding window standard deviation for T_A + Sliding window standard deviation for `T_A` M_T : numpy.ndarray - Sliding window mean for T_B + Sliding window mean for `T_B` Σ_T : numpy.ndarray - Sliding window standard deviation for T_B + Sliding window standard deviation for `T_B` indices : numpy.ndarray The subsequence indices to compute `prescrump` for @@ -77,14 +155,19 @@ def _compute_PI( `int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))` P_squared : numpy.ndarray - The squared matrix profile + The squared (top-k) matrix profile I : numpy.ndarray - The matrix profile indices + The (top-k) matrix profile indices excl_zone : int The half width for the exclusion zone relative to the `i`. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Returns ------- None @@ -103,67 +186,139 @@ def _compute_PI( for i in indices[start:stop]: Q = T_A[i : i + m] QT[:] = core._sliding_dot_product(Q, T_B) - # Update P[i] relative to all T[j : j + m] squared_distance_profile[:] = core._mass(Q, T_B, QT, μ_Q[i], σ_Q[i], M_T, Σ_T) squared_distance_profile[:] = np.square(squared_distance_profile) if excl_zone is not None: - zone_start = max(0, i - excl_zone) - zone_stop = min(w, i + excl_zone) - squared_distance_profile[zone_start : zone_stop + 1] = np.inf - - # only for self-join - mask = squared_distance_profile < P_squared[thread_idx] - P_squared[thread_idx][mask] = squared_distance_profile[mask] - I[thread_idx][mask] = i - - I[thread_idx, i] = np.argmin(squared_distance_profile) - P_squared[thread_idx, i] = squared_distance_profile[I[thread_idx, i]] - if P_squared[thread_idx, i] == np.inf: # pragma: no cover - I[thread_idx, i] = -1 - else: - j = I[thread_idx, i] - # Given the squared distance, work backwards and compute QT - QT_j = (m - P_squared[thread_idx, i] / 2.0) * (Σ_T[j] * σ_Q[i]) + ( - m * M_T[j] * μ_Q[i] + core._apply_exclusion_zone(squared_distance_profile, i, excl_zone, np.inf) + + nn_i = np.argmin(squared_distance_profile) + if ( + squared_distance_profile[nn_i] < P_squared[thread_idx, i, -1] + and nn_i not in I[thread_idx, i] + ): + idx = np.searchsorted( + P_squared[thread_idx, i], + squared_distance_profile[nn_i], + side="right", + ) + core._shift_insert_at_index( + P_squared[thread_idx, i], idx, squared_distance_profile[nn_i] + ) + core._shift_insert_at_index(I[thread_idx, i], idx, nn_i) + + if P_squared[thread_idx, i, 0] == np.inf: # pragma: no cover + I[thread_idx, i, 0] = -1 + continue + + j = nn_i + # Given the squared distance, work backwards and compute QT + QT_j = (m - P_squared[thread_idx, i, 0] / 2.0) * (Σ_T[j] * σ_Q[i]) + ( + m * M_T[j] * μ_Q[i] + ) + QT_j_prime = QT_j + # Update top-k for both subsequences `S[i+g] = T[i+g:i+g+m]`` and + # `S[j+g] = T[j+g:j+g+m]` (i.e., the right neighbors of `T[i : i+m]` and + # `T[j:j+m]`) by using the distance between `S[i+g]` and `S[j+g]` + for g in range(1, min(s, l - i, w - j)): + QT_j = ( + QT_j + - T_B[j + g - 1] * T_A[i + g - 1] + + T_B[j + g + m - 1] * T_A[i + g + m - 1] + ) + D_squared = core._calculate_squared_distance( + m, + QT_j, + M_T[j + g], + Σ_T[j + g], + μ_Q[i + g], + σ_Q[i + g], ) - QT_j_prime = QT_j - for k in range(1, min(s, l - i, w - j)): - QT_j = ( - QT_j - - T_B[j + k - 1] * T_A[i + k - 1] - + T_B[j + k + m - 1] 
* T_A[i + k + m - 1] + if ( + D_squared < P_squared[thread_idx, i + g, -1] + and (j + g) not in I[thread_idx, i + g] + ): + idx = np.searchsorted( + P_squared[thread_idx, i + g], D_squared, side="right" ) - D_squared = core._calculate_squared_distance( - m, - QT_j, - M_T[j + k], - Σ_T[j + k], - μ_Q[i + k], - σ_Q[i + k], + core._shift_insert_at_index( + P_squared[thread_idx, i + g], idx, D_squared ) - if D_squared < P_squared[thread_idx, i + k]: - P_squared[thread_idx, i + k] = D_squared - I[thread_idx, i + k] = j + k - if excl_zone is not None and D_squared < P_squared[thread_idx, j + k]: - P_squared[thread_idx, j + k] = D_squared - I[thread_idx, j + k] = i + k - QT_j = QT_j_prime - for k in range(1, min(s, i + 1, j + 1)): - QT_j = QT_j - T_B[j - k + m] * T_A[i - k + m] + T_B[j - k] * T_A[i - k] - D_squared = core._calculate_squared_distance( - m, - QT_j, - M_T[j - k], - Σ_T[j - k], - μ_Q[i - k], - σ_Q[i - k], + core._shift_insert_at_index(I[thread_idx, i + g], idx, j + g) + + if ( + excl_zone is not None + and D_squared < P_squared[thread_idx, j + g, -1] + and (i + g) not in I[thread_idx, j + g] + ): + idx = np.searchsorted( + P_squared[thread_idx, j + g], D_squared, side="right" ) - if D_squared < P_squared[thread_idx, i - k]: - P_squared[thread_idx, i - k] = D_squared - I[thread_idx, i - k] = j - k - if excl_zone is not None and D_squared < P_squared[thread_idx, j - k]: - P_squared[thread_idx, j - k] = D_squared - I[thread_idx, j - k] = i - k + core._shift_insert_at_index( + P_squared[thread_idx, j + g], idx, D_squared + ) + core._shift_insert_at_index(I[thread_idx, j + g], idx, i + g) + + QT_j = QT_j_prime + # Update top-k for both subsequences `S[i-g] = T[i-g:i-g+m]` and + # `S[j-g] = T[j-g:j-g+m]` (i.e., the left neighbors of `T[i : i+m]` and + # `T[j:j+m]`) by using the distance between `S[i-g]` and `S[j-g]` + for g in range(1, min(s, i + 1, j + 1)): + QT_j = QT_j - T_B[j - g + m] * T_A[i - g + m] + T_B[j - g] * T_A[i - g] + D_squared = core._calculate_squared_distance( + m, + QT_j, + M_T[j - g], + Σ_T[j - g], + μ_Q[i - g], + σ_Q[i - g], + ) + if ( + D_squared < P_squared[thread_idx, i - g, -1] + and (j - g) not in I[thread_idx, i - g] + ): + idx = np.searchsorted( + P_squared[thread_idx, i - g], D_squared, side="right" + ) + core._shift_insert_at_index( + P_squared[thread_idx, i - g], idx, D_squared + ) + core._shift_insert_at_index(I[thread_idx, i - g], idx, j - g) + + if ( + excl_zone is not None + and D_squared < P_squared[thread_idx, j - g, -1] + and (i - g) not in I[thread_idx, j - g] + ): + idx = np.searchsorted( + P_squared[thread_idx, j - g], D_squared, side="right" + ) + core._shift_insert_at_index( + P_squared[thread_idx, j - g], idx, D_squared + ) + core._shift_insert_at_index(I[thread_idx, j - g], idx, i - g) + + # In the case of a self-join, the calculated distance profile can also be + # used to refine the top-k for all non-trivial subsequences + if excl_zone is not None: + # Note that the squared distance, `squared_distance_profile[j]`, + # between subsequences `S_i = T[i : i + m]` and `S_j = T[j : j + m]` + # can be used to update the top-k for BOTH subsequence `i` and + # subsequence `j`. We update the latter here. 
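+ # `side="right"` in the `np.searchsorted` call below places a new
+ # distance after any existing, equal distances, so earlier-recorded
+ # neighbors keep their priority within each sorted row.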
+ + indices = np.flatnonzero( + squared_distance_profile < P_squared[thread_idx, :, -1] + ) + for j in indices: + if i not in I[thread_idx, j]: + idx = np.searchsorted( + P_squared[thread_idx, j], + squared_distance_profile[j], + side="right", + ) + core._shift_insert_at_index( + P_squared[thread_idx, j], idx, squared_distance_profile[j] + ) + core._shift_insert_at_index(I[thread_idx, j], idx, i) @njit( @@ -183,6 +338,7 @@ def _prescrump( indices, s, excl_zone=None, + k=1, ): """ A Numba JIT-compiled implementation of the preSCRIMP algorithm. @@ -200,16 +356,17 @@ def _prescrump( Window size μ_Q : numpy.ndarray - Sliding window mean for T_A + Sliding window mean for `T_A` σ_Q : numpy.ndarray - Sliding window standard deviation for T_A + Sliding window standard deviation for `T_A` M_T : numpy.ndarray - Sliding window mean for T_B + Sliding window mean for `T_B` Σ_T : numpy.ndarray - Sliding window standard deviation for T_B + Sliding window standard deviation for `T_B` + indices : numpy.ndarray The subsequence indices to compute `prescrump` for @@ -231,13 +388,23 @@ def _prescrump( excl_zone : int The half width for the exclusion zone relative to the `i`. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Returns ------- out1 : numpy.ndarray - Matrix profile + The (top-k) matrix profile. When k=1 (default), the first (and only) column + in this 2D array consists of the matrix profile. When k > 1, the output + has exactly `k` columns consisting of the top-k matrix profile. out2 : numpy.ndarray - Matrix profile indices + The (top-k) matrix profile indices. When k=1 (default), the first (and only) + column in this 2D array consists of the matrix profile indices. When k > 1, + the output has exactly `k` columns consisting of the top-k matrix profile + indices. Notes ----- @@ -248,8 +415,8 @@ def _prescrump( """ n_threads = numba.config.NUMBA_NUM_THREADS l = T_A.shape[0] - m + 1 - P_squared = np.full((n_threads, l), np.inf, dtype=np.float64) - I = np.full((n_threads, l), -1, dtype=np.int64) + P_squared = np.full((n_threads, l, k), np.inf, dtype=np.float64) + I = np.full((n_threads, l, k), -1, dtype=np.int64) idx_ranges = core._get_ranges(len(indices), n_threads, truncate=False) for thread_idx in prange(n_threads): @@ -269,23 +436,21 @@ def _prescrump( P_squared, I, excl_zone, + k, ) for thread_idx in range(1, n_threads): - for i in range(l): - if P_squared[thread_idx, i] < P_squared[0, i]: - P_squared[0, i] = P_squared[thread_idx, i] - I[0, i] = I[thread_idx, i] + core._merge_topk_PI(P_squared[0], P_squared[thread_idx], I[0], I[thread_idx]) return np.sqrt(P_squared[0]), I[0] @core.non_normalized(scraamp.prescraamp) -def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0): +def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0, k=1): """ - A convenience wrapper around the Numba JIT-compiled parallelized `_prescrump` - function which computes the approximate matrix profile according to the preSCRIMP - algorithm + A convenience wrapper around the Numba JIT-compiled parallelized + `_prescrump` function which computes the approximate (top-k) matrix + profile according to the preSCRIMP algorithm. Parameters ---------- @@ -312,13 +477,23 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0): The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. 
+ k : int, default 1
+ The number of top `k` smallest distances used to construct the matrix profile.
+ Note that this will increase the total computational time and memory usage
+ when k > 1.
+
 Returns
 -------
 P : numpy.ndarray
- Matrix profile
+ The (top-k) matrix profile. When k = 1 (default), this is a 1D array
+ consisting of the matrix profile. When k > 1, the output is a 2D array that
+ has exactly `k` columns consisting of the top-k matrix profile.
 I : numpy.ndarray
- Matrix profile indices
+ The (top-k) matrix profile indices. When k = 1 (default), this is a 1D array
+ consisting of the matrix profile indices. When k > 1, the output is a 2D
+ array that has exactly `k` columns consisting of the top-k matrix profile
+ indices.
 Notes
 -----
@@ -327,25 +502,10 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0):
 See Algorithm 2
 """
- if T_B is None:
- T_B = T_A
- excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))
- else:
- excl_zone = None
-
- T_A, μ_Q, σ_Q = core.preprocess(T_A, m)
- T_B, M_T, Σ_T = core.preprocess(T_B, m)
-
- n_A = T_A.shape[0]
- l = n_A - m + 1
-
- if s is None: # pragma: no cover
- if excl_zone is not None: # self-join
- s = excl_zone
- else: # AB-join
- s = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))
+ T_A, T_B, μ_Q, σ_Q, M_T, Σ_T, indices, s, excl_zone = _preprocess_prescrump(
+ T_A, m, T_B=T_B, s=s
+ )
- indices = np.random.permutation(range(0, l, s)).astype(np.int64)
 P, I = _prescrump(
 T_A,
 T_B,
@@ -357,9 +517,13 @@ def prescrump(T_A, m, T_B=None, s=None, normalize=True, p=2.0):
 indices,
 s,
 excl_zone,
+ k,
 )
- return P, I
+ if k == 1:
+ return P.flatten().astype(np.float64), I.flatten().astype(np.int64)
+ else:
+ return P, I
@core.non_normalized(
 scraamp.prescraamp
)
@@ -413,22 +577,40 @@ class scrump:
 The p-norm to apply for computing the Minkowski distance. This parameter
 is ignored when `normalize == True`.
+ k : int, default 1
+ The number of top `k` smallest distances used to construct the matrix profile.
+ Note that this will increase the total computational time and memory usage
+ when k > 1.
+
 Attributes
 ----------
 P_ : numpy.ndarray
- The updated matrix profile
+ The updated (top-k) matrix profile. When `k=1` (default), this output is
+ a 1D array consisting of the matrix profile. When `k > 1`, the output
+ is a 2D array that has exactly `k` columns consisting of the top-k matrix
+ profile.
 I_ : numpy.ndarray
- The updated matrix profile indices
+ The updated (top-k) matrix profile indices. When `k=1` (default), this output is
+ a 1D array consisting of the matrix profile indices. When `k > 1`, the output
+ is a 2D array that has exactly `k` columns consisting of the top-k matrix
+ profile indices.
+
+ left_I_ : numpy.ndarray
+ The updated left (top-1) matrix profile indices
+
+ right_I_ : numpy.ndarray
+ The updated right (top-1) matrix profile indices
+
 Methods
 -------
 update()
 Update the matrix profile and the matrix profile indices by computing
 additional new distances (limited by `percentage`) that make up the full
- distance matrix. Each output contains three columns that correspond to
- the matrix profile, the left matrix profile, and the right matrix profile,
- respectively.
+ distance matrix. It updates the (top-k) matrix profile, (top-1) left
+ matrix profile, (top-1) right matrix profile, (top-k) matrix profile indices,
+ (top-1) left matrix profile indices, and (top-1) right matrix profile indices.
See Also -------- @@ -476,6 +658,7 @@ def __init__( s=None, normalize=True, p=2.0, + k=1, ): """ Initialize the `scrump` object @@ -518,6 +701,11 @@ def __init__( p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. + + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix + profile. Note that this will increase the total computational time and + memory usage when k > 1. """ self._ignore_trivial = ignore_trivial @@ -574,11 +762,15 @@ def __init__( self._n_A = self._T_A.shape[0] self._n_B = self._T_B.shape[0] self._l = self._n_A - self._m + 1 + self._k = k - self._P = np.empty((self._l, 3), dtype=np.float64) - self._I = np.empty((self._l, 3), dtype=np.int64) - self._P[:, :] = np.inf - self._I[:, :] = -1 + self._P = np.full((self._l, self._k), np.inf, dtype=np.float64) + self._PL = np.full(self._l, np.inf, dtype=np.float64) + self._PR = np.full(self._l, np.inf, dtype=np.float64) + + self._I = np.full((self._l, self._k), -1, dtype=np.int64) + self._IL = np.full(self._l, -1, dtype=np.int64) + self._IR = np.full(self._l, -1, dtype=np.int64) self._excl_zone = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM)) if s is None: @@ -589,13 +781,32 @@ def __init__( if pre_scrump: if self._ignore_trivial: - P, I = prescrump(T_A, m, s=s) + ( + T_A, + T_B, + μ_Q, + σ_Q, + M_T, + Σ_T, + indices, + s, + excl_zone, + ) = _preprocess_prescrump(T_A, m, s=s) else: - P, I = prescrump(T_A, m, T_B=T_B, s=s) - for i in range(P.shape[0]): - if self._P[i, 0] > P[i]: - self._P[i, 0] = P[i] - self._I[i, 0] = I[i] + ( + T_A, + T_B, + μ_Q, + σ_Q, + M_T, + Σ_T, + indices, + s, + excl_zone, + ) = _preprocess_prescrump(T_A, m, T_B=T_B, s=s) + + P, I = _prescrump(T_A, T_B, m, μ_Q, σ_Q, M_T, Σ_T, indices, s, excl_zone, k) + core._merge_topk_PI(self._P, P, self._I, I) if self._ignore_trivial: self._diags = np.random.permutation( @@ -626,14 +837,14 @@ def __init__( def update(self): """ - Update the matrix profile and the matrix profile indices by computing - additional new distances (limited by `percentage`) that make up the full - distance matrix. + Update the (top-k) matrix profile and the (top-k) matrix profile indices by + computing additional new distances (limited by `percentage`) that make up + the full distance matrix. 
""" if self._chunk_idx < self._n_chunks: start_idx, stop_idx = self._chunk_diags_ranges[self._chunk_idx] - P, I = _stump( + P, PL, PR, I, IL, IR = _stump( self._T_A, self._T_B, self._m, @@ -649,48 +860,60 @@ def update(self): self._T_B_subseq_isconstant, self._diags[start_idx:stop_idx], self._ignore_trivial, + self._k, ) - # Update matrix profile and indices - for i in range(self._P.shape[0]): - if self._P[i, 0] > P[i, 0]: - self._P[i, 0] = P[i, 0] - self._I[i, 0] = I[i, 0] - # left matrix profile and left matrix profile indices - if self._P[i, 1] > P[i, 1]: - self._P[i, 1] = P[i, 1] - self._I[i, 1] = I[i, 1] - # right matrix profile and right matrix profile indices - if self._P[i, 2] > P[i, 2]: - self._P[i, 2] = P[i, 2] - self._I[i, 2] = I[i, 2] + # Update (top-k) matrix profile and indices + core._merge_topk_PI(self._P, P, self._I, I) + + # update left matrix profile and indices + mask = PL < self._PL + self._PL[mask] = PL[mask] + self._IL[mask] = IL[mask] + + # update right matrix profile and indices + mask = PR < self._PR + self._PR[mask] = PR[mask] + self._IR[mask] = IR[mask] self._chunk_idx += 1 @property def P_(self): """ - Get the updated matrix profile + Get the updated (top-k) matrix profile. When `k=1` (default), this output + is a 1D array consisting of the updated matrix profile. When `k > 1`, the + output is a 2D array that has exactly `k` columns consisting of the updated + top-k matrix profile. """ - return self._P[:, 0].astype(np.float64) + if self._k == 1: + return self._P.flatten().astype(np.float64) + else: + return self._P.astype(np.float64) @property def I_(self): """ - Get the updated matrix profile indices + Get the updated (top-k) matrix profile indices. When `k=1` (default), this + output is a 1D array consisting of the updated matrix profile indices. When + `k > 1`, the output is a 2D array that has exactly `k` columns consisting + of the updated top-k matrix profile indices. """ - return self._I[:, 0].astype(np.int64) + if self._k == 1: + return self._I.flatten().astype(np.int64) + else: + return self._I.astype(np.int64) @property def left_I_(self): """ - Get the updated left matrix profile indices + Get the updated left (top-1) matrix profile indices """ - return self._I[:, 1].astype(np.int64) + return self._IL.astype(np.int64) @property def right_I_(self): """ - Get the updated right matrix profile indices + Get the updated right (top-1) matrix profile indices """ - return self._I[:, 2].astype(np.int64) + return self._IR.astype(np.int64) diff --git a/stumpy/stimp.py b/stumpy/stimp.py index 1c285f116..c377a69aa 100644 --- a/stumpy/stimp.py +++ b/stumpy/stimp.py @@ -214,6 +214,7 @@ def update(self): ignore_trivial=True, percentage=self._percentage, pre_scrump=self._pre_scrump, + k=1, ) approx.update() self._PAN[ diff --git a/stumpy/stump.py b/stumpy/stump.py index fed857649..49657370a 100644 --- a/stumpy/stump.py +++ b/stumpy/stump.py @@ -40,13 +40,18 @@ def _compute_diagonal( diags_stop_idx, thread_idx, ρ, + ρL, + ρR, I, + IL, + IR, ignore_trivial, + k, ): """ - Compute (Numba JIT-compiled) and update the Pearson correlation, ρ, and I - sequentially along individual diagonals using a single thread and avoiding race - conditions + Compute (Numba JIT-compiled) and update the (top-k) Pearson correlation (ρ), + ρL, ρR, I, IL, and IR sequentially along individual diagonals using a single + thread and avoiding race conditions. 
Parameters ---------- @@ -116,15 +121,32 @@ def _compute_diagonal( The thread index ρ : numpy.ndarray - The Pearson correlations + The (top-k) Pearson correlations, sorted in ascending order per row + + ρL : numpy.ndarray + The top-1 left Pearson correlations + + ρR : numpy.ndarray + The top-1 right Pearson correlations I : numpy.ndarray - The matrix profile indices + The (top-k) matrix profile indices + + IL : numpy.ndarray + The top-1 left matrix profile indices + + IR : numpy.ndarray + The top-1 right matrix profile indices ignore_trivial : bool Set to `True` if this is a self-join. Otherwise, for AB-join, set this to `False`. Default is `True`. + k : int + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Returns ------- None @@ -155,16 +177,16 @@ def _compute_diagonal( uint64_m = np.uint64(m) for diag_idx in range(diags_start_idx, diags_stop_idx): - k = diags[diag_idx] + g = diags[diag_idx] - if k >= 0: - iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - k)) + if g >= 0: + iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - g)) else: - iter_range = range(-k, min(n_A - m + 1, n_B - m + 1 - k)) + iter_range = range(-g, min(n_A - m + 1, n_B - m + 1 - g)) for i in iter_range: uint64_i = np.uint64(i) - uint64_j = np.uint64(i + k) + uint64_j = np.uint64(i + g) if uint64_i == 0 or uint64_j == 0: cov = ( @@ -196,25 +218,39 @@ def _compute_diagonal( if T_B_subseq_isconstant[uint64_j] and T_A_subseq_isconstant[uint64_i]: pearson = 1.0 + # `ρ[thread_idx, i, :]` is sorted ascendingly and MUST be updated + # when the newly-calculated `pearson` value becomes greater than the + # first (i.e. smallest) element in this array. Note that a higher + # pearson value corresponds to a lower distance. if pearson > ρ[thread_idx, uint64_i, 0]: - ρ[thread_idx, uint64_i, 0] = pearson - I[thread_idx, uint64_i, 0] = uint64_j + idx = np.searchsorted(ρ[thread_idx, uint64_i], pearson) + core._shift_insert_at_index( + ρ[thread_idx, uint64_i], idx, pearson, shift="left" + ) + core._shift_insert_at_index( + I[thread_idx, uint64_i], idx, uint64_j, shift="left" + ) if ignore_trivial: # self-joins only if pearson > ρ[thread_idx, uint64_j, 0]: - ρ[thread_idx, uint64_j, 0] = pearson - I[thread_idx, uint64_j, 0] = uint64_i + idx = np.searchsorted(ρ[thread_idx, uint64_j], pearson) + core._shift_insert_at_index( + ρ[thread_idx, uint64_j], idx, pearson, shift="left" + ) + core._shift_insert_at_index( + I[thread_idx, uint64_j], idx, uint64_i, shift="left" + ) if uint64_i < uint64_j: # left pearson correlation and left matrix profile index - if pearson > ρ[thread_idx, uint64_j, 1]: - ρ[thread_idx, uint64_j, 1] = pearson - I[thread_idx, uint64_j, 1] = uint64_i + if pearson > ρL[thread_idx, uint64_j]: + ρL[thread_idx, uint64_j] = pearson + IL[thread_idx, uint64_j] = uint64_i # right pearson correlation and right matrix profile index - if pearson > ρ[thread_idx, uint64_i, 2]: - ρ[thread_idx, uint64_i, 2] = pearson - I[thread_idx, uint64_i, 2] = uint64_j + if pearson > ρR[thread_idx, uint64_i]: + ρR[thread_idx, uint64_i] = pearson + IR[thread_idx, uint64_i] = uint64_j return @@ -241,11 +277,13 @@ def _stump( T_B_subseq_isconstant, diags, ignore_trivial, + k, ): """ A Numba JIT-compiled version of STOMPopt with Pearson correlations for parallel - computation of the matrix profile, matrix profile indices, left matrix profile - indices, and right matrix profile indices. 
+ computation of the (top-k) matrix profile, the (top-k) matrix profile indices, + the top-1 left matrix profile and its matrix profile index, and the top-1 right + matrix profile and its matrix profile index. Parameters ---------- @@ -300,15 +338,30 @@ def _stump( Set to `True` if this is a self-join. Otherwise, for AB-join, set this to `False`. Default is `True`. + k : int + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Returns ------- profile : numpy.ndarray - Matrix profile + The (top-k) matrix profile indices : numpy.ndarray - The first column consists of the matrix profile indices, the second - column consists of the left matrix profile indices, and the third - column consists of the right matrix profile indices. + The (top-k) matrix profile indices + + left profile : numpy.ndarray + The (top-1) left matrix profile + + left indices : numpy.ndarray + The (top-1) left matrix profile indices + + right profile : numpy.ndarray + The (top-1) right matrix profile + + right indices : numpy.ndarray + The (top-1) right matrix profile indices Notes ----- @@ -359,8 +412,15 @@ def _stump( n_B = T_B.shape[0] l = n_A - m + 1 n_threads = numba.config.NUMBA_NUM_THREADS - ρ = np.full((n_threads, l, 3), -np.inf, dtype=np.float64) - I = np.full((n_threads, l, 3), -1, dtype=np.int64) + + ρ = np.full((n_threads, l, k), np.NINF, dtype=np.float64) + I = np.full((n_threads, l, k), -1, dtype=np.int64) + + ρL = np.full((n_threads, l), np.NINF, dtype=np.float64) + IL = np.full((n_threads, l), -1, dtype=np.int64) + + ρR = np.full((n_threads, l), np.NINF, dtype=np.float64) + IR = np.full((n_threads, l), -1, dtype=np.int64) ndist_counts = core._count_diagonal_ndist(diags, m, n_A, n_B) diags_ranges = core._get_array_ranges(ndist_counts, n_threads, False) @@ -383,7 +443,8 @@ def _stump( cov_d[:] = cov_d - μ_Q_m_1 for thread_idx in prange(n_threads): - # Compute and update cov, I within a single thread to avoiding race conditions + # Compute and update pearson correlations and matrix profile indices + # within a single thread and avoiding race conditions _compute_diagonal( T_A, T_B, @@ -405,47 +466,64 @@ def _stump( diags_ranges[thread_idx, 1], thread_idx, ρ, + ρL, + ρR, I, + IL, + IR, ignore_trivial, + k, ) # Reduction of results from all threads for thread_idx in range(1, n_threads): - for i in prange(l): - if ρ[0, i, 0] < ρ[thread_idx, i, 0]: - ρ[0, i, 0] = ρ[thread_idx, i, 0] - I[0, i, 0] = I[thread_idx, i, 0] - # left pearson correlation and left matrix profile indices - if ρ[0, i, 1] < ρ[thread_idx, i, 1]: - ρ[0, i, 1] = ρ[thread_idx, i, 1] - I[0, i, 1] = I[thread_idx, i, 1] - # right pearson correlation and right matrix profile indices - if ρ[0, i, 2] < ρ[thread_idx, i, 2]: - ρ[0, i, 2] = ρ[thread_idx, i, 2] - I[0, i, 2] = I[thread_idx, i, 2] - - # Convert pearson correlations to distances - p_norm = np.abs(2 * m * (1 - ρ[0, :, :])) + # update top-k arrays + core._merge_topk_ρI(ρ[0], ρ[thread_idx], I[0], I[thread_idx]) + + # update left matrix profile and matrix profile indices + mask = ρL[0] < ρL[thread_idx] + ρL[0][mask] = ρL[thread_idx][mask] + IL[0][mask] = IL[thread_idx][mask] + + # update right matrix profile and matrix profile indices + mask = ρR[0] < ρR[thread_idx] + ρR[0][mask] = ρR[thread_idx][mask] + IR[0][mask] = IR[thread_idx][mask] + + # Reverse top-k rho (and its associated I) to be in descending order and + # then convert from Pearson correlations to Euclidean distances 
(ascending order)
+    p_norm = np.abs(2 * m * (1 - ρ[0, :, ::-1]))
+    I = I[0, :, ::-1]
+
+    p_norm_L = np.abs(2 * m * (1 - ρL[0, :]))
+    p_norm_R = np.abs(2 * m * (1 - ρR[0, :]))
+
     for i in prange(p_norm.shape[0]):
-        if p_norm[i, 0] < config.STUMPY_P_NORM_THRESHOLD:
-            p_norm[i, 0] = 0.0
-        if p_norm[i, 1] < config.STUMPY_P_NORM_THRESHOLD:
-            p_norm[i, 1] = 0.0
-        if p_norm[i, 2] < config.STUMPY_P_NORM_THRESHOLD:
-            p_norm[i, 2] = 0.0
+        for j in range(p_norm.shape[1]):
+            if p_norm[i, j] < config.STUMPY_P_NORM_THRESHOLD:
+                p_norm[i, j] = 0.0
+
+        if p_norm_L[i] < config.STUMPY_P_NORM_THRESHOLD:
+            p_norm_L[i] = 0.0
+
+        if p_norm_R[i] < config.STUMPY_P_NORM_THRESHOLD:
+            p_norm_R[i] = 0.0
 
+    P = np.sqrt(p_norm)
+    PL = np.sqrt(p_norm_L)
+    PR = np.sqrt(p_norm_R)
 
-    return P[:, :], I[0, :, :]
+    return P, PL, PR, I, IL[0], IR[0]
 
 
 @core.non_normalized(aamp)
-def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0):
+def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1):
     """
     Compute the z-normalized matrix profile
 
     This is a convenience wrapper around the Numba JIT-compiled parallelized
-    `_stump` function which computes the matrix profile according to STOMPopt with
-    Pearson correlations.
+    `_stump` function which computes the (top-k) matrix profile according to
+    STOMPopt with Pearson correlations.
 
     Parameters
     ----------
@@ -473,13 +551,25 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0):
         The p-norm to apply for computing the Minkowski distance. This parameter is
         ignored when `normalize == True`.
 
+    k : int, default 1
+        The number of top `k` smallest distances used to construct the matrix profile.
+        Note that this will increase the total computational time and memory usage
+        when k > 1. If you have access to a GPU device, then you may be able to
+        leverage `gpu_stump` for better performance and scalability.
+
     Returns
     -------
     out : numpy.ndarray
-        The first column consists of the matrix profile, the second column
-        consists of the matrix profile indices, the third column consists of
-        the left matrix profile indices, and the fourth column consists of
-        the right matrix profile indices.
+        When k = 1 (default), the first column consists of the matrix profile,
+        the second column consists of the matrix profile indices, the third column
+        consists of the left matrix profile indices, and the fourth column consists
+        of the right matrix profile indices. However, when k > 1, the output array
+        will contain exactly 2 * k + 2 columns. The first k columns (i.e., out[:, :k])
+        consist of the top-k matrix profile, the next set of k columns
+        (i.e., out[:, k:2k]) consists of the corresponding top-k matrix profile
+        indices, and the last two columns (i.e., out[:, 2k] and out[:, 2k+1] or,
+        equivalently, out[:, -2] and out[:, -1]) correspond to the top-1 left
+        matrix profile indices and the top-1 right matrix profile indices, respectively.
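To make the column layout above concrete, here is a minimal usage sketch (the input series, window size, and `k` value are made up purely for illustration):

import numpy as np
import stumpy

T = np.random.rand(1_000)
m, k = 50, 3  # hypothetical window size and number of neighbors

out = stumpy.stump(T, m, k=k)  # shape: (len(T) - m + 1, 2 * k + 2)
P = out[:, :k].astype(np.float64)       # top-k matrix profile, sorted per row
I = out[:, k : 2 * k].astype(np.int64)  # top-k matrix profile indices
left_I = out[:, -2].astype(np.int64)    # top-1 left matrix profile indices
right_I = out[:, -1].astype(np.int64)   # top-1 right matrix profile indices

The `astype` calls reflect the fact that `out` mixes float distances and integer indices in a single `dtype=object` array.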
 See Also
 --------
@@ -593,14 +683,13 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0):
     l = n_A - m + 1
 
     excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM))
-    out = np.empty((l, 4), dtype=object)
 
     if ignore_trivial:
         diags = np.arange(excl_zone + 1, n_A - m + 1, dtype=np.int64)
     else:
         diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1, dtype=np.int64)
 
-    P, I = _stump(
+    P, PL, PR, I, IL, IR = _stump(
         T_A,
         T_B,
         m,
@@ -616,10 +705,13 @@ def stump(T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0):
         T_B_subseq_isconstant,
         diags,
         ignore_trivial,
+        k,
     )
 
-    out[:, 0] = P[:, 0]
-    out[:, 1:] = I
+    # The last two columns store the left and right matrix profile indices
+    out = np.empty((l, 2 * k + 2), dtype=object)
+    out[:, :k] = P
+    out[:, k:] = np.column_stack((I, IL, IR))
 
     core._check_P(out[:, 0])
 
diff --git a/stumpy/stumped.py b/stumpy/stumped.py
index 696d49080..0b95c5b3a 100644
--- a/stumpy/stumped.py
+++ b/stumpy/stumped.py
@@ -14,12 +14,14 @@
 
 @core.non_normalized(aamped)
-def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0):
+def stumped(
+    dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, p=2.0, k=1
+):
     """
-    Compute the z-normalized matrix profile with a distributed dask cluster
+    Compute the z-normalized (top-k) matrix profile with a distributed dask cluster
 
     This is a highly distributed implementation around the Numba JIT-compiled
-    parallelized `_stump` function which computes the matrix profile according
+    parallelized `_stump` function which computes the (top-k) matrix profile according
     to STOMPopt with Pearson correlations.
 
     Parameters
     ----------
@@ -54,13 +56,25 @@ def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True,
         The p-norm to apply for computing the Minkowski distance. This parameter is
         ignored when `normalize == True`.
 
+    k : int, default 1
+        The number of top `k` smallest distances used to construct the matrix profile.
+        Note that this will increase the total computational time and memory usage
+        when k > 1. If you have access to a GPU device, then you may be able to
+        leverage `gpu_stump` for better performance and scalability.
+
     Returns
     -------
     out : numpy.ndarray
-        The first column consists of the matrix profile, the second column
-        consists of the matrix profile indices, the third column consists of
-        the left matrix profile indices, and the fourth column consists of
-        the right matrix profile indices.
+        When k = 1 (default), the first column consists of the matrix profile,
+        the second column consists of the matrix profile indices, the third column
+        consists of the left matrix profile indices, and the fourth column consists
+        of the right matrix profile indices. However, when k > 1, the output array
+        will contain exactly 2 * k + 2 columns. The first k columns (i.e., out[:, :k])
+        consist of the top-k matrix profile, the next set of k columns
+        (i.e., out[:, k:2k]) consists of the corresponding top-k matrix profile
+        indices, and the last two columns (i.e., out[:, 2k] and out[:, 2k+1] or,
+        equivalently, out[:, -2] and out[:, -1]) correspond to the top-1 left
+        matrix profile indices and the top-1 right matrix profile indices, respectively.
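The distributed call follows the same `2 * k + 2` column convention; a minimal sketch (assuming a throwaway local Dask cluster, spun up here only for illustration):

import numpy as np
from dask.distributed import Client
import stumpy

if __name__ == "__main__":
    with Client() as dask_client:  # starts a local cluster; any Dask client works
        T = np.random.rand(10_000)
        out = stumpy.stumped(dask_client, T, m=50, k=3)
        P, I = out[:, :3], out[:, 3:6]            # top-k profile and indices
        left_I, right_I = out[:, -2], out[:, -1]  # top-1 left/right indices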
See Also -------- @@ -183,7 +197,6 @@ def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, l = n_A - m + 1 excl_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - out = np.empty((l, 4), dtype=object) hosts = list(dask_client.ncores().keys()) nworkers = len(hosts) @@ -248,20 +261,31 @@ def stumped(dask_client, T_A, m, T_B=None, ignore_trivial=True, normalize=True, T_B_subseq_isconstant_future, diags_futures[i], ignore_trivial, + k, ) ) results = dask_client.gather(futures) - profile, indices = results[0] + profile, profile_L, profile_R, indices, indices_L, indices_R = results[0] + for i in range(1, len(hosts)): - P, I = results[i] - for col in range(P.shape[1]): # pragma: no cover - cond = P[:, col] < profile[:, col] - profile[:, col] = np.where(cond, P[:, col], profile[:, col]) - indices[:, col] = np.where(cond, I[:, col], indices[:, col]) - - out[:, 0] = profile[:, 0] - out[:, 1:4] = indices + P, PL, PR, I, IL, IR = results[i] + # Update top-k matrix profile and matrix profile indices + core._merge_topk_PI(profile, P, indices, I) + + # Update top-1 left matrix profile and matrix profile index + mask = PL < profile_L + profile_L[mask] = PL[mask] + indices_L[mask] = IL[mask] + + # Update top-1 right matrix profile and matrix profile index + mask = PR < profile_R + profile_R[mask] = PR[mask] + indices_R[mask] = IR[mask] + + out = np.empty((l, 2 * k + 2), dtype=object) + out[:, :k] = profile + out[:, k:] = np.column_stack((indices, indices_L, indices_R)) core._check_P(out[:, 0]) diff --git a/stumpy/stumpi.py b/stumpy/stumpi.py index a3318eaa0..82fac2da8 100644 --- a/stumpy/stumpi.py +++ b/stumpy/stumpi.py @@ -36,19 +36,30 @@ class stumpi: The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix profile. + Note that this will increase the total computational time and memory usage + when k > 1. + Attributes ---------- P_ : numpy.ndarray - The updated matrix profile for `T` + The updated (top-k) matrix profile for `T`. When `k=1` (default), the first + (and only) column in this 2D array consists of the matrix profile. When + `k > 1`, the output has exactly `k` columns consisting of the top-k matrix + profile. I_ : numpy.ndarray - The updated matrix profile indices for `T` + The updated (top-k) matrix profile indices for `T`. When `k=1` (default), + the first (and only) column in this 2D array consists of the matrix profile + indices. When `k > 1`, the output has exactly `k` columns consisting of the + top-k matrix profile indices. left_P_ : numpy.ndarray - The updated left matrix profile for `T` + The updated left (top-1) matrix profile for `T` left_I_ : numpy.ndarray - The updated left matrix profile indices for `T` + The updated left (top-1) matrix profile indices for `T` T_ : numpy.ndarray The updated time series or sequence for which the matrix profile and matrix @@ -81,7 +92,7 @@ class stumpi: array([-1, 0, 1, 2]) """ - def __init__(self, T, m, egress=True, normalize=True, p=2.0): + def __init__(self, T, m, egress=True, normalize=True, p=2.0, k=1): """ Initialize the `stumpi` object @@ -106,34 +117,41 @@ def __init__(self, T, m, egress=True, normalize=True, p=2.0): p : float, default 2.0 The p-norm to apply for computing the Minkowski distance. This parameter is ignored when `normalize == True`. + + k : int, default 1 + The number of top `k` smallest distances used to construct the matrix + profile. 
Note that this will increase the total computational time and + memory usage when `k > 1`. """ self._T = core._preprocess(T) core.check_window_size(m, max_size=self._T.shape[-1]) self._m = m + self._k = k + self._n = self._T.shape[0] self._excl_zone = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM)) self._T_isfinite = np.isfinite(self._T) self._egress = egress - mp = stump(self._T, self._m) - self._P = mp[:, 0].astype(np.float64) - self._I = mp[:, 1].astype(np.int64) - self._left_I = mp[:, 2].astype(np.int64) - self._left_P = np.empty(self._P.shape, dtype=np.float64) - self._left_P[:] = np.inf + mp = stump(self._T, self._m, k=self._k) + self._P = mp[:, :k].astype(np.float64) + self._I = mp[:, k : 2 * k].astype(np.int64) + + self._left_I = mp[:, 2 * k].astype(np.int64) + self._left_P = np.full_like(self._left_I, np.inf, dtype=np.float64) self._T, self._M_T, self._Σ_T = core.preprocess(self._T, self._m) # Retrieve the left matrix profile values - # Since each matrix profile value is the minimum between the left and right - # matrix profile values, we can save time by re-computing only the left matrix - # profile value when the matrix profile index is equal to the right matrix - # profile index. - mask = self._left_I == self._I - self._left_P[mask] = self._P[mask] + # Since each (top-1) matrix profile value is the minimum between the left + # and right matrix profile values, we can save time by re-computing only + # the left matrix profile value when the (top-1) matrix profile index is + # equal to the right matrix profile index. + mask = self._left_I == self._I[:, 0] + self._left_P[mask] = self._P[mask, 0] # Only re-compute the `i`-th left matrix profile value, `self._left_P[i]`, - # when `self._I[i] != self._left_I[i]` + # when `self._left_I[i] != self._I[i, 0]` for i in np.flatnonzero(self._left_I >= 0 & ~mask): j = self._left_I[i] QT = np.dot(self._T[i : i + self._m], self._T[j : j + self._m]) @@ -156,7 +174,7 @@ def __init__(self, T, m, egress=True, normalize=True, p=2.0): def update(self, t): """ Append a single new data point, `t`, to the existing time series `T` and update - the matrix profile and matrix profile indices. + the (top-k) matrix profile and matrix profile indices. 
     Parameters
     ----------
@@ -179,8 +197,8 @@ def update(self, t):
 
     def _update_egress(self, t):
         """
-        Ingress a new data point, egress the oldest data point, and update the matrix
-        profile and matrix profile indices
+        Ingress a new data point, egress the oldest data point, and update the (top-k)
+        matrix profile and matrix profile indices
         """
         self._n = self._T.shape[0]
         l = self._n - self._m + 1 - 1  # Subtract 1 due to egress
@@ -229,28 +247,37 @@ def _update_egress(self, t):
 
         core.apply_exclusion_zone(D, D.shape[0] - 1, self._excl_zone, np.inf)
 
-        update_idx = np.argwhere(D < self._P).flatten()
-        self._I[update_idx] = D.shape[0] + self._n_appended - 1  # D.shape[0] is base-1
-        self._P[update_idx] = D[update_idx]
-
-        I_last = np.argmin(D)
-
-        if np.isinf(D[I_last]):
-            self._I[-1] = -1
-            self._P[-1] = np.inf
-        else:
-            self._I[-1] = I_last + self._n_appended
-            self._P[-1] = D[I_last]
-
-        self._left_I[-1] = I_last + self._n_appended
-        self._left_P[-1] = D[I_last]
+        update_idx = np.argwhere(D < self._P[:, -1]).flatten()
+        for i in update_idx:
+            idx = np.searchsorted(self._P[i], D[i], side="right")
+            core._shift_insert_at_index(self._P[i], idx, D[i])
+            core._shift_insert_at_index(
+                self._I[i], idx, D.shape[0] + self._n_appended - 1
+            )
+            # D.shape[0] is base-1
+
+        # Calculate the (top-k) matrix profile values/indices for the last subsequence
+        # by using its corresponding distance profile `D`
+        self._P[-1] = np.inf
+        self._I[-1] = -1
+        for i, d in enumerate(D):
+            if d < self._P[-1, -1]:
+                idx = np.searchsorted(self._P[-1], d, side="right")
+                core._shift_insert_at_index(self._P[-1], idx, d)
+                core._shift_insert_at_index(self._I[-1], idx, i + self._n_appended)
+
+        # All neighbors of the last subsequence are on its left. So, its (top-1)
+        # matrix profile value/index and its left matrix profile value/index must
+        # be equal.
+        self._left_P[-1] = self._P[-1, 0]
+        self._left_I[-1] = self._I[-1, 0]
 
         self._QT[:] = self._QT_new
 
     def _update(self, t):
         """
-        Ingress a new data point and update the matrix profile and matrix profile
-        indices without egressing the oldest data point
+        Ingress a new data point and update the (top-k) matrix profile and matrix
+        profile indices without egressing the oldest data point
         """
         n = self._T.shape[0]
         l = n - self._m + 1
@@ -287,25 +314,30 @@ def _update(self, t):
 
         core.apply_exclusion_zone(D, D.shape[0] - 1, self._excl_zone, np.inf)
 
-        update_idx = np.argwhere(D[:l] < self._P[:l]).flatten()
-        self._I[update_idx] = l
-        self._P[update_idx] = D[update_idx]
-
-        I_last = np.argmin(D)
-        if np.isinf(D[I_last]):
-            I_new = np.append(self._I, -1)
-            P_new = np.append(self._P, np.inf)
-        else:
-            I_new = np.append(self._I, I_last)
-            P_new = np.append(self._P, D[I_last])
-        left_I_new = np.append(self._left_I, I_last)
-        left_P_new = np.append(self._left_P, D[I_last])
+        update_idx = np.argwhere(D[:l] < self._P[:l, -1]).flatten()
+        for i in update_idx:
+            idx = np.searchsorted(self._P[i], D[i], side="right")
+            core._shift_insert_at_index(self._P[i], idx, D[i])
+            core._shift_insert_at_index(self._I[i], idx, l)
+
+        # Calculate the top-k matrix profile and the (top-1) left matrix profile
+        # (and their corresponding indices) for the new subsequence whose distance
+        # profile is `D`
+        P_new = np.full(self._k, np.inf, dtype=np.float64)
+        I_new = np.full(self._k, -1, dtype=np.int64)
+        for i, d in enumerate(D):
+            if d < P_new[-1]:  # maximum value in sorted array P_new
+                idx = np.searchsorted(P_new, d, side="right")
+                core._shift_insert_at_index(P_new, idx, d)
+                core._shift_insert_at_index(I_new, idx, i)
+
+        left_I_new = I_new[0]
+        left_P_new = P_new[0]
 
         self._T = T_new
-        self._P = P_new
-        self._I = I_new
-        self._left_I = left_I_new
-        self._left_P = left_P_new
+        self._P = np.append(self._P, P_new.reshape(1, -1), axis=0)
+        self._I = np.append(self._I, I_new.reshape(1, -1), axis=0)
+        self._left_P = np.append(self._left_P, left_P_new)
+        self._left_I = np.append(self._left_I, left_I_new)
         self._QT = QT_new
         self._M_T = M_T_new
         self._Σ_T = Σ_T_new
@@ -313,28 +345,40 @@ def _update(self, t):
     @property
     def P_(self):
         """
-        Get the matrix profile
+        Get the (top-k) matrix profile. When `k=1` (default), the output is
+        a 1D array consisting of the matrix profile. When `k > 1`, the
+        output is a 2D array that has exactly `k` columns consisting of the
+        top-k matrix profile.
         """
-        return self._P.astype(np.float64)
+        if self._k == 1:
+            return self._P.flatten().astype(np.float64)
+        else:
+            return self._P.astype(np.float64)
 
     @property
     def I_(self):
         """
-        Get the matrix profile indices
+        Get the (top-k) matrix profile indices. When `k=1` (default), the output
+        is a 1D array consisting of the matrix profile indices. When `k > 1`, the
+        output is a 2D array that has exactly `k` columns consisting of the
+        top-k matrix profile indices.
         """
-        return self._I.astype(np.int64)
+        if self._k == 1:
+            return self._I.flatten().astype(np.int64)
+        else:
+            return self._I.astype(np.int64)
 
     @property
     def left_P_(self):
         """
-        Get the left matrix profile
+        Get the (top-1) left matrix profile
         """
         return self._left_P.astype(np.float64)
 
     @property
     def left_I_(self):
         """
-        Get the left matrix profile indices
+        Get the (top-1) left matrix profile indices
         """
         return self._left_I.astype(np.int64)
diff --git a/test.sh b/test.sh
index bf7b4c263..78f0d8f84 100755
--- a/test.sh
+++ b/test.sh
@@ -34,7 +34,7 @@ done
 
 check_errs()
 {
   # Function.
Parameter 1 is the return code - if [[ $1 -ne "0" ]]; then + if [[ $1 -ne "0" && $1 -ne "5" ]]; then echo "Error: pytest encountered exit code $1" # as a bonus, make our script exit with the right error code. exit $1 @@ -121,6 +121,7 @@ test_unit() { echo "Testing Numba JIT Compiled Functions" pytest -rsx -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_gpu_stump.py + check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_core.py check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_config.py @@ -136,6 +137,7 @@ test_unit() pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_ostinato.py check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_gpu_ostinato.py + check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_mpdist.py check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_motifs.py @@ -143,13 +145,16 @@ test_unit() pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_mmotifs.py check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_gpu_mpdist.py + check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_snippets.py check_errs $? pytest -rsx -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_gpu_stimp.py + check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_stimp.py check_errs $? # aamp pytest -rsx -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_gpu_aamp.py + check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamp.py tests/test_maamp.py tests/test_scraamp.py tests/test_aampi.py check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_scraamp.py @@ -161,6 +166,7 @@ test_unit() pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamp_ostinato.py check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_gpu_aamp_ostinato.py + check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aampdist.py check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamp_motifs.py @@ -168,9 +174,11 @@ test_unit() pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamp_mmotifs.py check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_gpu_aampdist.py + check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aampdist_snippets.py check_errs $? pytest -rsx -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_gpu_aamp_stimp.py + check_errs $? pytest -x -W ignore::RuntimeWarning -W ignore::DeprecationWarning tests/test_aamp_stimp.py check_errs $? 
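The `check_errs` change above is needed because the GPU test files collect zero tests on CUDA-less CI machines, and pytest signals "no tests collected" with exit code 5 rather than 0. A one-line sanity check of that convention (illustrative only):

import pytest

# pytest.ExitCode.NO_TESTS_COLLECTED is an IntEnum member equal to 5,
# which is why test.sh now treats both 0 and 5 as success
assert pytest.ExitCode.NO_TESTS_COLLECTED == 5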
pytest -x -W ignore::DeprecationWarning tests/test_non_normalized_decorator.py diff --git a/tests/naive.py b/tests/naive.py index cc4b0cc80..a58069bba 100644 --- a/tests/naive.py +++ b/tests/naive.py @@ -6,7 +6,7 @@ from stumpy import core, config -def z_norm(a, axis=0, threshold=1e-7): +def z_norm(a, axis=0, threshold=config.STUMPY_STDDEV_THRESHOLD): std = np.std(a, axis, keepdims=True) std[np.less(std, threshold, where=~np.isnan(std))] = 1.0 @@ -156,11 +156,22 @@ def stamp(T_A, m, T_B=None, exclusion_zone=None): # pragma: no cover return result -def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False): +def searchsorted_right(a, v): """ - Traverse distance matrix diagonally and update the matrix profile and - matrix profile indices if the parameter `row_wise` is set to `False`. - If the parameter `row_wise` is set to `True`, it is a row-wise traversal. + Naive version of numpy.searchsorted(..., side='right') + """ + indices = np.flatnonzero(v < a) + if len(indices): + return indices.min() + else: # pragma: no cover + return len(a) + + +def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False, k=1): + """ + Traverse distance matrix diagonally and update the top-k matrix profile and + matrix profile indices if the parameter `row_wise` is set to `False`. If the + parameter `row_wise` is set to `True`, it is a row-wise traversal. """ if T_B is None: # self-join: ignore_trivial = True @@ -182,35 +193,35 @@ def stump(T_A, m, T_B=None, exclusion_zone=None, row_wise=False): if exclusion_zone is None: exclusion_zone = int(np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)) - P = np.full((l, 3), np.inf) - I = np.full((l, 3), -1, dtype=np.int64) + P = np.full((l, k + 2), np.inf, dtype=np.float64) + I = np.full((l, k + 2), -1, dtype=np.int64) # two more columns are to store + # ... 
left and right top-1 matrix profile indices
 
     if row_wise:  # row-wise traversal in distance matrix
         if ignore_trivial:  # self-join
             for i in range(l):
                 apply_exclusion_zone(distance_matrix[i], i, exclusion_zone, np.inf)
 
-        for i, D in enumerate(distance_matrix):
+        for i, D in enumerate(distance_matrix):  # D: distance profile
             # self-join / AB-join: matrix profile and indices
-            idx = np.argmin(D)
-            P[i, 0] = D[idx]
-            if P[i, 0] == np.inf:
-                idx = -1
-            I[i, 0] = idx
+            indices = np.argsort(D)[:k]
+            P[i, :k] = D[indices]
+            indices[P[i, :k] == np.inf] = -1
+            I[i, :k] = indices
 
-            # self-join: left matrix profile
+            # self-join: left matrix profile index (top-1)
             if ignore_trivial and i > 0:
                 IL = np.argmin(D[:i])
                 if D[IL] == np.inf:
                     IL = -1
-                I[i, 1] = IL
+                I[i, k] = IL
 
-            # self-join: right matrix profile
+            # self-join: right matrix profile index (top-1)
             if ignore_trivial and i < D.shape[0]:
-                IR = i + np.argmin(D[i:])  # shift argmin by `i` to get true index
+                IR = i + np.argmin(D[i:])  # offset by `i` to get true index
                 if D[IR] == np.inf:
                     IR = -1
-                I[i, 2] = IR
+                I[i, k + 1] = IR
     else:
         # diagonal traversal
         if ignore_trivial:
@@ -218,37 +229,40 @@
         else:
             diags = np.arange(-(n_A - m + 1) + 1, n_B - m + 1)
 
-        for k in diags:
-            if k >= 0:
-                iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - k))
+        for g in diags:
+            if g >= 0:
+                iter_range = range(0, min(n_A - m + 1, n_B - m + 1 - g))
             else:
-                iter_range = range(-k, min(n_A - m + 1, n_B - m + 1 - k))
+                iter_range = range(-g, min(n_A - m + 1, n_B - m + 1 - g))
 
             for i in iter_range:
-                D = distance_matrix[i, i + k]
-                if D < P[i, 0]:
-                    P[i, 0] = D
-                    I[i, 0] = i + k
+                d = distance_matrix[i, i + g]
+                if d < P[i, k - 1]:
+                    idx = searchsorted_right(P[i], d)
+                    # To keep the top-k, we must discard the last element.
+ P[i, :k] = np.insert(P[i, :k], idx, d)[:-1] + I[i, :k] = np.insert(I[i, :k], idx, i + g)[:-1] if ignore_trivial: # Self-joins only - if D < P[i + k, 0]: - P[i + k, 0] = D - I[i + k, 0] = i + if d < P[i + g, k - 1]: + idx = searchsorted_right(P[i + g], d) + P[i + g, :k] = np.insert(P[i + g, :k], idx, d)[:-1] + I[i + g, :k] = np.insert(I[i + g, :k], idx, i)[:-1] - if i < i + k: + if i < i + g: # Left matrix profile and left matrix profile index - if D < P[i + k, 1]: - P[i + k, 1] = D - I[i + k, 1] = i + if d < P[i + g, k]: + P[i + g, k] = d + I[i + g, k] = i - if D < P[i, 2]: + if d < P[i, k + 1]: # right matrix profile and right matrix profile index - P[i, 2] = D - I[i, 2] = i + k + P[i, k + 1] = d + I[i, k + 1] = i + g - result = np.empty((l, 4), dtype=object) - result[:, 0] = P[:, 0] - result[:, 1:4] = I[:, :] + result = np.empty((l, 2 * k + 2), dtype=object) + result[:, :k] = P[:, :k] + result[:, k:] = I[:, :] return result @@ -705,11 +719,13 @@ def __init__(self, T, m, excl_zone=None, p=2.0): self._T_isfinite = np.isfinite(self._T) self._m = m self._p = p - if excl_zone is None: + + self._excl_zone = excl_zone + if self._excl_zone is None: self._excl_zone = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM)) self._l = self._T.shape[0] - m + 1 - mp = aamp(T, m, p=p) + mp = aamp(T, m, exclusion_zone=self._excl_zone, p=p) self.P_ = mp[:, 0] self.I_ = mp[:, 1].astype(np.int64) self.left_P_ = np.full(self.P_.shape, np.inf) @@ -772,24 +788,31 @@ def update(self, t): class stumpi_egress(object): - def __init__(self, T, m, excl_zone=None): + def __init__(self, T, m, excl_zone=None, k=1): self._T = np.asarray(T) self._T = self._T.copy() self._T_isfinite = np.isfinite(self._T) self._m = m - if excl_zone is None: + self._k = k + + self._excl_zone = excl_zone + if self._excl_zone is None: self._excl_zone = int(np.ceil(self._m / config.STUMPY_EXCL_ZONE_DENOM)) self._l = self._T.shape[0] - m + 1 - mp = stump(T, m) - self.P_ = mp[:, 0] - self.I_ = mp[:, 1].astype(np.int64) - self.left_P_ = np.full(self.P_.shape, np.inf) - self.left_I_ = mp[:, 2].astype(np.int64) - for i, j in enumerate(self.left_I_): - if j >= 0: - D = core.mass(self._T[i : i + self._m], self._T[j : j + self._m]) - self.left_P_[i] = D[0] + mp = stump(T, m, exclusion_zone=self._excl_zone, k=self._k) + self._P = mp[:, :k].astype(np.float64) + self._I = mp[:, k : 2 * k].astype(np.int64) + + self._left_I = mp[:, 2 * k].astype(np.int64) + self._left_P = np.full_like(self._left_I, np.inf, dtype=np.float64) + + for idx, nn_idx in enumerate(self._left_I): + if nn_idx >= 0: + D = distance_profile( + self._T[idx : idx + self._m], self._T[nn_idx : nn_idx + self._m], m + ) + self._left_P[idx] = D[0] self._n_appended = 0 @@ -804,10 +827,10 @@ def update(self, t): self._T[-1] = 0 self._n_appended += 1 - self.P_[:] = np.roll(self.P_, -1) - self.I_[:] = np.roll(self.I_, -1) - self.left_P_[:] = np.roll(self.left_P_, -1) - self.left_I_[:] = np.roll(self.left_I_, -1) + self._P = np.roll(self._P, -1, axis=0) + self._I = np.roll(self._I, -1, axis=0) + self._left_P[:] = np.roll(self._left_P, -1) + self._left_I[:] = np.roll(self._left_I, -1) D = core.mass(self._T[-self._m :], self._T) T_subseq_isfinite = np.all( @@ -818,22 +841,47 @@ def update(self, t): D[:] = np.inf apply_exclusion_zone(D, D.shape[0] - 1, self._excl_zone, np.inf) + # update top-k matrix profile using newly calculated distance profile `D` for j in range(D.shape[0]): - if D[j] < self.P_[j]: - self.I_[j] = D.shape[0] - 1 + self._n_appended - self.P_[j] = D[j] - - I_last = np.argmin(D) + 
if D[j] < self._P[j, -1]: + pos = np.searchsorted(self._P[j], D[j], side="right") + self._P[j] = np.insert(self._P[j], pos, D[j])[:-1] + self._I[j] = np.insert( + self._I[j], pos, D.shape[0] - 1 + self._n_appended + )[:-1] + + # update top-k for the last, newly-updated index + I_last_topk = np.argsort(D, kind="mergesort")[: self._k] + self._P[-1] = D[I_last_topk] + self._I[-1] = I_last_topk + self._n_appended + self._I[-1][self._P[-1] == np.inf] = -1 + + # for the last index, the left matrix profile value is self.P_[-1, 0] + # and the same goes for the left matrix profile index + self._left_P[-1] = self._P[-1, 0] + self._left_I[-1] = self._I[-1, 0] + + @property + def P_(self): + if self._k == 1: + return self._P.flatten().astype(np.float64) + else: + return self._P.astype(np.float64) - if np.isinf(D[I_last]): - self.I_[-1] = -1 - self.P_[-1] = np.inf + @property + def I_(self): + if self._k == 1: + return self._I.flatten().astype(np.int64) else: - self.I_[-1] = I_last + self._n_appended - self.P_[-1] = D[I_last] + return self._I.astype(np.int64) - self.left_I_[-1] = I_last + self._n_appended - self.left_P_[-1] = D[I_last] + @property + def left_P_(self): + return self._left_P.astype(np.float64) + + @property + def left_I_(self): + return self._left_I.astype(np.int64) def across_series_nearest_neighbors(Ts, Ts_idx, subseq_idx, m): @@ -1388,55 +1436,81 @@ def aampdist_snippets( ) -def prescrump(T_A, m, T_B, s, exclusion_zone=None): +def prescrump(T_A, m, T_B, s, exclusion_zone=None, k=1): dist_matrix = distance_matrix(T_A, T_B, m) l = T_A.shape[0] - m + 1 # matrix profile length w = T_B.shape[0] - m + 1 # distance profile length - P = np.empty(l) - I = np.empty(l, dtype=np.int64) - P[:] = np.inf - I[:] = -1 + P = np.full((l, k), np.inf, dtype=np.float64) + I = np.full((l, k), -1, dtype=np.int64) for i in np.random.permutation(range(0, l, s)): distance_profile = dist_matrix[i] if exclusion_zone is not None: apply_exclusion_zone(distance_profile, i, exclusion_zone, np.inf) - # only for self-join - mask = distance_profile < P - P[mask] = distance_profile[mask] - I[mask] = i - - I[i] = np.argmin(distance_profile) - P[i] = distance_profile[I[i]] - if P[i] == np.inf: - I[i] = -1 - else: - j = I[i] - for k in range(1, min(s, l - i, w - j)): - d = dist_matrix[i + k, j + k] - if d < P[i + k]: - P[i + k] = d - I[i + k] = j + k - if exclusion_zone is not None and d < P[j + k]: - P[j + k] = d - I[j + k] = i + k + nn_idx = np.argmin(distance_profile) + if distance_profile[nn_idx] < P[i, -1] and nn_idx not in I[i]: + pos = np.searchsorted(P[i], distance_profile[nn_idx], side="right") + P[i] = np.insert(P[i], pos, distance_profile[nn_idx])[:-1] + I[i] = np.insert(I[i], pos, nn_idx)[:-1] + + if P[i, 0] == np.inf: + I[i, 0] = -1 + continue + + j = nn_idx + for g in range(1, min(s, l - i, w - j)): + d = dist_matrix[i + g, j + g] + # Do NOT optimize the `condition` in the following if statement + # and similar ones in this naive function. This is to ensure + # we are avoiding duplicates in each row of I. 
+ if d < P[i + g, -1] and (j + g) not in I[i + g]: + pos = np.searchsorted(P[i + g], d, side="right") + P[i + g] = np.insert(P[i + g], pos, d)[:-1] + I[i + g] = np.insert(I[i + g], pos, j + g)[:-1] + if ( + exclusion_zone is not None + and d < P[j + g, -1] + and (i + g) not in I[j + g] + ): + pos = np.searchsorted(P[j + g], d, side="right") + P[j + g] = np.insert(P[j + g], pos, d)[:-1] + I[j + g] = np.insert(I[j + g], pos, i + g)[:-1] + + for g in range(1, min(s, i + 1, j + 1)): + d = dist_matrix[i - g, j - g] + if d < P[i - g, -1] and (j - g) not in I[i - g]: + pos = np.searchsorted(P[i - g], d, side="right") + P[i - g] = np.insert(P[i - g], pos, d)[:-1] + I[i - g] = np.insert(I[i - g], pos, j - g)[:-1] + if ( + exclusion_zone is not None + and d < P[j - g, -1] + and (i - g) not in I[j - g] + ): + pos = np.searchsorted(P[j - g], d, side="right") + P[j - g] = np.insert(P[j - g], pos, d)[:-1] + I[j - g] = np.insert(I[j - g], pos, i - g)[:-1] + + # In the case of a self-join, the calculated distance profile can also be + # used to refine the top-k for all non-trivial subsequences + if exclusion_zone is not None: + for idx in np.flatnonzero(distance_profile < P[:, -1]): + if i not in I[idx]: + pos = np.searchsorted(P[idx], distance_profile[idx], side="right") + P[idx] = np.insert(P[idx], pos, distance_profile[idx])[:-1] + I[idx] = np.insert(I[idx], pos, i)[:-1] - for k in range(1, min(s, i + 1, j + 1)): - d = dist_matrix[i - k, j - k] - if d < P[i - k]: - P[i - k] = d - I[i - k] = j - k - if exclusion_zone is not None and d < P[j - k]: - P[j - k] = d - I[j - k] = i - k + if k == 1: + P = P.flatten() + I = I.flatten() return P, I -def scrump(T_A, m, T_B, percentage, exclusion_zone, pre_scrump, s): +def scrump(T_A, m, T_B, percentage, exclusion_zone, pre_scrump, s, k=1): dist_matrix = distance_matrix(T_A, T_B, m) n_A = T_A.shape[0] @@ -1458,44 +1532,48 @@ def scrump(T_A, m, T_B, percentage, exclusion_zone, pre_scrump, s): diags_ranges_start = diags_ranges[0, 0] diags_ranges_stop = diags_ranges[0, 1] - out = np.full((l, 4), np.inf, dtype=object) - out[:, 1:] = -1 - left_P = np.full(l, np.inf, dtype=np.float64) - right_P = np.full(l, np.inf, dtype=np.float64) + P = np.full((l, k), np.inf, dtype=np.float64) # Topk + PL = np.full(l, np.inf, dtype=np.float64) + PR = np.full(l, np.inf, dtype=np.float64) + + I = np.full((l, k), -1, dtype=np.int64) + IL = np.full(l, -1, dtype=np.int64) + IR = np.full(l, -1, dtype=np.int64) for diag_idx in range(diags_ranges_start, diags_ranges_stop): - k = diags[diag_idx] + g = diags[diag_idx] for i in range(n_A - m + 1): for j in range(n_B - m + 1): - if j - i == k: - if dist_matrix[i, j] < out[i, 0]: - out[i, 0] = dist_matrix[i, j] - out[i, 1] = i + k - - if exclusion_zone is not None and dist_matrix[i, j] < out[i + k, 0]: - out[i + k, 0] = dist_matrix[i, j] - out[i + k, 1] = i + if j - i == g: + d = dist_matrix[i, j] + if d < P[i, -1]: # update TopK of P[i] + idx = searchsorted_right(P[i], d) + if (i + g) not in I[i]: + P[i] = np.insert(P[i], idx, d)[:-1] + I[i] = np.insert(I[i], idx, i + g)[:-1] + + if exclusion_zone is not None and d < P[i + g, -1]: + idx = searchsorted_right(P[i + g], d) + if i not in I[i + g]: + P[i + g] = np.insert(P[i + g], idx, d)[:-1] + I[i + g] = np.insert(I[i + g], idx, i)[:-1] # left matrix profile and left matrix profile indices - if ( - exclusion_zone is not None - and i < i + k - and dist_matrix[i, j] < left_P[i + k] - ): - left_P[i + k] = dist_matrix[i, j] - out[i + k, 2] = i + if exclusion_zone is not None and i < i + g and d < PL[i 
+ g]:
+                        PL[i + g] = d
+                        IL[i + g] = i
 
                     # right matrix profile and right matrix profile indices
-                    if (
-                        exclusion_zone is not None
-                        and i + k > i
-                        and dist_matrix[i, j] < right_P[i]
-                    ):
-                        right_P[i] = dist_matrix[i, j]
-                        out[i, 3] = i + k
+                    if exclusion_zone is not None and i + g > i and d < PR[i]:
+                        PR[i] = d
+                        IR[i] = i + g
 
-    return out
+    if k == 1:
+        P = P.flatten()
+        I = I.flatten()
+
+    return P, I, IL, IR
 
 
 def prescraamp(T_A, m, T_B, s, exclusion_zone=None, p=2.0):
@@ -1759,6 +1837,83 @@ def _total_diagonal_ndists(tile_lower_diag, tile_upper_diag, tile_height, tile_w
     return total_ndists
 
 
+def merge_topk_PI(PA, PB, IA, IB):
+    if PA.ndim == 1:
+        for i in range(PA.shape[0]):
+            if PB[i] < PA[i] and IB[i] != IA[i]:
+                PA[i] = PB[i]
+                IA[i] = IB[i]
+        return
+
+    else:
+        k = PA.shape[1]
+        for i in range(PA.shape[0]):
+            _, _, overlap_idx_B = np.intersect1d(IA[i], IB[i], return_indices=True)
+            PB[i, overlap_idx_B] = np.inf
+            IB[i, overlap_idx_B] = -1
+
+        profile = np.column_stack((PA, PB))
+        indices = np.column_stack((IA, IB))
+        IDX = np.argsort(profile, axis=1, kind="mergesort")
+        profile[:, :] = np.take_along_axis(profile, IDX, axis=1)
+        indices[:, :] = np.take_along_axis(indices, IDX, axis=1)
+
+        PA[:, :] = profile[:, :k]
+        IA[:, :] = indices[:, :k]
+
+        return
+
+
+def merge_topk_ρI(ρA, ρB, IA, IB):
+    # This function merges two Pearson profiles `ρA` and `ρB`, and updates `ρA`
+    # and `IA` accordingly. When the inputs are 1D, `ρA[i]` is updated if
+    # `ρA[i] < ρB[i]` and IA[i] != IB[i]. When the inputs are 2D, each row in
+    # `ρA` and `ρB` is sorted ascendingly. We want to keep the top-k largest
+    # values when merging rows `ρA[i]` and `ρB[i]`.
+
+    # In case of ties between `ρA` and `ρB`, the priority is with `ρA`. In case
+    # of ties within `ρA`, the priority is with the element with the greater index.
+    # Example
+    # note: the prime symbol is to distinguish two elements with the same value
+    # ρA = [0, 0', 1], and ρB = [0, 1, 1'].
+    # merging outcome: [1_B, 1'_B, 1_A]
+
+    # Naive Implementation:
+    # keeping the top-k largest values with the aforementioned priority rules is
+    # the same as `merge_topk_PI` but with `ρA` and `ρB` swapped
+
+    # For the same example:
+    # merging `ρB` and `ρA` ascendingly while choosing `ρB` over `ρA` in case of
+    # ties: [0_B, 0_A, 0'_A, 1_B, 1'_B, 1_A], and the second half of this array
+    # is the desirable outcome.
+ if ρA.ndim == 1: + for i in range(ρA.shape[0]): + if ρB[i] > ρA[i] and IB[i] != IA[i]: + ρA[i] = ρB[i] + IA[i] = IB[i] + return + + else: + k = ρA.shape[1] + for i in range(ρA.shape[0]): + _, _, overlap_idx_B = np.intersect1d(IA[i], IB[i], return_indices=True) + ρB[i, overlap_idx_B] = np.NINF + IB[i, overlap_idx_B] = -1 + + profile = np.column_stack((ρB, ρA)) + indices = np.column_stack((IB, IA)) + + idx = np.argsort(profile, axis=1, kind="mergesort") + profile[:, :] = np.take_along_axis(profile, idx, axis=1) + indices[:, :] = np.take_along_axis(indices, idx, axis=1) + + # keep the last k elements (top-k largest values) + ρA[:, :] = profile[:, k:] + IA[:, :] = indices[:, k:] + + return + + def find_matches(D, excl_zone, max_distance, max_matches=None): if max_matches is None: max_matches = len(D) diff --git a/tests/test_core.py b/tests/test_core.py index 675cd9e27..16d2d0fd2 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1061,6 +1061,242 @@ def test_select_P_ABBA_val_inf(): npt.assert_almost_equal(ref, comp) +def test_merge_topk_PI_without_overlap(): + # This is to test function `core._merge_topk_PI(PA, PB, IA, IB)` when there + # is no overlap between row IA[i] and row IB[i]. + n = 50 + for k in range(1, 6): + PA = np.random.rand(n * k).reshape(n, k) + PA[:, :] = np.sort(PA, axis=1) # sorting each row separately + + PB = np.random.rand(n * k).reshape(n, k) + col_idx = np.random.randint(0, k, size=n) + for i in range(n): # creating ties between values of PA and PB + PB[i, col_idx[i]] = np.random.choice(PA[i], size=1, replace=False) + PB[:, :] = np.sort(PB, axis=1) # sorting each row separately + + IA = np.arange(n * k).reshape(n, k) + IB = IA + n * k + + ref_P = PA.copy() + ref_I = IA.copy() + + comp_P = PA.copy() + comp_I = IA.copy() + + naive.merge_topk_PI(ref_P, PB.copy(), ref_I, IB.copy()) + core._merge_topk_PI(comp_P, PB.copy(), comp_I, IB.copy()) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) + + +def test_merge_topk_PI_with_overlap(): + # This is to test function `core._merge_topk_PI(PA, PB, IA, IB)` when there + # is overlap between row IA[i] and row IB[i]. + n = 50 + for k in range(1, 6): + # note: we do not have overlap issue when k is 1. The `k=1` is considered + # for the sake of consistency with the `without-overlap` test function. 
+ PA = np.random.rand(n * k).reshape(n, k) + PB = np.random.rand(n * k).reshape(n, k) + + IA = np.arange(n * k).reshape(n, k) + IB = IA + n * k + + num_overlaps = np.random.randint(1, k + 1, size=n) + for i in range(n): + # create overlaps + col_IDX = np.random.choice(np.arange(k), num_overlaps[i], replace=False) + imprecision = np.random.uniform(low=-1e-06, high=1e-06, size=len(col_IDX)) + PB[i, col_IDX] = PA[i, col_IDX] + imprecision + IB[i, col_IDX] = IA[i, col_IDX] + + # sort each row of PA/PB (and update IA/IB accordingly) + IDX = np.argsort(PA, axis=1) + PA[:, :] = np.take_along_axis(PA, IDX, axis=1) + IA[:, :] = np.take_along_axis(IA, IDX, axis=1) + + IDX = np.argsort(PB, axis=1) + PB[:, :] = np.take_along_axis(PB, IDX, axis=1) + IB[:, :] = np.take_along_axis(IB, IDX, axis=1) + + ref_P = PA.copy() + ref_I = IA.copy() + + comp_P = PA.copy() + comp_I = IA.copy() + + naive.merge_topk_PI(ref_P, PB.copy(), ref_I, IB.copy()) + core._merge_topk_PI(comp_P, PB.copy(), comp_I, IB.copy()) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) + + +def test_merge_topk_PI_with_1D_input(): + # including some overlaps randomly + n = 50 + PA = np.random.rand(n) + PB = np.random.rand(n) + + IA = np.arange(n) + IB = IA + n + + n_overlaps = np.random.randint(1, n + 1) + IDX_rows_with_overlaps = np.random.choice(np.arange(n), n_overlaps, replace=False) + imprecision = np.random.uniform(low=-1e-06, high=1e-06, size=n_overlaps) + PB[IDX_rows_with_overlaps] = PA[IDX_rows_with_overlaps] + imprecision + IB[IDX_rows_with_overlaps] = IA[IDX_rows_with_overlaps] + + ref_P = PA.copy() + ref_I = IA.copy() + + comp_P = PA.copy() + comp_I = IA.copy() + + naive.merge_topk_PI(ref_P, PB.copy(), ref_I, IB.copy()) + core._merge_topk_PI(comp_P, PB.copy(), comp_I, IB.copy()) + + npt.assert_almost_equal(ref_P, comp_P) + npt.assert_almost_equal(ref_I, comp_I) + + +def test_merge_topk_ρI_without_overlap(): + # This is to test function `core._merge_topk_ρI(ρA, ρB, IA, IB)` when there + # is no overlap between row IA[i] and row IB[i]. + n = 50 + for k in range(1, 6): + ρA = np.random.rand(n * k).reshape(n, k) + ρA[:, :] = np.sort(ρA, axis=1) # sorting each row separately + + ρB = np.random.rand(n * k).reshape(n, k) + col_idx = np.random.randint(0, k, size=n) + for i in range(n): # creating ties between values of PA and PB + ρB[i, col_idx[i]] = np.random.choice(ρA[i], size=1, replace=False) + ρB[:, :] = np.sort(ρB, axis=1) # sorting each row separately + + IA = np.arange(n * k).reshape(n, k) + IB = IA + n * k + + ref_ρ = ρA.copy() + ref_I = IA.copy() + + comp_ρ = ρA.copy() + comp_I = IA.copy() + + naive.merge_topk_ρI(ref_ρ, ρB.copy(), ref_I, IB.copy()) + core._merge_topk_ρI(comp_ρ, ρB.copy(), comp_I, IB.copy()) + + npt.assert_almost_equal(ref_ρ, comp_ρ) + npt.assert_almost_equal(ref_I, comp_I) + + +def test_merge_topk_ρI_with_overlap(): + # This is to test function `core._merge_topk_ρI(ρA, ρB, IA, IB)` when there + # is overlap between row IA[i] and row IB[i]. + n = 50 + for k in range(1, 6): + # note: we do not have overlap issue when k is 1. The `k=1` is considered + # for the sake of consistency with the `without-overlap` test function. 
+ ρA = np.random.rand(n * k).reshape(n, k) + ρB = np.random.rand(n * k).reshape(n, k) + + IA = np.arange(n * k).reshape(n, k) + IB = IA + n * k + + num_overlaps = np.random.randint(1, k + 1, size=n) + for i in range(n): + # create overlaps + col_IDX = np.random.choice(np.arange(k), num_overlaps[i], replace=False) + imprecision = np.random.uniform(low=-1e-06, high=1e-06, size=len(col_IDX)) + ρB[i, col_IDX] = ρA[i, col_IDX] + imprecision + IB[i, col_IDX] = IA[i, col_IDX] + + # sort each row of ρA/ρB (and update IA/IB accordingly) + IDX = np.argsort(ρA, axis=1) + ρA[:, :] = np.take_along_axis(ρA, IDX, axis=1) + IA[:, :] = np.take_along_axis(IA, IDX, axis=1) + + IDX = np.argsort(ρB, axis=1) + ρB[:, :] = np.take_along_axis(ρB, IDX, axis=1) + IB[:, :] = np.take_along_axis(IB, IDX, axis=1) + + ref_ρ = ρA.copy() + ref_I = IA.copy() + + comp_ρ = ρA.copy() + comp_I = IA.copy() + + naive.merge_topk_ρI(ref_ρ, ρB.copy(), ref_I, IB.copy()) + core._merge_topk_ρI(comp_ρ, ρB.copy(), comp_I, IB.copy()) + + npt.assert_almost_equal(ref_ρ, comp_ρ) + npt.assert_almost_equal(ref_I, comp_I) + + +def test_merge_topk_ρI_with_1D_input(): + # including some overlaps randomly + n = 50 + ρA = np.random.rand(n) + ρB = np.random.rand(n) + + IA = np.arange(n) + IB = IA + n + + ref_ρ = ρA.copy() + ref_I = IA.copy() + + comp_ρ = ρA.copy() + comp_I = IA.copy() + + n_overlaps = np.random.randint(1, n + 1) + IDX_rows_with_overlaps = np.random.choice(np.arange(n), n_overlaps, replace=False) + imprecision = np.random.uniform(low=-1e-06, high=1e-06, size=n_overlaps) + ρB[IDX_rows_with_overlaps] = ρA[IDX_rows_with_overlaps] + imprecision + IB[IDX_rows_with_overlaps] = IA[IDX_rows_with_overlaps] + + naive.merge_topk_ρI(ref_ρ, ρB.copy(), ref_I, IB.copy()) + core._merge_topk_ρI(comp_ρ, ρB.copy(), comp_I, IB.copy()) + + npt.assert_almost_equal(ref_ρ, comp_ρ) + npt.assert_almost_equal(ref_I, comp_I) + + +def test_shift_insert_at_index(): + for k in range(1, 6): + a = np.random.rand(k) + ref = np.empty(k, dtype=np.float64) + comp = np.empty(k, dtype=np.float64) + + indices = np.arange(k + 1) + values = np.random.rand(k + 1) + + # test shift = "right" + for (idx, v) in zip(indices, values): + ref[:] = a + comp[:] = a + + ref = np.insert(ref, idx, v)[:-1] + core._shift_insert_at_index( + comp, idx, v, shift="right" + ) # update comp in place + + npt.assert_almost_equal(ref, comp) + + # test shift = "left" + for (idx, v) in zip(indices, values): + ref[:] = a + comp[:] = a + + ref = np.insert(ref, idx, v)[1:] + core._shift_insert_at_index( + comp, idx, v, shift="left" + ) # update comp in place + + npt.assert_almost_equal(ref, comp) + + def test_check_P(): with pytest.raises(ValueError): core._check_P(np.random.rand(10).reshape(2, 5)) diff --git a/tests/test_gpu_stump.py b/tests/test_gpu_stump.py index 508b02a56..a6d4e6953 100644 --- a/tests/test_gpu_stump.py +++ b/tests/test_gpu_stump.py @@ -1,10 +1,21 @@ +import math import numpy as np import numpy.testing as npt import pandas as pd -from stumpy import gpu_stump +from stumpy import core, gpu_stump from stumpy import config from numba import cuda +if cuda.is_available(): + from stumpy.gpu_stump import _gpu_searchsorted_left, _gpu_searchsorted_right +else: # pragma: no cover + from stumpy.core import ( + _gpu_searchsorted_left_driver_not_found as _gpu_searchsorted_left, + ) + from stumpy.core import ( + _gpu_searchsorted_right_driver_not_found as _gpu_searchsorted_right, + ) + try: from numba.errors import NumbaPerformanceWarning except ModuleNotFoundError: @@ -39,6 +50,62 @@ def 
test_gpu_stump_int_input(): gpu_stump(np.arange(10), 5, ignore_trivial=True) +@cuda.jit("(f8[:, :], f8[:], i8[:], i8, b1, i8[:])") +def _gpu_searchsorted_kernel(a, v, bfs, nlevel, is_left, idx): + # A wrapper kernel for calling device function _gpu_searchsorted_left/right. + i = cuda.grid(1) + if i < a.shape[0]: + if is_left: + idx[i] = _gpu_searchsorted_left(a[i], v[i], bfs, nlevel) + else: + idx[i] = _gpu_searchsorted_right(a[i], v[i], bfs, nlevel) + + +@pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) +def test_gpu_searchsorted(): + n = 3 * config.STUMPY_THREADS_PER_BLOCK + 1 + V = np.empty(n, dtype=np.float64) + + threads_per_block = config.STUMPY_THREADS_PER_BLOCK + blocks_per_grid = math.ceil(n / threads_per_block) + + for k in range(1, 32): + device_bfs = cuda.to_device(core._bfs_indices(k, fill_value=-1)) + nlevel = np.floor(np.log2(k) + 1).astype(np.int64) + + A = np.sort(np.random.rand(n, k), axis=1) + device_A = cuda.to_device(A) + + V[:] = np.random.rand(n) + for i, idx in enumerate(np.random.choice(np.arange(n), size=k, replace=False)): + V[idx] = A[idx, i] # create ties + device_V = cuda.to_device(V) + + is_left = True # test case + ref_IDX = [np.searchsorted(A[i], V[i], side="left") for i in range(n)] + ref_IDX = np.asarray(ref_IDX, dtype=np.int64) + + comp_IDX = np.full(n, -1, dtype=np.int64) + device_comp_IDX = cuda.to_device(comp_IDX) + _gpu_searchsorted_kernel[blocks_per_grid, threads_per_block]( + device_A, device_V, device_bfs, nlevel, is_left, device_comp_IDX + ) + comp_IDX = device_comp_IDX.copy_to_host() + npt.assert_array_equal(ref_IDX, comp_IDX) + + is_left = False # test case + ref_IDX = [np.searchsorted(A[i], V[i], side="right") for i in range(n)] + ref_IDX = np.asarray(ref_IDX, dtype=np.int64) + + comp_IDX = np.full(n, -1, dtype=np.int64) + device_comp_IDX = cuda.to_device(comp_IDX) + _gpu_searchsorted_kernel[blocks_per_grid, threads_per_block]( + device_A, device_V, device_bfs, nlevel, is_left, device_comp_IDX + ) + comp_IDX = device_comp_IDX.copy_to_host() + npt.assert_array_equal(ref_IDX, comp_IDX) + + @pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) @pytest.mark.parametrize("T_A, T_B", test_data) def test_gpu_stump_self_join(T_A, T_B): @@ -350,3 +417,32 @@ def test_gpu_stump_nan_zero_mean_self_join(): naive.replace_inf(ref_mp) naive.replace_inf(comp_mp) npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_gpu_stump_self_join_KNN(T_A, T_B): + m = 3 + zone = int(np.ceil(m / 4)) + for k in range(2, 4): + ref_mp = naive.stump(T_B, m, exclusion_zone=zone, row_wise=True, k=k) + comp_mp = gpu_stump(T_B, m, ignore_trivial=True, k=k) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + comp_mp = gpu_stump(pd.Series(T_B), m, ignore_trivial=True, k=k) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) + + +@pytest.mark.filterwarnings("ignore", category=NumbaPerformanceWarning) +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_gpu_stump_A_B_join_KNN(T_A, T_B): + m = 3 + for k in range(2, 4): + ref_mp = naive.stump(T_A, m, T_B=T_B, row_wise=True, k=k) + comp_mp = gpu_stump(T_A, m, T_B, ignore_trivial=False, k=k) + naive.replace_inf(ref_mp) + naive.replace_inf(comp_mp) + npt.assert_almost_equal(ref_mp, comp_mp) diff --git a/tests/test_non_normalized_decorator.py b/tests/test_non_normalized_decorator.py index 
2c6447d09..8da9354c8 100644 --- a/tests/test_non_normalized_decorator.py +++ b/tests/test_non_normalized_decorator.py @@ -340,6 +340,7 @@ def test_mmotifs(T, m): npt.assert_almost_equal(ref_distances, cmp_distances) +@pytest.mark.filterwarnings("ignore:All-NaN slice encountered") def test_snippets(): T = np.random.rand(64) m = 10 diff --git a/tests/test_scrump.py b/tests/test_scrump.py index 0e56a4c62..de978c856 100644 --- a/tests/test_scrump.py +++ b/tests/test_scrump.py @@ -107,11 +107,9 @@ def test_scrump_self_join(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T_B, m, T_B, percentage, zone, False, None) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump( + T_B, m, T_B, percentage, zone, False, None + ) np.random.seed(seed) approx = scrump( @@ -140,11 +138,9 @@ def test_scrump_A_B_join(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T_A, m, T_B, percentage, None, False, None) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump( + T_A, m, T_B, percentage, None, False, None + ) np.random.seed(seed) approx = scrump( @@ -174,11 +170,9 @@ def test_scrump_A_B_join_swap(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T_B, m, T_A, percentage, None, False, None) - ref_P = ref_mp[:, 0] - # ref_I = ref_mp[:, 1] - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + ref_P, _, ref_left_I, ref_right_I = naive.scrump( + T_B, m, T_A, percentage, None, False, None + ) np.random.seed(seed) approx = scrump( @@ -210,11 +204,9 @@ def test_scrump_self_join_larger_window(T_A, T_B, m, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T_B, m, T_B, percentage, zone, False, None) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump( + T_B, m, T_B, percentage, zone, False, None + ) np.random.seed(seed) approx = scrump( @@ -377,16 +369,12 @@ def test_scrump_plus_plus_self_join(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_P, ref_I = naive.prescrump(T_B, m, T_B, s=s, exclusion_zone=zone) - ref_mp = naive.scrump(T_B, m, T_B, percentage, zone, True, s) - for i in range(ref_mp.shape[0]): - if ref_P[i] < ref_mp[i, 0]: - ref_mp[i, 0] = ref_P[i] - ref_mp[i, 1] = ref_I[i] - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] - # ref_left_I = ref_mp[:, 2] - # ref_right_I = ref_mp[:, 3] + ref_P, ref_I = naive.prescrump(T_B, m, T_B, s=s, exclusion_zone=zone, k=1) + ref_P_aux, ref_I_aux, _, _ = naive.scrump( + T_B, m, T_B, percentage, zone, True, s, k=1 + ) + + naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) np.random.seed(seed) approx = scrump( @@ -395,16 +383,14 @@ def test_scrump_plus_plus_self_join(T_A, T_B, percentages): approx.update() comp_P = approx.P_ comp_I = approx.I_ - # comp_left_I = approx.left_I_ - # comp_right_I = approx.right_I_ naive.replace_inf(ref_P) - naive.replace_inf(comp_I) + naive.replace_inf(comp_P) + ref_P = ref_P + ref_I = ref_I npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) - # npt.assert_almost_equal(ref_left_I, comp_left_I) - # npt.assert_almost_equal(ref_right_I, comp_right_I) @pytest.mark.parametrize("T_A, 
T_B", test_data) @@ -418,16 +404,15 @@ def test_scrump_plus_plus_A_B_join(T_A, T_B, percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_P, ref_I = naive.prescrump(T_A, m, T_B, s=s) - ref_mp = naive.scrump(T_A, m, T_B, percentage, None, False, None) - for i in range(ref_mp.shape[0]): - if ref_P[i] < ref_mp[i, 0]: - ref_mp[i, 0] = ref_P[i] - ref_mp[i, 1] = ref_I[i] - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + ref_P, ref_I = naive.prescrump(T_A, m, T_B, s=s, k=1) + + ref_P_aux, ref_I_aux, ref_left_I_aux, ref_right_I_aux = naive.scrump( + T_A, m, T_B, percentage, None, False, None, k=1 + ) + + naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux) + ref_left_I = ref_left_I_aux + ref_right_I = ref_right_I_aux approx = scrump( T_A, @@ -447,6 +432,8 @@ def test_scrump_plus_plus_A_B_join(T_A, T_B, percentages): naive.replace_inf(ref_P) naive.replace_inf(comp_P) + ref_P = ref_P + ref_I = ref_I npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) npt.assert_almost_equal(ref_left_I, comp_left_I) @@ -551,11 +538,9 @@ def test_scrump_constant_subsequence_self_join(percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T, m, T, percentage, zone, False, None) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump( + T, m, T, percentage, zone, False, None + ) np.random.seed(seed) approx = scrump( @@ -589,11 +574,7 @@ def test_scrump_identical_subsequence_self_join(percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T, m, T, percentage, zone, False, None) - ref_P = ref_mp[:, 0] - # ref_I = ref_mp[:, 1] - # ref_left_I = ref_mp[:, 2] - # ref_right_I = ref_mp[:, 3] + ref_P, _, _, _ = naive.scrump(T, m, T, percentage, zone, False, None) np.random.seed(seed) approx = scrump( @@ -635,11 +616,9 @@ def test_scrump_nan_inf_self_join( seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T_B_sub, m, T_B_sub, percentage, zone, False, None) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump( + T_B_sub, m, T_B_sub, percentage, zone, False, None + ) np.random.seed(seed) approx = scrump(T_B_sub, m, percentage=percentage, pre_scrump=False) @@ -669,11 +648,9 @@ def test_scrump_nan_zero_mean_self_join(percentages): seed = np.random.randint(100000) np.random.seed(seed) - ref_mp = naive.scrump(T, m, T, percentage, zone, False, None) - ref_P = ref_mp[:, 0] - ref_I = ref_mp[:, 1] - ref_left_I = ref_mp[:, 2] - ref_right_I = ref_mp[:, 3] + ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump( + T, m, T, percentage, zone, False, None + ) np.random.seed(seed) approx = scrump(T, m, percentage=percentage, pre_scrump=False) @@ -708,3 +685,300 @@ def test_prescrump_A_B_join_larger_window(T_A, T_B): npt.assert_almost_equal(ref_P, comp_P) npt.assert_almost_equal(ref_I, comp_I) + + +@pytest.mark.parametrize("T_A, T_B", test_data) +def test_prescrump_self_join_KNN(T_A, T_B): + m = 3 + zone = int(np.ceil(m / 4)) + for k in range(2, 4): + for s in range(1, zone + 1): + seed = np.random.randint(100000) + + np.random.seed(seed) + ref_P, ref_I = naive.prescrump(T_B, m, T_B, s=s, exclusion_zone=zone, k=k) + + np.random.seed(seed) + comp_P, comp_I = prescrump(T_B, m, s=s, k=k) + + npt.assert_almost_equal(ref_I, comp_I) + 
+
+            npt.assert_almost_equal(ref_I, comp_I)
+            npt.assert_almost_equal(ref_P, comp_P)
+
+
+@pytest.mark.parametrize("T_A, T_B", test_data)
+def test_prescrump_A_B_join_KNN(T_A, T_B):
+    m = 3
+    zone = int(np.ceil(m / 4))
+    for k in range(2, 4):
+        for s in range(1, zone + 1):
+            seed = np.random.randint(100000)
+
+            np.random.seed(seed)
+            ref_P, ref_I = naive.prescrump(T_A, m, T_B, s=s, k=k)
+
+            np.random.seed(seed)
+            comp_P, comp_I = prescrump(T_A, m, T_B=T_B, s=s, k=k)
+
+            npt.assert_almost_equal(ref_P, comp_P)
+            npt.assert_almost_equal(ref_I, comp_I)
+
+
+@pytest.mark.parametrize("T_A, T_B", test_data)
+@pytest.mark.parametrize("percentages", percentages)
+def test_scrump_self_join_KNN(T_A, T_B, percentages):
+    m = 3
+    zone = int(np.ceil(m / 4))
+
+    for k in range(2, 4):
+        for percentage in percentages:
+            seed = np.random.randint(100000)
+
+            np.random.seed(seed)
+            ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(
+                T_B, m, T_B, percentage, zone, False, None, k=k
+            )
+
+            np.random.seed(seed)
+            approx = scrump(
+                T_B,
+                m,
+                ignore_trivial=True,
+                percentage=percentage,
+                pre_scrump=False,
+                k=k,
+            )
+            approx.update()
+            comp_P = approx.P_
+            comp_I = approx.I_
+            comp_left_I = approx.left_I_
+            comp_right_I = approx.right_I_
+
+            naive.replace_inf(ref_P)
+            naive.replace_inf(comp_P)
+            npt.assert_almost_equal(ref_P, comp_P)
+            npt.assert_almost_equal(ref_I, comp_I)
+            npt.assert_almost_equal(ref_left_I, comp_left_I)
+            npt.assert_almost_equal(ref_right_I, comp_right_I)
+
+
+@pytest.mark.parametrize("T_A, T_B", test_data)
+@pytest.mark.parametrize("percentages", percentages)
+def test_scrump_A_B_join_KNN(T_A, T_B, percentages):
+    m = 3
+    for k in range(2, 4):
+        for percentage in percentages:
+            seed = np.random.randint(100000)
+
+            np.random.seed(seed)
+            ref_P, ref_I, ref_left_I, ref_right_I = naive.scrump(
+                T_A, m, T_B, percentage, None, False, None, k=k
+            )
+
+            np.random.seed(seed)
+            approx = scrump(
+                T_A,
+                m,
+                T_B,
+                ignore_trivial=False,
+                percentage=percentage,
+                pre_scrump=False,
+                k=k,
+            )
+            approx.update()
+            comp_P = approx.P_
+            comp_I = approx.I_
+            comp_left_I = approx.left_I_
+            comp_right_I = approx.right_I_
+
+            naive.replace_inf(ref_P)
+            naive.replace_inf(comp_P)
+
+            npt.assert_almost_equal(ref_P, comp_P)
+            npt.assert_almost_equal(ref_I, comp_I)
+            npt.assert_almost_equal(ref_left_I, comp_left_I)
+            npt.assert_almost_equal(ref_right_I, comp_right_I)
+
+
+@pytest.mark.parametrize("T_A, T_B", test_data)
+@pytest.mark.parametrize("percentages", percentages)
+def test_scrump_plus_plus_self_join_KNN(T_A, T_B, percentages):
+    m = 3
+    zone = int(np.ceil(m / 4))
+
+    for k in range(2, 4):
+        for s in range(1, zone + 1):
+            for percentage in percentages:
+                seed = np.random.randint(100000)
+
+                np.random.seed(seed)
+                ref_P, ref_I = naive.prescrump(
+                    T_B, m, T_B, s=s, exclusion_zone=zone, k=k
+                )
+                ref_P_aux, ref_I_aux, _, _ = naive.scrump(
+                    T_B, m, T_B, percentage, zone, True, s, k=k
+                )
+                naive.merge_topk_PI(ref_P, ref_P_aux, ref_I, ref_I_aux)
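+                # A naive mental model for this top-k merge (a sketch only; it
+                # ignores the duplicate-index bookkeeping the real merge does):
+                # per row, keep the k smallest values drawn from both arrays,
+                # with a stable sort so entries of `ref_P` win ties.
+                #
+                #     for i in range(ref_P.shape[0]):
+                #         PC = np.concatenate([ref_P[i], ref_P_aux[i]])
+                #         IC = np.concatenate([ref_I[i], ref_I_aux[i]])
+                #         order = np.argsort(PC, kind="stable")[:k]
+                #         ref_P[i], ref_I[i] = PC[order], IC[order]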
+
+                np.random.seed(seed)
+                approx = scrump(
+                    T_B,
+                    m,
+                    ignore_trivial=True,
+                    percentage=percentage,
+                    pre_scrump=True,
+                    s=s,
+                    k=k,
+                )
+                approx.update()
+                comp_P = approx.P_
+                comp_I = approx.I_
+
+                naive.replace_inf(ref_P)
+                naive.replace_inf(comp_P)
+
+                npt.assert_almost_equal(ref_P, comp_P)
+                npt.assert_almost_equal(ref_I, comp_I)
+
+
+@pytest.mark.parametrize("T_A, T_B", test_data)
+def test_prescrump_self_join_larger_window_m_5_k_5(T_A, T_B):
+    m = 5
+    k = 5
+    zone = int(np.ceil(m / 4))
+
+    if len(T_B) > m:
+        for s in range(1, zone + 1):
+            seed = np.random.randint(100000)
+
+            np.random.seed(seed)
+            ref_P, ref_I = naive.prescrump(T_B, m, T_B, s=s, exclusion_zone=zone, k=k)
+
+            np.random.seed(seed)
+            comp_P, comp_I = prescrump(T_B, m, s=s, k=k)
+
+            npt.assert_almost_equal(ref_P, comp_P)
+            npt.assert_almost_equal(ref_I, comp_I)
+
+
+@pytest.mark.parametrize("T_A, T_B", test_data)
+def test_prescrump_A_B_join_larger_window_m_5_k_5(T_A, T_B):
+    m = 5
+    k = 5
+    zone = int(np.ceil(m / 4))
+    if len(T_A) > m and len(T_B) > m:
+        for s in range(1, zone + 1):
+            seed = np.random.randint(100000)
+
+            np.random.seed(seed)
+            ref_P, ref_I = naive.prescrump(T_A, m, T_B, s=s, k=k)
+
+            np.random.seed(seed)
+            comp_P, comp_I = prescrump(T_A, m, T_B, s=s, k=k)
+
+            npt.assert_almost_equal(ref_P, comp_P)
+            npt.assert_almost_equal(ref_I, comp_I)
+
+
+def test_prescrump_self_join_KNN_no_overlap():
+    # This test is specifically designed to raise an error in a rare case
+    # described as follows: let `I[i]` denote the array of length `k` holding
+    # the start indices of the best-so-far top-k nearest neighbors of
+    # subsequence `i` (`S_i`), and let `P[i]` denote their corresponding
+    # distances, sorted in ascending order. Let `d` be the distance between
+    # `S_i` and `S_j`. `P[i]` and `I[i]` must be updated if (1) `j` is not in
+    # `I[i]` and (2) `d < P[i, -1]`. Regarding the former condition, one needs
+    # to check the whole array `I[i]`. Checking only `I[i, :idx]`, where
+    # `idx = np.searchsorted(P[i], d, side='right')`, is not completely
+    # correct, due to imprecision in numerical calculations. It may happen
+    # that `j` is not in `I[i, :idx]` but is in fact at `I[i, idx]` (or at any
+    # other position in `I[i]`), with a corresponding distance `P[i, idx]` of,
+    # say, `d + 1e-5`. In theory, this should be exactly `d`; however, due to
+    # imprecision, a slightly different value may have been computed for this
+    # distance in one of the earlier iterations of `prescrump`. This test
+    # therefore fails if someone changes the performant `prescrump` code to
+    # check only `I[i, :idx]` rather than the full array `I[i]`.
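+    # A minimal sketch of the update this comment describes, for a single
+    # subsequence with ascending distances `P` (length k), indices `I`,
+    # candidate neighbor `j`, and distance `d` (illustrative names only):
+    #
+    #     idx = np.searchsorted(P, d, side="right")
+    #     in_I = np.any(I == j)          # correct: scan the full array I
+    #     # in_I = np.any(I[:idx] == j)  # fragile: may miss j beyond idx
+    #     if d < P[-1] and not in_I:
+    #         P[idx + 1 :] = P[idx:-1]   # shift larger entries to the right
+    #         I[idx + 1 :] = I[idx:-1]
+    #         P[idx] = d
+    #         I[idx] = j
+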
+    T = np.array(
+        [
+            -916.64703784,
+            -327.42056679,
+            379.19386284,
+            -281.80427628,
+            -189.85401773,
+            -38.69610569,
+            187.89889345,
+            578.65862523,
+            528.09687811,
+            -667.42973795,
+            -285.27749324,
+            -211.28930925,
+            -703.93802657,
+            -820.53780562,
+            -955.91174663,
+            383.65471851,
+            932.08809422,
+            -563.57569746,
+            784.0546579,
+            -343.14886064,
+            -612.72329848,
+            -270.09273091,
+            -448.39346549,
+            578.03202014,
+            867.15436674,
+            -783.55167049,
+            -494.78062922,
+            -311.18567747,
+            522.70052256,
+            933.45474094,
+            192.34822368,
+            -162.11374908,
+            -612.95359279,
+            -449.62297051,
+            -351.79138459,
+            -77.70189101,
+            -439.46519487,
+            -660.48431174,
+            548.69362177,
+            485.36004744,
+            -535.3566627,
+            -568.0955257,
+            755.26647273,
+            736.1079588,
+            -597.65672557,
+            379.3299783,
+            731.38211912,
+            247.34827447,
+            545.41888454,
+            644.94300763,
+            20.99042666,
+            788.19859515,
+            -898.24325898,
+            -929.47841134,
+            -738.45875181,
+            66.01030291,
+            512.945841,
+            -44.07720164,
+            302.97141464,
+            -696.95271302,
+            662.98385163,
+            -712.3807531,
+            -43.62688539,
+            74.16927482,
+        ]
+    )
+
+    # test_cases: a dict whose keys are `(m, k)` pairs and whose values are
+    # lists of random seeds
+    test_cases = {
+        (3, 2): [4279, 9133, 8190],
+        (3, 5): [1267, 4016, 4046],
+        (5, 2): [6327, 4926, 3712],
+        (5, 5): [3032, 3032, 8117],
+    }
+    for (m, k), specified_seeds in test_cases.items():
+        zone = int(np.ceil(m / 4))
+        for seed in specified_seeds:
+            np.random.seed(seed)
+            ref_P, ref_I = naive.prescrump(T, m, T, s=1, exclusion_zone=zone, k=k)
+            comp_P, comp_I = prescrump(T, m, s=1, k=k)
+
+            npt.assert_almost_equal(ref_P, comp_P)
+            npt.assert_almost_equal(ref_I, comp_I)
diff --git a/tests/test_stimp.py b/tests/test_stimp.py
index 089c1f1f9..a56bec695 100644
--- a/tests/test_stimp.py
+++ b/tests/test_stimp.py
@@ -50,12 +50,9 @@ def test_stimp_1_percent(T):
         zone = int(np.ceil(m / 4))
         s = zone
         tmp_P, tmp_I = naive.prescrump(T, m, T, s=s, exclusion_zone=zone)
-        ref_mp = naive.scrump(T, m, T, percentage, zone, True, s)
-        for i in range(ref_mp.shape[0]):
-            if tmp_P[i] < ref_mp[i, 0]:
-                ref_mp[i, 0] = tmp_P[i]
-                ref_mp[i, 1] = tmp_I[i]
-        ref_PAN[pan._bfs_indices[idx], : ref_mp.shape[0]] = ref_mp[:, 0]
+        ref_P, ref_I, _, _ = naive.scrump(T, m, T, percentage, zone, True, s)
+        naive.merge_topk_PI(ref_P, tmp_P, ref_I, tmp_I)
+        ref_PAN[pan._bfs_indices[idx], : ref_P.shape[0]] = ref_P
 
     # Compare raw pan
     cmp_PAN = pan._PAN
@@ -108,12 +105,9 @@ def test_stimp_max_m(T):
         zone = int(np.ceil(m / 4))
         s = zone
         tmp_P, tmp_I = naive.prescrump(T, m, T, s=s, exclusion_zone=zone)
-        ref_mp = naive.scrump(T, m, T, percentage, zone, True, s)
-        for i in range(ref_mp.shape[0]):
-            if tmp_P[i] < ref_mp[i, 0]:
-                ref_mp[i, 0] = tmp_P[i]
-                ref_mp[i, 1] = tmp_I[i]
-        ref_PAN[pan._bfs_indices[idx], : ref_mp.shape[0]] = ref_mp[:, 0]
+        ref_P, ref_I, _, _ = naive.scrump(T, m, T, percentage, zone, True, s)
+        naive.merge_topk_PI(ref_P, tmp_P, ref_I, tmp_I)
+        ref_PAN[pan._bfs_indices[idx], : ref_P.shape[0]] = ref_P
 
     # Compare raw pan
     cmp_PAN = pan._PAN
diff --git a/tests/test_stump.py b/tests/test_stump.py
index 6feaf1598..e08746758 100644
--- a/tests/test_stump.py
+++ b/tests/test_stump.py
@@ -240,3 +240,34 @@ def test_stump_nan_zero_mean_self_join():
     naive.replace_inf(ref_mp)
     naive.replace_inf(comp_mp)
     npt.assert_almost_equal(ref_mp, comp_mp)
+
+
+@pytest.mark.parametrize("T_A, T_B", test_data)
+def test_stump_self_join_KNN(T_A, T_B):
+    m = 3
+    zone = int(np.ceil(m / 4))
+    for k in range(2, 4):
+        ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k)
+        comp_mp = stump(T_B, m, ignore_trivial=True, k=k)
+        naive.replace_inf(ref_mp)
+        naive.replace_inf(comp_mp)
+        npt.assert_almost_equal(ref_mp, comp_mp)
+
+        comp_mp = stump(pd.Series(T_B), m, ignore_trivial=True, k=k)
+        naive.replace_inf(comp_mp)
+        npt.assert_almost_equal(ref_mp, comp_mp)
+
+
+@pytest.mark.parametrize("T_A, T_B", test_data)
+def test_stump_A_B_join_KNN(T_A, T_B):
+    m = 3
+    for k in range(2, 4):
+        ref_mp = naive.stump(T_A, m, T_B=T_B, k=k)
+        comp_mp = stump(T_A, m, T_B, ignore_trivial=False, k=k)
+        naive.replace_inf(ref_mp)
+        naive.replace_inf(comp_mp)
+        npt.assert_almost_equal(ref_mp, comp_mp)
+
+        comp_mp = stump(pd.Series(T_A), m, pd.Series(T_B), ignore_trivial=False, k=k)
+        naive.replace_inf(comp_mp)
+        npt.assert_almost_equal(ref_mp, comp_mp)
diff --git a/tests/test_stumped.py b/tests/test_stumped.py
index ca53829fc..9181d81c8 100644
--- a/tests/test_stumped.py
+++ b/tests/test_stumped.py
@@ -608,3 +608,36 @@ def test_stumped_two_subsequences_nan_inf_A_B_join_swap(
         naive.replace_inf(ref_mp)
         naive.replace_inf(comp_mp)
         npt.assert_almost_equal(ref_mp, comp_mp)
+
+
+@pytest.mark.filterwarnings("ignore:numpy.dtype size changed")
+@pytest.mark.filterwarnings("ignore:numpy.ufunc size changed")
+@pytest.mark.filterwarnings("ignore:numpy.ndarray size changed")
+@pytest.mark.filterwarnings("ignore:\\s+Port 8787 is already in use:UserWarning")
+@pytest.mark.parametrize("T_A, T_B", test_data)
+def test_stumped_self_join_KNN(T_A, T_B, dask_cluster):
+    with Client(dask_cluster) as dask_client:
+        m = 3
+        zone = int(np.ceil(m / 4))
+        for k in range(2, 4):
+            ref_mp = naive.stump(T_B, m, exclusion_zone=zone, k=k)
+            comp_mp = stumped(dask_client, T_B, m, ignore_trivial=True, k=k)
+            naive.replace_inf(ref_mp)
+            naive.replace_inf(comp_mp)
+            npt.assert_almost_equal(ref_mp, comp_mp)
+
+
+@pytest.mark.filterwarnings("ignore:numpy.dtype size changed")
+@pytest.mark.filterwarnings("ignore:numpy.ufunc size changed")
+@pytest.mark.filterwarnings("ignore:numpy.ndarray size changed")
+@pytest.mark.filterwarnings("ignore:\\s+Port 8787 is already in use:UserWarning")
+@pytest.mark.parametrize("T_A, T_B", test_data)
+def test_stumped_A_B_join_KNN(T_A, T_B, dask_cluster):
+    with Client(dask_cluster) as dask_client:
+        m = 3
+        for k in range(2, 4):
+            ref_mp = naive.stump(T_A, m, T_B=T_B, k=k)
+            comp_mp = stumped(dask_client, T_A, m, T_B, ignore_trivial=False, k=k)
+            naive.replace_inf(ref_mp)
+            naive.replace_inf(comp_mp)
+            npt.assert_almost_equal(ref_mp, comp_mp)
diff --git a/tests/test_stumpi.py b/tests/test_stumpi.py
index 0fa2c3066..5ab2023a7 100644
--- a/tests/test_stumpi.py
+++ b/tests/test_stumpi.py
@@ -35,9 +35,8 @@ def test_stumpi_self_join():
     ref_mp = naive.stump(stream.T_, m, exclusion_zone=zone, row_wise=True)
     ref_P = ref_mp[:, 0]
     ref_I = ref_mp[:, 1]
-    ref_left_P = np.empty(ref_P.shape)
-    ref_left_P[:] = np.inf
-    ref_left_I = ref_mp[:, 2]
+    ref_left_I = ref_mp[:, 2].astype(np.int64)
+    ref_left_P = np.full_like(ref_left_I, np.inf, dtype=np.float64)
     for i, j in enumerate(ref_left_I):
         if j >= 0:
             D = core.mass(stream.T_[i : i + m], stream.T_[j : j + m])
@@ -861,8 +860,8 @@ def test_stumpi_profile_index_match():
     T_stream = T_full[:warm_start].copy()
     stream = stumpi(T_stream, m, egress=True)
 
-    P = np.full(stream.P_.shape, np.inf)
-    left_P = np.full(stream.left_P_.shape, np.inf)
+    P = np.full_like(stream.P_, np.inf, dtype=np.float64)
+    left_P = np.full_like(stream.left_P_, np.inf, dtype=np.float64)
 
     n = 0
     for i in range(len(T_stream), len(T_full)):
@@ -870,18 +869,18 @@
         stream.update(t)
 
         P[:] = np.inf
-        idx = np.argwhere(stream.I_ >= 0).flatten()
-        P[idx] = naive.distance(
-            naive.z_norm(T_full_subseq[idx + n + 1], axis=1),
-            naive.z_norm(T_full_subseq[stream.I_[idx]], axis=1),
+        indices = np.argwhere(stream.I_ >= 0).flatten()
+        P[indices] = naive.distance(
+            naive.z_norm(T_full_subseq[indices + n + 1], axis=1),
+            naive.z_norm(T_full_subseq[stream.I_[indices]], axis=1),
             axis=1,
         )
 
         left_P[:] = np.inf
-        idx = np.argwhere(stream.left_I_ >= 0).flatten()
-        left_P[idx] = naive.distance(
-            naive.z_norm(T_full_subseq[idx + n + 1], axis=1),
-            naive.z_norm(T_full_subseq[stream.left_I_[idx]], axis=1),
+        indices = np.argwhere(stream.left_I_ >= 0).flatten()
+        left_P[indices] = naive.distance(
+            naive.z_norm(T_full_subseq[indices + n + 1], axis=1),
+            naive.z_norm(T_full_subseq[stream.left_I_[indices]], axis=1),
             axis=1,
         )
 
@@ -889,3 +888,175 @@ def test_stumpi_profile_index_match():
         npt.assert_almost_equal(stream.left_P_, left_P)
 
         n += 1
+
+
+def test_stumpi_self_join_KNN():
+    m = 3
+    zone = int(np.ceil(m / 4))
+
+    for k in range(2, 4):
+        seed = np.random.randint(100000)
+        np.random.seed(seed)
+
+        T = np.random.rand(30)
+        stream = stumpi(T, m, egress=False, k=k)
+        for i in range(34):
+            t = np.random.rand()
+            stream.update(t)
+
+        comp_P = stream.P_
+        comp_I = stream.I_
+        comp_left_P = stream.left_P_
+        comp_left_I = stream.left_I_
+
+        ref_mp = naive.stump(stream.T_, m, exclusion_zone=zone, row_wise=True, k=k)
+        ref_P = ref_mp[:, :k]
+        ref_I = ref_mp[:, k : 2 * k]
+        ref_left_I = ref_mp[:, 2 * k].astype(np.int64)
+        ref_left_P = np.full_like(ref_left_I, np.inf, dtype=np.float64)
+        for i, j in enumerate(ref_left_I):
+            if j >= 0:
+                D = core.mass(stream.T_[i : i + m], stream.T_[j : j + m])
+                ref_left_P[i] = D[0]
+
+        naive.replace_inf(ref_P)
+        naive.replace_inf(ref_left_P)
+        naive.replace_inf(comp_P)
+        naive.replace_inf(comp_left_P)
+
+        npt.assert_almost_equal(ref_P, comp_P)
+        npt.assert_almost_equal(ref_I, comp_I)
+        npt.assert_almost_equal(ref_left_P, comp_left_P)
+        npt.assert_almost_equal(ref_left_I, comp_left_I)
+
+        np.random.seed(seed)
+        T = np.random.rand(30)
+        T = pd.Series(T)
+        stream = stumpi(T, m, egress=False, k=k)
+        for i in range(34):
+            t = np.random.rand()
+            stream.update(t)
+
+        comp_P = stream.P_
+        comp_I = stream.I_
+        comp_left_P = stream.left_P_
+        comp_left_I = stream.left_I_
+
+        naive.replace_inf(comp_P)
+        naive.replace_inf(comp_left_P)
+
+        npt.assert_almost_equal(ref_P, comp_P)
+        npt.assert_almost_equal(ref_I, comp_I)
+        npt.assert_almost_equal(ref_left_P, comp_left_P)
+        npt.assert_almost_equal(ref_left_I, comp_left_I)
+
+
+def test_stumpi_self_join_egress_KNN():
+    m = 3
+
+    for k in range(2, 4):
+        seed = np.random.randint(100000)
+        np.random.seed(seed)
+        n = 30
+        T = np.random.rand(n)
+
+        ref_mp = naive.stumpi_egress(T, m, k=k)
+        ref_P = ref_mp.P_.copy()
+        ref_I = ref_mp.I_
+        ref_left_P = ref_mp.left_P_.copy()
+        ref_left_I = ref_mp.left_I_
+
+        stream = stumpi(T, m, egress=True, k=k)
+
+        comp_P = stream.P_.copy()
+        comp_I = stream.I_
+        comp_left_P = stream.left_P_.copy()
+        comp_left_I = stream.left_I_
+
+        naive.replace_inf(ref_P)
+        naive.replace_inf(ref_left_P)
+        naive.replace_inf(comp_P)
+        naive.replace_inf(comp_left_P)
+
+        npt.assert_almost_equal(ref_P, comp_P)
+        npt.assert_almost_equal(ref_I, comp_I)
+        npt.assert_almost_equal(ref_left_P, comp_left_P)
+        npt.assert_almost_equal(ref_left_I, comp_left_I)
+
+        for i in range(34):
+            t = np.random.rand()
+            ref_mp.update(t)
+            stream.update(t)
+
+            comp_P = stream.P_.copy()
+            comp_I = stream.I_
+            comp_left_P = stream.left_P_.copy()
+            comp_left_I = stream.left_I_
+
+            ref_P = ref_mp.P_.copy()
+            ref_I = ref_mp.I_
+            ref_left_P = ref_mp.left_P_.copy()
+            ref_left_I = ref_mp.left_I_
+
+            naive.replace_inf(ref_P)
+            naive.replace_inf(ref_left_P)
+            naive.replace_inf(comp_P)
+            naive.replace_inf(comp_left_P)
+
+            npt.assert_almost_equal(ref_P, comp_P)
+            npt.assert_almost_equal(ref_I, comp_I)
+            npt.assert_almost_equal(ref_left_P, comp_left_P)
+            npt.assert_almost_equal(ref_left_I, comp_left_I)
+
+        np.random.seed(seed)
+        T = np.random.rand(n)
+        T = pd.Series(T)
+
+        ref_mp = naive.stumpi_egress(T, m, k=k)
+        ref_P = ref_mp.P_.copy()
+        ref_I = ref_mp.I_
+        ref_left_P = ref_mp.left_P_.copy()
+        ref_left_I = ref_mp.left_I_
+
+        stream = stumpi(T, m, egress=True, k=k)
+
+        comp_P = stream.P_.copy()
+        comp_I = stream.I_
+        comp_left_P = stream.left_P_.copy()
+        comp_left_I = stream.left_I_
+
+        naive.replace_inf(ref_P)
+        naive.replace_inf(ref_left_P)
+        naive.replace_inf(comp_P)
+        naive.replace_inf(comp_left_P)
+
+        npt.assert_almost_equal(ref_P, comp_P)
+        npt.assert_almost_equal(ref_I, comp_I)
+        npt.assert_almost_equal(ref_left_P, comp_left_P)
+        npt.assert_almost_equal(ref_left_I, comp_left_I)
+
+        for i in range(34):
+            t = np.random.rand()
+            ref_mp.update(t)
+            stream.update(t)
+
+            comp_P = stream.P_.copy()
+            comp_I = stream.I_
+            comp_left_P = stream.left_P_.copy()
+            comp_left_I = stream.left_I_
+
+            ref_P = ref_mp.P_.copy()
+            ref_I = ref_mp.I_
+            ref_left_P = ref_mp.left_P_.copy()
+            ref_left_I = ref_mp.left_I_
+
+            naive.replace_inf(ref_P)
+            naive.replace_inf(ref_left_P)
+            naive.replace_inf(comp_P)
+            naive.replace_inf(comp_left_P)
+
+            npt.assert_almost_equal(ref_P, comp_P)
+            npt.assert_almost_equal(ref_I, comp_I)
+            npt.assert_almost_equal(ref_left_P, comp_left_P)
+            npt.assert_almost_equal(ref_left_I, comp_left_I)
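+
+
+# Note: with k > 1, the streaming attributes compared above are 2D: `stream.P_`
+# holds one ascending row of k distances per subsequence and `stream.I_` the
+# matching indices, while `left_P_` and `left_I_` remain 1D.  A usage sketch
+# (illustrative values only, not executed by this test suite):
+#
+#     stream = stumpi(np.random.rand(30), m=3, egress=True, k=3)
+#     stream.update(np.random.rand())
+#     assert stream.P_.shape[1] == 3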