diff --git a/.devcontainer/cuda12.2-gcc12/devcontainer.json b/.devcontainer/cuda12.2-gcc12/devcontainer.json new file mode 100644 index 000000000..199ce44f4 --- /dev/null +++ b/.devcontainer/cuda12.2-gcc12/devcontainer.json @@ -0,0 +1,39 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:23.08-cpp-gcc12-cuda12.2-ubuntu22.04", + "hostRequirements": { + "gpu": true + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", + "DEVCONTAINER_NAME": "cuda12.2-gcc12" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd" + ], + "settings": { + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}/build/latest" + ] + } + } + }, + "name": "cuda12.2-gcc12" +} diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 000000000..84cfa82cc --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,37 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:23.08-cpp-gcc12-cuda12.2-ubuntu22.04", + "hostRequirements": { + "gpu": true + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p 
${localWorkspaceFolder}/.{aws,cache,config}" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd" + ], + "settings": { + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}/build/latest" + ] + } + } + } +} \ No newline at end of file diff --git a/.devcontainer/launch.sh b/.devcontainer/launch.sh new file mode 100755 index 000000000..157a49bef --- /dev/null +++ b/.devcontainer/launch.sh @@ -0,0 +1,58 @@ +#! /usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +launch_devcontainer() { + + # Ensure we're in the repo root + cd "$( cd "$( dirname "$(realpath -m "${BASH_SOURCE[0]}")" )" && pwd )/.."; + + if [[ -z $1 ]] || [[ -z $2 ]]; then + echo "Usage: $0 [CUDA version] [Host compiler]" + echo "Example: $0 12.1 gcc12" + return 1 + fi + + local cuda_version="$1" + local host_compiler="$2" + local workspace="$(basename "$(pwd)")"; + local tmpdir="$(mktemp -d)/${workspace}"; + local path="$(pwd)/.devcontainer/cuda${cuda_version}-${host_compiler}"; + + mkdir -p "${tmpdir}"; + mkdir -p "${tmpdir}/.devcontainer"; + cp -arL "$path/devcontainer.json" "${tmpdir}/.devcontainer"; + sed -i "s@\${localWorkspaceFolder}@$(pwd)@g" "${tmpdir}/.devcontainer/devcontainer.json"; + path="${tmpdir}"; + + local hash="$(echo -n "${path}" | xxd -pu - | tr -d '[:space:]')"; + local url="vscode://vscode-remote/dev-container+${hash}/home/coder/cuCollections"; + + echo "devcontainer URL: ${url}"; + + local launch=""; + if type open >/dev/null 2>&1; then + launch="open"; + elif type xdg-open >/dev/null 2>&1; then + launch="xdg-open"; + fi + + if [ -n "${launch}" ]; then + code --new-window "${tmpdir}"; + exec "${launch}" "${url}" >/dev/null 2>&1; + fi +} + +launch_devcontainer "$@"; \ No newline at end of file diff --git a/.devcontainer/make_devcontainers.sh b/.devcontainer/make_devcontainers.sh new file mode 100755 index 000000000..700dc3713 --- /dev/null +++ b/.devcontainer/make_devcontainers.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script parses the CI matrix.yaml file and generates a devcontainer.json file for each unique combination of +# CUDA version, compiler name/version, and Ubuntu version. The devcontainer.json files are written to the +# .devcontainer directory to a subdirectory named after the CUDA version and compiler name/version. +# GitHub docs on using multiple devcontainer.json files: +# https://docs.github.com/en/codespaces/setting-up-your-project-for-codespaces/adding-a-dev-container-configuration/introduction-to-dev-containers#devcontainerjson + +# Ensure the script is being executed in its containing directory +cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"; + +# The root devcontainer.json file is used as a template for all other devcontainer.json files +# by replacing the `image:` field with the appropriate image name +base_devcontainer_file="./devcontainer.json" + + +# Read matrix.yaml and convert it to json +matrix_json=$(yq -o json ../ci/matrix.yml) + + +# Get the devcontainer image version and define image tag root +DEVCONTAINER_VERSION=$(echo "$matrix_json" | jq -r '.devcontainer_version') +IMAGE_ROOT="rapidsai/devcontainers:${DEVCONTAINER_VERSION}-cpp-" + +# Get unique combinations of cuda version, compiler name/version, and Ubuntu version +combinations=$(echo "$matrix_json" | jq -c '[.pull_request.nvcc[] | {cuda: .cuda, compiler_name: .compiler.name, compiler_version: .compiler.version, os: .os}] | unique | .[]') + +# For each unique combination +for combination in $combinations; do + cuda_version=$(echo "$combination" | jq -r '.cuda') + 
compiler_name=$(echo "$combination" | jq -r '.compiler_name') + compiler_version=$(echo "$combination" | jq -r '.compiler_version') + os=$(echo "$combination" | jq -r '.os') + + name="cuda$cuda_version-$compiler_name$compiler_version" + mkdir -p "$name" + devcontainer_file="$name/devcontainer.json" + image="$IMAGE_ROOT$compiler_name$compiler_version-cuda$cuda_version-$os" + + # Use the base_devcontainer.json as a template, plug in the CUDA, compiler names, versions, and Ubuntu version, + # and write the output to the new devcontainer.json file + #jq --arg image "$image" --arg name "$name" '. + {image: $image, name: $name}' $base_devcontainer_file > "$devcontainer_file" + jq --arg image "$image" --arg name "$name" '.image = $image | .name = $name | .containerEnv.DEVCONTAINER_NAME = $name' $base_devcontainer_file > "$devcontainer_file" + + echo "Created $devcontainer_file" +done \ No newline at end of file diff --git a/.github/actions/compute-matrix/action.yml b/.github/actions/compute-matrix/action.yml new file mode 100644 index 000000000..fbbe49b54 --- /dev/null +++ b/.github/actions/compute-matrix/action.yml @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Compute Matrix +description: "Compute the matrix for a given matrix type from the specified matrix file" + +inputs: + matrix_query: + description: "The jq query used to specify the desired matrix. e.g., .pull_request.nvcc" + required: true + matrix_file: + description: 'The file containing the matrix' + required: true +outputs: + matrix: + description: 'The requested matrix' + value: ${{ steps.compute-matrix.outputs.MATRIX }} + +runs: + using: "composite" + steps: + - name: Compute matrix + id: compute-matrix + run: | + MATRIX=$(./.github/actions/compute-matrix/compute-matrix.sh ${{inputs.matrix_file}} ${{inputs.matrix_query}} ) + echo "matrix=$MATRIX" | tee -a $GITHUB_OUTPUT + shell: bash -euxo pipefail {0} \ No newline at end of file diff --git a/.github/actions/compute-matrix/compute-matrix.sh b/.github/actions/compute-matrix/compute-matrix.sh new file mode 100755 index 000000000..64a6f5642 --- /dev/null +++ b/.github/actions/compute-matrix/compute-matrix.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +# Check for the correct number of arguments +if [ $# -ne 2 ]; then + echo "Usage: $0 MATRIX_FILE MATRIX_QUERY" + echo "MATRIX_FILE: The path to the matrix file." + echo "MATRIX_QUERY: The jq query used to specify the desired matrix. 
e.g., '.pull-request.nvcc'" + exit 1 +fi + +# Get realpath before changing directory +MATRIX_FILE=$(realpath "$1") +MATRIX_QUERY="$2" + +# Ensure the script is being executed in its containing directory +cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"; + +echo "Input matrix file:" >&2 +cat "$MATRIX_FILE" >&2 +echo "Query: $MATRIX_QUERY" >&2 +echo $(yq -o=json "$MATRIX_FILE" | jq -c -r "$MATRIX_QUERY | map(. as \$o | {std: .std[]} + del(\$o.std))") \ No newline at end of file diff --git a/.github/actions/configure_cccl_sccache/action.yml b/.github/actions/configure_cccl_sccache/action.yml new file mode 100644 index 000000000..458669688 --- /dev/null +++ b/.github/actions/configure_cccl_sccache/action.yml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Set up AWS credentials and environment variables for sccache +description: "Set up AWS credentials and environment variables for sccache" +runs: + using: "composite" + steps: + - name: Get AWS credentials for sccache bucket + uses: aws-actions/configure-aws-credentials@v2 + with: + role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA + aws-region: us-east-2 + role-duration-seconds: 43200 # 12 hours + - name: Set environment variables + run: | + echo "SCCACHE_BUCKET=rapids-sccache-east" >> $GITHUB_ENV + echo "SCCACHE_REGION=us-east-2" >> $GITHUB_ENV + echo "SCCACHE_IDLE_TIMEOUT=32768" >> $GITHUB_ENV + echo "SCCACHE_S3_USE_SSL=true" >> $GITHUB_ENV + echo "SCCACHE_S3_NO_CREDENTIALS=false" >> $GITHUB_ENV + shell: bash \ No newline at end of file diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml new file mode 100644 index 000000000..895ba83ee --- /dev/null +++ b/.github/copy-pr-bot.yaml @@ -0,0 +1,4 @@ +# Configuration file for `copy-pr-bot` GitHub App +# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ + +enabled: true diff --git a/.github/workflows/add_to_project.yml b/.github/workflows/add_to_project.yml deleted file mode 100644 index 72dd4acd2..000000000 --- a/.github/workflows/add_to_project.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: Add new issue/PR to project - -on: - issues: - types: - - opened - - pull_request_target: - types: - - opened - -jobs: - add-to-project: - name: Add issue or PR to project - runs-on: ubuntu-latest - steps: - - name: Generate token - id: generate_token - uses: tibdex/github-app-token@36464acb844fc53b9b8b2401da68844f6b05ebb0 - with: - app_id: ${{ secrets.CCCL_AUTH_APP_ID }} - private_key: ${{ secrets.CCCL_AUTH_APP_PEM }} - - name: Add to Project - env: - TOKEN: ${{ steps.generate_token.outputs.token }} - uses: actions/add-to-project@v0.3.0 - with: - project-url: https://github.com/orgs/NVIDIA/projects/6 - github-token: ${{ env.TOKEN }} diff --git a/.github/workflows/build-and-test.yml 
b/.github/workflows/build-and-test.yml new file mode 100644 index 000000000..6599e9dcb --- /dev/null +++ b/.github/workflows/build-and-test.yml @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: build and test + +defaults: + run: + shell: bash -eo pipefail {0} + +on: + workflow_call: + inputs: + devcontainer_version: {type: string, required: true} + cuda_version: {type: string, required: true} + compiler: {type: string, required: true} + compiler_exe: {type: string, required: true} + compiler_version: {type: string, required: true} + std: {type: string, required: true} + gpu_build_archs: {type: string, required: true} + cpu: {type: string, required: true} + os: {type: string, required: true} + build_script: {type: string, required: false} + test_script: {type: string, required: false} + run_tests: {type: boolean, required: false, default: true} + +jobs: + devcontainer_image: + name: Devcontainer ${{ inputs.os }}/${{ inputs.compiler }}${{ inputs.compiler_version }} + runs-on: ubuntu-latest + outputs: + image_name: ${{ steps.compute-devcontainer-image-name.outputs.name }} + steps: + - name: Compute devcontainer image name + id: compute-devcontainer-image-name + run: | + COMPILER_SEGMENT="" + if [ "${{ inputs.compiler }}" != "cc" ] && [ "${{ inputs.compiler_exe }}" != "c++" ]; then + COMPILER_SEGMENT="${{ 
inputs.compiler }}${{ inputs.compiler_version }}-" + fi + DEVCONTAINER_IMAGE="rapidsai/devcontainers:${{inputs.devcontainer_version}}-cpp-${COMPILER_SEGMENT}cuda${{inputs.cuda_version}}-${{inputs.os}}" + echo "DEVCONTAINER_IMAGE=$DEVCONTAINER_IMAGE" >> $GITHUB_ENV + echo "name=$DEVCONTAINER_IMAGE" >> $GITHUB_OUTPUT + - name: Check if devcontainer image exists + run: | + docker buildx imagetools inspect $DEVCONTAINER_IMAGE > /dev/null + if [ $? -ne 0 ]; then + echo "Error: Docker image $DEVCONTAINER_IMAGE does not exist." + exit 1 + fi + + build: + needs: devcontainer_image + if: inputs.build_script != '' && needs.devcontainer_image.outputs.image_name != '' + name: Build ${{inputs.compiler}}${{inputs.compiler_version}}/C++${{inputs.std}}/SM${{inputs.gpu_build_archs}} + uses: ./.github/workflows/run-as-coder.yml + with: + name: Build ${{inputs.compiler}}${{inputs.compiler_version}}/C++${{inputs.std}}/SM${{inputs.gpu_build_archs}} + runner: linux-${{inputs.cpu}}-cpu16 + image: ${{ needs.devcontainer_image.outputs.image_name }} + command: | + ${{ inputs.build_script }} "${{inputs.compiler_exe}}" "${{inputs.std}}" "${{inputs.gpu_build_archs}}" + + test: + needs: [devcontainer_image, build] + if: ${{ !cancelled() && ( needs.build.result == 'success' || needs.build.result == 'skipped' ) && inputs.test_script != '' && needs.devcontainer_image.outputs.image_name != '' && inputs.run_tests}} + name: Test ${{inputs.compiler}}${{inputs.compiler_version}}/C++${{inputs.std}}/SM${{inputs.gpu_build_archs}} + uses: ./.github/workflows/run-as-coder.yml + with: + name: Test ${{inputs.compiler}}${{inputs.compiler_version}}/C++${{inputs.std}}/SM${{inputs.gpu_build_archs}} + runner: linux-${{inputs.cpu}}-gpu-v100-latest-1 + image: ${{ needs.devcontainer_image.outputs.image_name }} + command: | + nvidia-smi + ${{ inputs.test_script }} "${{inputs.compiler_exe}}" "${{inputs.std}}" "${{inputs.gpu_build_archs}}" \ No newline at end of file diff --git 
a/.github/workflows/dispatch-build-and-test.yml b/.github/workflows/dispatch-build-and-test.yml new file mode 100644 index 000000000..dea71e00e --- /dev/null +++ b/.github/workflows/dispatch-build-and-test.yml @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Dispatch build and test + +on: + workflow_call: + inputs: + per_cuda_compiler_matrix: {type: string, required: true} + build_script: {type: string, required: false} + test_script: {type: string, required: false} + devcontainer_version: {type: string, required: true} + +jobs: + # Using a matrix to dispatch to the build-and-test reusable workflow for each build configuration + # ensures that the build/test steps can overlap across different configurations. For example, + # the build step for CUDA 12.1 + gcc 9.3 can run at the same time as the test step for CUDA 11.0 + clang 11. 
+ build_and_test: + name: ${{matrix.cpu}} + uses: ./.github/workflows/build-and-test.yml + strategy: + fail-fast: false + matrix: + include: ${{ fromJSON(inputs.per_cuda_compiler_matrix) }} + with: + devcontainer_version: ${{ inputs.devcontainer_version }} + cuda_version: ${{ matrix.cuda }} + compiler: ${{ matrix.compiler.name }} + compiler_exe: ${{ matrix.compiler.exe }} + compiler_version: ${{ matrix.compiler.version }} + std: ${{ matrix.std }} + gpu_build_archs: ${{ matrix.gpu_build_archs }} + cpu: ${{ matrix.cpu }} + os: ${{ matrix.os }} + build_script: ${{ inputs.build_script }} + test_script: ${{ inputs.test_script }} + run_tests: ${{ contains(matrix.jobs, 'test') && !contains(github.event.head_commit.message, 'skip-tests') }} diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml new file mode 100644 index 000000000..061b30a99 --- /dev/null +++ b/.github/workflows/pr.yml @@ -0,0 +1,121 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This is the main workflow that runs on every PR and push to main +name: pr + +defaults: + run: + shell: bash -euo pipefail {0} + +on: + push: + branches: + - main + - dev + - "pull-request/[0-9]+" + +# Only runs one instance of this workflow at a time for a given PR and cancels any in-progress runs when a new one starts. 
+concurrency: + group: ${{ github.workflow }}-on-${{ github.event_name }}-from-${{ github.ref_name }} + cancel-in-progress: true + +jobs: + doxygen-check: + name: Doxygen check + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v3 + - name: Install Doxygen + run: | + sudo apt-get update -q + sudo apt-get install -y doxygen + - name: Check Doxygen docs + run: | + ./ci/pre-commit/doxygen.sh + if [ $? -ne 0 ]; then + echo "Doxygen check failed" + exit 1 + fi + shell: bash -euxo pipefail {0} + + get-devcontainer-version: + name: Get devcontainer version + runs-on: ubuntu-latest + outputs: + DEVCONTAINER_VERSION: ${{ steps.set-outputs.outputs.DEVCONTAINER_VERSION }} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + - name: Get devcontainer version + id: set-outputs + run: | + DEVCONTAINER_VERSION=$(yq -o json ci/matrix.yml | jq -r '.devcontainer_version') + echo "DEVCONTAINER_VERSION=$DEVCONTAINER_VERSION" | tee -a "$GITHUB_OUTPUT" + + compute-nvcc-matrix: + name: Compute NVCC matrix + runs-on: ubuntu-latest + outputs: + FULL_MATRIX: ${{ steps.set-outputs.outputs.FULL_MATRIX }} + CUDA_VERSIONS: ${{ steps.set-outputs.outputs.CUDA_VERSIONS }} + HOST_COMPILERS: ${{ steps.set-outputs.outputs.HOST_COMPILERS }} + PER_CUDA_COMPILER_MATRIX: ${{ steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX }} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + - name: Get full nvcc matrix + id: compute-nvcc-matrix + uses: ./.github/actions/compute-matrix + with: + matrix_file: './ci/matrix.yml' + matrix_query: '.pull_request.nvcc' + - name: Set outputs + id: set-outputs + run: | + FULL_MATRIX='${{steps.compute-nvcc-matrix.outputs.matrix}}' + echo "FULL_MATRIX=$FULL_MATRIX" | tee -a "$GITHUB_OUTPUT" + CUDA_VERSIONS=$(echo $FULL_MATRIX | jq -c '[.[] | .cuda] | unique') + echo "CUDA_VERSIONS=$CUDA_VERSIONS" | tee -a "$GITHUB_OUTPUT" + HOST_COMPILERS=$(echo $FULL_MATRIX | jq -c '[.[] | .compiler.name] | unique') + echo 
"HOST_COMPILERS=$HOST_COMPILERS" | tee -a "$GITHUB_OUTPUT" + PER_CUDA_COMPILER_MATRIX=$(echo $FULL_MATRIX | jq -c ' group_by(.cuda + .compiler.name) | map({(.[0].cuda + "-" + .[0].compiler.name): .}) | add') + echo "PER_CUDA_COMPILER_MATRIX=$PER_CUDA_COMPILER_MATRIX" | tee -a "$GITHUB_OUTPUT" + + ci: + name: CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }} + needs: [compute-nvcc-matrix, get-devcontainer-version] + uses: ./.github/workflows/dispatch-build-and-test.yml + strategy: + fail-fast: false + matrix: + cuda_version: ${{ fromJSON(needs.compute-nvcc-matrix.outputs.CUDA_VERSIONS) }} + compiler: ${{ fromJSON(needs.compute-nvcc-matrix.outputs.HOST_COMPILERS) }} + with: + per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-nvcc-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }} + build_script: "./ci/build.sh" + test_script: "./ci/test.sh" + devcontainer_version: ${{ needs.get-devcontainer-version.outputs.DEVCONTAINER_VERSION }} + + # This job is the final job that runs after all other jobs and is used for branch protection status checks. + # See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks + ci-success: + runs-on: ubuntu-latest + name: CI success + needs: + - ci + steps: + - run: echo "CI success" \ No newline at end of file diff --git a/.github/workflows/run-as-coder.yml b/.github/workflows/run-as-coder.yml new file mode 100644 index 000000000..573ef134a --- /dev/null +++ b/.github/workflows/run-as-coder.yml @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Run as coder user + +defaults: + run: + shell: bash -exo pipefail {0} + + +on: + workflow_call: + inputs: + name: {type: string, required: true} + image: {type: string, required: true} + runner: {type: string, required: true} + command: {type: string, required: true} + env: { type: string, required: false, default: "" } + +jobs: + run-as-coder: + name: ${{inputs.name}} + runs-on: ${{inputs.runner}} + container: + options: -u root + image: ${{inputs.image}} + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + permissions: + id-token: write + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + path: cuCollections + persist-credentials: false + - name: Move files to coder user home directory + run: | + cp -R cuCollections /home/coder/cuCollections + chown -R coder:coder /home/coder/ + - name: Configure credentials and environment variables for sccache + uses: ./cuCollections/.github/actions/configure_cccl_sccache + - name: Run command + shell: su coder {0} + run: | + set -exo pipefail + cd ~/cuCollections + eval "${{inputs.command}}" || exit_code=$? + if [ ! -z "$exit_code" ]; then + echo "::error::Error! 
To checkout the corresponding code and reproduce locally, run the following commands:" + echo "git clone --branch $GITHUB_REF_NAME --single-branch --recurse-submodules https://github.com/$GITHUB_REPOSITORY.git && cd $(echo $GITHUB_REPOSITORY | cut -d'/' -f2) && git checkout $GITHUB_SHA" + echo "docker run --rm -it --gpus all --pull=always --volume \$PWD:/repo --workdir /repo ${{ inputs.image }} ${{inputs.command}}" + exit $exit_code + fi diff --git a/.gitignore b/.gitignore index 4146530ed..6ccf378c2 100644 --- a/.gitignore +++ b/.gitignore @@ -8,7 +8,6 @@ __pycache__ *.dylib .cache .vscode -.devcontainer *.code-workspace *.swp *.pytest_cache @@ -140,3 +139,6 @@ ENV/ # clang compile_commands.json + +# figures +*.eps diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e2fe04169..5679bf67f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,7 +20,7 @@ repos: hooks: - id: doxygen-check name: doxygen-check - entry: ./ci/checks/doxygen.sh + entry: ./ci/pre-commit/doxygen.sh files: ^include/ types_or: [file] language: system diff --git a/CMakeLists.txt b/CMakeLists.txt index e1b5055d9..f3ca85a8a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
#============================================================================= -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) endif() include(${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) diff --git a/README.md b/README.md index dc8d4db80..93ac04027 100644 --- a/README.md +++ b/README.md @@ -5,13 +5,13 @@ Doxygen Documentation (TODO) -`cuCollections` (`cuco`) is an open-source, header-only library of GPU-accelerated, concurrent data structures. +`cuCollections` (`cuco`) is an open-source, header-only library of GPU-accelerated, concurrent data structures. -Similar to how [Thrust](https://github.com/thrust/thrust) and [CUB](https://github.com/thrust/cub) provide STL-like, GPU accelerated algorithms and primitives, `cuCollections` provides STL-like concurrent data structures. `cuCollections` is not a one-to-one, drop-in replacement for STL data structures like `std::unordered_map`. Instead, it provides functionally similar data structures tailored for efficient use with GPUs. +Similar to how [Thrust](https://github.com/thrust/thrust) and [CUB](https://github.com/thrust/cub) provide STL-like, GPU accelerated algorithms and primitives, `cuCollections` provides STL-like concurrent data structures. `cuCollections` is not a one-to-one, drop-in replacement for STL data structures like `std::unordered_map`. Instead, it provides functionally similar data structures tailored for efficient use with GPUs. ## Development Status -`cuCollections` is still under heavy development. Users should expect breaking changes and refactoring to be common. +`cuCollections` is still under heavy development. 
Users should expect breaking changes and refactoring to be common. ## Getting cuCollections @@ -21,14 +21,14 @@ Similar to how [Thrust](https://github.com/thrust/thrust) and [CUB](https://gith `cuCollections` is designed to make it easy to include within another CMake project. The `CMakeLists.txt` exports a `cuco` target that can be linked[1](#link-footnote) - into a target to setup include directories, dependencies, and compile flags necessary to use `cuCollections` in your project. + into a target to setup include directories, dependencies, and compile flags necessary to use `cuCollections` in your project. We recommend using [CMake Package Manager (CPM)](https://github.com/TheLartians/CPM.cmake) to fetch `cuCollections` into your project. With CPM, getting `cuCollections` is easy: -``` -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) +```cmake +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(path/to/CPM.cmake) @@ -47,12 +47,12 @@ target_link_libraries(my_library cuco) This will take care of downloading `cuCollections` from GitHub and making the headers available in a location that can be found by CMake. Linking against the `cuco` target will provide everything needed for `cuco` to be used by the `my_library` target. -1: `cuCollections` is header-only and therefore there is no binary component to "link" against. The linking terminology comes from CMake's `target_link_libraries` which is still used even for header-only library targets. +1: `cuCollections` is header-only and therefore there is no binary component to "link" against. The linking terminology comes from CMake's `target_link_libraries` which is still used even for header-only library targets. ## Requirements -- `nvcc 11+` +- `nvcc 11.5+` - C++17 -- Volta+ +- Volta+ - Pascal is partially supported. Any data structures that require blocking algorithms are not supported. 
See [libcu++](https://nvidia.github.io/libcudacxx/setup/requirements.html#device-architectures) documentation for more details. ## Dependencies @@ -67,15 +67,15 @@ No action is required from the user to satisfy these dependencies. `cuCollection ## Building cuCollections -Since `cuCollections` is header-only, there is nothing to build to use it. +Since `cuCollections` is header-only, there is nothing to build to use it. To build the tests, benchmarks, and examples: -``` +```bash cd $CUCO_ROOT mkdir -p build cd build -cmake .. +cmake .. make ``` Binaries will be built into: @@ -179,23 +179,32 @@ class example_class { ## Data Structures -We plan to add many GPU-accelerated, concurrent data structures to `cuCollections`. As of now, the two flagships are variants of hash tables. +We plan to add many GPU-accelerated, concurrent data structures to `cuCollections`. As of now, the two flagships are variants of hash tables. + +### `static_set` + +`cuco::static_set` is a fixed-size container that stores unique elements in no particular order. See the Doxygen documentation in `static_set.cuh` for more detailed information. + +#### Examples: +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/Pzf6vabz1)) +- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/sfG3qKqGv)) ### `static_map` `cuco::static_map` is a fixed-size hash table using open addressing with linear probing. See the Doxygen documentation in `static_map.cuh` for more detailed information. 
#### Examples: -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/ervPzqh64)) -- [Device-view APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/device_view_example.cu) (see [live example in godbolt](https://godbolt.org/z/qMWrfE6ET)) -- [Custom data types, key equality operators and hash functions](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/custom_type_example.cu) (see [live example in godbolt](https://godbolt.org/z/oGfYjzMGT)) +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/T49P85Mnd)) +- [Device-view APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/device_view_example.cu) (see [live example in godbolt](https://godbolt.org/z/dh8bMn3G1)) +- [Custom data types, key equality operators and hash functions](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/custom_type_example.cu) (see [live example in godbolt](https://godbolt.org/z/7djKevK6e)) +- [Key histogram](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/count_by_key_example.cu) (see [live example in godbolt](https://godbolt.org/z/vecGeYM48)) ### `static_multimap` `cuco::static_multimap` is a fixed-size hash table that supports storing equivalent keys. It uses double hashing by default and supports switching to linear probing. See the Doxygen documentation in `static_multimap.cuh` for more detailed information. 
#### Examples: -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_multimap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/Po4eTEn1a)) +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_multimap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/PrbqG6ae4)) ### `dynamic_map` diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index a037dc603..3635336e8 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,20 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
#============================================================================= -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) - -CPMAddPackage( - NAME benchmark - GITHUB_REPOSITORY google/benchmark - VERSION 1.5.2 - OPTIONS - "BENCHMARK_ENABLE_TESTING Off" - # The REGEX feature test fails when gbench's cmake is run under CPM w/ gcc5.4 because it doesn't assume C++11 - # Additionally, attempting to set the CMAKE_CXX_VERSION here doesn't propogate to the feature test build - # Therefore, we just disable the feature test and assume platforms we care about have a regex impl available - "RUN_HAVE_STD_REGEX 0" # - "BENCHMARK_ENABLE_INSTALL OFF" -) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) CPMAddPackage( NAME nvbench @@ -41,65 +28,58 @@ CPMAddPackage( ################################################################################################### ################################################################################################### -function(ConfigureBench BENCH_NAME BENCH_SRC) - add_executable(${BENCH_NAME} "${BENCH_SRC}") - set_target_properties(${BENCH_NAME} PROPERTIES - POSITION_INDEPENDENT_CODE ON - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gbenchmarks") - target_include_directories(${BENCH_NAME} PRIVATE - "${CMAKE_CURRENT_SOURCE_DIR}") - target_compile_options(${BENCH_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra - --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) - target_link_libraries(${BENCH_NAME} PRIVATE - benchmark benchmark_main - pthread - cuco - CUDA::cudart) -endfunction(ConfigureBench) - -################################################################################################### -function(ConfigureNVBench BENCH_NAME) +function(ConfigureBench BENCH_NAME) add_executable(${BENCH_NAME} ${ARGN}) set_target_properties(${BENCH_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/nvbenchmarks") + RUNTIME_OUTPUT_DIRECTORY 
"${CMAKE_BINARY_DIR}/benchmarks") target_include_directories(${BENCH_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") - #"${NVBench_SOURCE_DIR}") - target_compile_options(${BENCH_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr) + target_compile_options(${BENCH_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr -lineinfo) target_link_libraries(${BENCH_NAME} PRIVATE nvbench::main pthread cuco) -endfunction(ConfigureNVBench) +endfunction(ConfigureBench) ################################################################################################### ### benchmark sources ############################################################################# ################################################################################################### ################################################################################################### -# - dynamic_map benchmarks ------------------------------------------------------------------------ -set(DYNAMIC_MAP_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/dynamic_map_bench.cu") -ConfigureBench(DYNAMIC_MAP_BENCH "${DYNAMIC_MAP_BENCH_SRC}") +# - static_set benchmarks ------------------------------------------------------------------------- +ConfigureBench(STATIC_SET_BENCH + hash_table/static_set/contains_bench.cu + hash_table/static_set/find_bench.cu + hash_table/static_set/insert_bench.cu + hash_table/static_set/retrieve_all_bench.cu + hash_table/static_set/size_bench.cu) ################################################################################################### # - static_map benchmarks ------------------------------------------------------------------------- -set(STATIC_MAP_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_map_bench.cu") -ConfigureBench(STATIC_MAP_BENCH "${STATIC_MAP_BENCH_SRC}") +ConfigureBench(STATIC_MAP_BENCH + hash_table/static_map/insert_bench.cu + hash_table/static_map/find_bench.cu + hash_table/static_map/contains_bench.cu + 
hash_table/static_map/erase_bench.cu) ################################################################################################### # - static_multimap benchmarks -------------------------------------------------------------------- -ConfigureNVBench(STATIC_MULTIMAP_BENCH - hash_table/static_multimap/count_bench.cu +ConfigureBench(STATIC_MULTIMAP_BENCH hash_table/static_multimap/insert_bench.cu - hash_table/static_multimap/pair_retrieve_bench.cu + hash_table/static_multimap/retrieve_bench.cu hash_table/static_multimap/query_bench.cu - hash_table/static_multimap/retrieve_bench.cu) + hash_table/static_multimap/count_bench.cu) -ConfigureNVBench(RETRIEVE_BENCH - hash_table/static_multimap/optimal_retrieve_bench.cu) +################################################################################################### +# - dynamic_map benchmarks ------------------------------------------------------------------------ +ConfigureBench(DYNAMIC_MAP_BENCH + hash_table/dynamic_map/insert_bench.cu + hash_table/dynamic_map/find_bench.cu + hash_table/dynamic_map/contains_bench.cu + hash_table/dynamic_map/erase_bench.cu) ################################################################################################### -# - reduce_by_key benchmarks ---------------------------------------------------------------------- -set(RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/reduce_by_key.cu") -ConfigureBench(RBK_BENCH "${RBK_BENCH_SRC}") +# - hash function benchmarks ---------------------------------------------------------------------- +ConfigureBench(HASH_BENCH + hash_bench.cu) diff --git a/benchmarks/defaults.hpp b/benchmarks/defaults.hpp new file mode 100644 index 000000000..22e4f5338 --- /dev/null +++ b/benchmarks/defaults.hpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace cuco::benchmark::defaults { + +using KEY_TYPE_RANGE = nvbench::type_list; +using VALUE_TYPE_RANGE = nvbench::type_list; + +auto constexpr N = 100'000'000; +auto constexpr OCCUPANCY = 0.5; +auto constexpr MULTIPLICITY = 8; +auto constexpr MATCHING_RATE = 0.5; +auto constexpr MAX_NOISE = 3; +auto constexpr SKEW = 0.5; +auto constexpr BATCH_SIZE = 1'000'000; +auto constexpr INITIAL_SIZE = 50'000'000; + +auto const N_RANGE = nvbench::range(10'000'000, 100'000'000, 20'000'000); +auto const N_RANGE_CACHE = + std::vector{8'000, 80'000, 800'000, 8'000'000, 80'000'000}; +auto const OCCUPANCY_RANGE = nvbench::range(0.1, 0.9, 0.1); +auto const MULTIPLICITY_RANGE = std::vector{1, 2, 4, 8, 16}; +auto const MATCHING_RATE_RANGE = nvbench::range(0.1, 1., 0.1); +auto const SKEW_RANGE = nvbench::range(0.1, 1., 0.1); + +} // namespace cuco::benchmark::defaults diff --git a/benchmarks/hash_bench.cu b/benchmarks/hash_bench.cu new file mode 100644 index 000000000..ec35c186e --- /dev/null +++ b/benchmarks/hash_bench.cu @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include + +#include + +#include + +template +struct large_key { + constexpr __host__ __device__ large_key(int32_t seed) noexcept + { +#pragma unroll Words + for (int32_t i = 0; i < Words; ++i) { + data_[i] = seed; + } + } + + private: + int32_t data_[Words]; +}; + +template +__global__ void hash_bench_kernel(Hasher hash, + cuco::detail::index_type n, + OutputIt out, + bool materialize_result) +{ + cuco::detail::index_type const gid = BlockSize * blockIdx.x + threadIdx.x; + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; + cuco::detail::index_type idx = gid; + typename Hasher::result_type agg = 0; + + while (idx < n) { + typename Hasher::argument_type key(idx); + for (int32_t i = 0; i < 100; ++i) { // execute hash func 100 times + agg += hash(key); + } + idx += loop_stride; + } + + if (materialize_result) { out[gid] = agg; } +} + +/** + * @brief A benchmark evaluating performance of various hash functions + */ +template +void hash_eval(nvbench::state& state, nvbench::type_list) +{ + bool const materialize_result = false; + constexpr auto block_size = 128; + auto const num_keys = state.get_int64_or_default("NumInputs", cuco::benchmark::defaults::N * 10); + auto const grid_size = (num_keys + block_size * 16 - 1) / block_size * 16; + + thrust::device_vector hash_values((materialize_result) ? 
num_keys + : 1); + + state.add_element_count(num_keys); + + state.exec([&](nvbench::launch& launch) { + hash_bench_kernel<<>>( + Hash{}, num_keys, hash_values.begin(), materialize_result); + }); +} + +NVBENCH_BENCH_TYPES( + hash_eval, + NVBENCH_TYPE_AXES(nvbench::type_list, + cuco::murmurhash3_32, + cuco::murmurhash3_32>, // 32*4bytes + cuco::xxhash_32, + cuco::xxhash_32, + cuco::xxhash_32>, + cuco::xxhash_64, + cuco::xxhash_64, + cuco::xxhash_64>, + cuco::murmurhash3_fmix_32, + cuco::murmurhash3_fmix_64>)) + .set_name("hash_function_eval") + .set_type_axes_names({"Hash"}) + .set_max_noise(cuco::benchmark::defaults::MAX_NOISE); diff --git a/benchmarks/hash_table/dynamic_map/contains_bench.cu b/benchmarks/hash_table/dynamic_map/contains_bench.cu new file mode 100644 index 000000000..ff349bc53 --- /dev/null +++ b/benchmarks/hash_table/dynamic_map/contains_bench.cu @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::dynamic_map::contains` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_contains( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const initial_size = state.get_int64_or_default("InitSize", defaults::INITIAL_SIZE); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + cuco::dynamic_map map{ + static_cast(initial_size), cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs.begin(), pairs.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + map.contains(keys.begin(), keys.end(), result.begin(), {}, {}, launch.get_stream()); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> dynamic_map_contains( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(dynamic_map_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_contains_unique_num_inputs") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("NumInputs", defaults::N_RANGE); + 
+NVBENCH_BENCH_TYPES(dynamic_map_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_contains_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/dynamic_map/erase_bench.cu b/benchmarks/hash_table/dynamic_map/erase_bench.cu new file mode 100644 index 000000000..96f5ec7ec --- /dev/null +++ b/benchmarks/hash_table/dynamic_map/erase_bench.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::dynamic_map::erase` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_erase( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const initial_size = state.get_int64_or_default("InitSize", defaults::INITIAL_SIZE); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform( + keys.begin(), keys.end(), pairs.begin(), [] __device__(auto i) { return pair_type(i, {}); }); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + // dynamic map with erase support + cuco::dynamic_map map{static_cast(initial_size), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + map.insert(pairs.begin(), pairs.end(), {}, {}, launch.get_stream()); + + timer.start(); + map.erase(keys.begin(), keys.end(), {}, {}, launch.get_stream()); + timer.stop(); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> dynamic_map_erase( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(dynamic_map_erase, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_erase_unique_num_inputs") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + 
.add_int64_axis("NumInputs", defaults::N_RANGE); + +NVBENCH_BENCH_TYPES(dynamic_map_erase, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_erase_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/dynamic_map/find_bench.cu b/benchmarks/hash_table/dynamic_map/find_bench.cu new file mode 100644 index 000000000..b06cfab4e --- /dev/null +++ b/benchmarks/hash_table/dynamic_map/find_bench.cu @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::dynamic_map::find` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_find( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const initial_size = state.get_int64_or_default("InitSize", defaults::INITIAL_SIZE); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + cuco::dynamic_map map{ + static_cast(initial_size), cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs.begin(), pairs.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + map.find(keys.begin(), keys.end(), result.begin(), {}, {}, launch.get_stream()); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> dynamic_map_find( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(dynamic_map_find, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_find_unique_num_inputs") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("NumInputs", defaults::N_RANGE); + +NVBENCH_BENCH_TYPES(dynamic_map_find, + 
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_find_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/dynamic_map/insert_bench.cu b/benchmarks/hash_table/dynamic_map/insert_bench.cu new file mode 100644 index 000000000..8e8cc8a84 --- /dev/null +++ b/benchmarks/hash_table/dynamic_map/insert_bench.cu @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::dynamic_map::insert` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_insert( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const initial_size = state.get_int64_or_default("InitSize", defaults::INITIAL_SIZE); + auto const batch_size = state.get_int64_or_default("BatchSize", defaults::BATCH_SIZE); + + if (num_keys % batch_size) { state.skip("NumInputs must be divisible by BatchSize."); } + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + state.add_element_count(num_keys); + + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + cuco::dynamic_map map{static_cast(initial_size), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + {}, + launch.get_stream()}; + + timer.start(); + for (std::size_t i = 0; i < num_keys; i += batch_size) { + map.insert(pairs.begin() + i, pairs.begin() + i + batch_size, {}, {}, launch.get_stream()); + } + timer.stop(); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> dynamic_map_insert( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(dynamic_map_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_insert_unique_num_inputs") + .set_type_axes_names({"Key", "Value", "Distribution"}) + 
.set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("NumInputs", defaults::N_RANGE); + +NVBENCH_BENCH_TYPES(dynamic_map_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_insert_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); + +NVBENCH_BENCH_TYPES(dynamic_map_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_insert_gaussian_skew") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Skew", defaults::SKEW_RANGE); diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu deleted file mode 100644 index 90446ea57..000000000 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include - -#include - -#include - -#include -#include - -enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; - -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) -{ - auto num_keys = std::distance(output_begin, output_end); - - std::random_device rd; - std::mt19937 gen{rd()}; - - switch (Dist) { - case dist_type::UNIQUE: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = i; - } - break; - case dist_type::UNIFORM: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(gen())); - } - break; - case dist_type::GAUSSIAN: - std::normal_distribution<> dg{1e9, 1e7}; - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(dg(gen))); - } - break; - } -} - -static void gen_final_size(benchmark::internal::Benchmark* b) -{ - for (auto size = 10'000'000; size <= 150'000'000; size += 20'000'000) { - b->Args({size}); - } -} - -template -static void BM_dynamic_insert(::benchmark::State& state) -{ - using map_type = cuco::dynamic_map; - - std::size_t num_keys = state.range(0); - std::size_t initial_size = 1 << 27; - - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector> d_pairs(h_pairs); - - std::size_t batch_size = 1E6; - for (auto _ : state) { - map_type map{ - initial_size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - { - cuda_event_timer raii{state}; - for (std::size_t i = 0; i < num_keys; i += batch_size) { - map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size); - } - } - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -template -static void BM_dynamic_search_all(::benchmark::State& state) -{ - using map_type = 
cuco::dynamic_map; - - std::size_t num_keys = state.range(0); - std::size_t initial_size = 1 << 27; - - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector> d_pairs(h_pairs); - thrust::device_vector d_results(num_keys); - - map_type map{ - initial_size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - map.insert(d_pairs.begin(), d_pairs.end()); - - for (auto _ : state) { - cuda_event_timer raii{state}; - map.find(d_keys.begin(), d_keys.end(), d_results.begin()); - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - 
->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); diff --git a/benchmarks/hash_table/static_map/contains_bench.cu b/benchmarks/hash_table/static_map/contains_bench.cu new file mode 100644 index 000000000..0b5d482a1 --- /dev/null +++ b/benchmarks/hash_table/static_map/contains_bench.cu @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_map::contains` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_contains( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + cuco::static_map map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs.begin(), pairs.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + map.contains(keys.begin(), keys.end(), result.begin(), {}, {}, launch.get_stream()); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_contains( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(static_map_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_contains_unique_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); 
+ +NVBENCH_BENCH_TYPES(static_map_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_contains_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/static_map/erase_bench.cu b/benchmarks/hash_table/static_map/erase_bench.cu new file mode 100644 index 000000000..c6e56eb07 --- /dev/null +++ b/benchmarks/hash_table/static_map/erase_bench.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_map::erase` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_erase( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform( + keys.begin(), keys.end(), pairs.begin(), [] __device__(auto i) { return pair_type(i, {}); }); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + state.add_element_count(num_keys); + + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + // static map with erase support + cuco::static_map map{ + size, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; + map.insert(pairs.begin(), pairs.end(), {}, {}, launch.get_stream()); + + timer.start(); + map.erase(keys.begin(), keys.end(), {}, {}, launch.get_stream()); + timer.stop(); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_erase( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(static_map_erase, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_erase_unique_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + 
.set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_map_erase, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_erase_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/static_map/find_bench.cu b/benchmarks/hash_table/static_map/find_bench.cu new file mode 100644 index 000000000..276a35e0b --- /dev/null +++ b/benchmarks/hash_table/static_map/find_bench.cu @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_map::find` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_find( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + cuco::static_map map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs.begin(), pairs.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + map.find(keys.begin(), keys.end(), result.begin(), {}, {}, launch.get_stream()); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_find( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(static_map_find, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_find_unique_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + 
+NVBENCH_BENCH_TYPES(static_map_find, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_find_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/static_map/insert_bench.cu b/benchmarks/hash_table/static_map/insert_bench.cu new file mode 100644 index 000000000..ef997bef8 --- /dev/null +++ b/benchmarks/hash_table/static_map/insert_bench.cu @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_map::insert` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + state.add_element_count(num_keys); + + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + cuco::static_map map{ + size, cuco::empty_key{-1}, cuco::empty_value{-1}, {}, launch.get_stream()}; + + timer.start(); + map.insert(pairs.begin(), pairs.end(), {}, {}, launch.get_stream()); + timer.stop(); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_insert( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(static_map_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_insert_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); + +NVBENCH_BENCH_TYPES(static_map_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + 
.set_name("static_map_insert_unique_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_map_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_insert_gaussian_skew") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Skew", defaults::SKEW_RANGE); diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu deleted file mode 100644 index e2b15b05e..000000000 --- a/benchmarks/hash_table/static_map_bench.cu +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include - -#include - -#include -#include - -enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; - -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) -{ - auto num_keys = std::distance(output_begin, output_end); - - std::random_device rd; - std::mt19937 gen{rd()}; - - switch (Dist) { - case dist_type::UNIQUE: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = i; - } - break; - case dist_type::UNIFORM: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(gen())); - } - break; - case dist_type::GAUSSIAN: - std::normal_distribution<> dg{1e9, 1e7}; - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(dg(gen))); - } - break; - } -} - -/** - * @brief Generates input sizes and hash table occupancies - * - */ -static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) -{ - for (auto size = 100'000'000; size <= 100'000'000; size *= 10) { - for (auto occupancy = 10; occupancy <= 90; occupancy += 10) { - b->Args({size, occupancy}); - } - } -} - -template -static void BM_static_map_insert(::benchmark::State& state) -{ - using map_type = cuco::static_map; - - std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; - - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector> d_pairs(h_pairs); - thrust::device_vector d_keys(h_keys); - - for (auto _ : state) { - map_type map{size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cudaEventRecord(start); - map.insert(d_pairs.begin(), d_pairs.end()); - cudaEventRecord(stop); - 
cudaEventSynchronize(stop); - - float ms; - cudaEventElapsedTime(&ms, start, stop); - - state.SetIterationTime(ms / 1000); - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -template -static void BM_static_map_search_all(::benchmark::State& state) -{ - using map_type = cuco::static_map; - - std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; - - map_type map{size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - - std::vector h_keys(num_keys); - std::vector h_values(num_keys); - std::vector> h_pairs(num_keys); - std::vector h_results(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector d_results(num_keys); - thrust::device_vector> d_pairs(h_pairs); - - map.insert(d_pairs.begin(), d_pairs.end()); - - for (auto _ : state) { - map.find(d_keys.begin(), d_keys.end(), d_results.begin()); - // TODO: get rid of sync and rewrite the benchmark with `nvbench` - // once https://github.com/NVIDIA/nvbench/pull/80 is merged - cudaDeviceSynchronize(); - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -template -static void BM_static_map_erase_all(::benchmark::State& state) -{ - using map_type = cuco::static_map; - - std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; - - // static map with erase support - map_type map{size, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}, - cuco::sentinel::erased_key{-2}}; - - std::vector h_keys(num_keys); - std::vector h_values(num_keys); - std::vector> h_pairs(num_keys); - 
std::vector h_results(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector d_results(num_keys); - thrust::device_vector> d_pairs(h_pairs); - - for (auto _ : state) { - state.PauseTiming(); - map.insert(d_pairs.begin(), d_pairs.end()); - state.ResumeTiming(); - - map.erase(d_keys.begin(), d_keys.end()); - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - 
-BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); diff --git a/benchmarks/hash_table/static_multimap/count_bench.cu b/benchmarks/hash_table/static_multimap/count_bench.cu index 0659fe742..fa71c8d0c 100644 --- a/benchmarks/hash_table/static_multimap/count_bench.cu +++ b/benchmarks/hash_table/static_multimap/count_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,102 +14,88 @@ * limitations under the License. 
*/ -#include +#include +#include #include +#include #include #include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; /** - * @brief A benchmark evaluating multi-value `count` performance: - * - Total number of insertions: 100'000'000 - * - CG size: 8 + * @brief A benchmark evaluating `cuco::static_multimap::count` performance */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_count( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_count( + nvbench::state& state, nvbench::type_list) { - auto const num_keys = state.get_int64("NumInputs"); - auto const occupancy = state.get_float64("Occupancy"); - auto const matching_rate = state.get_float64("MatchingRate"); + using pair_type = cuco::pair; - std::size_t const size = num_keys / occupancy; + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); + std::size_t const size = num_keys / occupancy; - generate_keys(h_keys.begin(), h_keys.end()); + thrust::device_vector keys(num_keys); - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); - generate_probe_keys(matching_rate, h_keys.begin(), h_keys.end()); + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); - thrust::device_vector d_keys(h_keys); - thrust::device_vector> d_pairs(h_pairs); + gen.dropout(keys.begin(), keys.end(), matching_rate); - 
state.add_element_count(num_keys, "NumKeys"); + state.add_element_count(num_keys); cuco::static_multimap map{ - size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - map.insert(d_pairs.begin(), d_pairs.end()); + size, cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs.begin(), pairs.end()); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto count = map.count(d_keys.begin(), d_keys.end(), launch.get_stream()); + auto count = map.count(keys.begin(), keys.end(), launch.get_stream()); }); } -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_static_multimap_count( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_count( + nvbench::state& state, nvbench::type_list) { state.skip("Key should be the same type as Value."); } -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using d_type = - nvbench::enum_type_list; - -using multiplicity = nvbench::enum_type_list<1, 2, 4, 8, 16, 32, 64, 128, 256>; - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_count, - NVBENCH_TYPE_AXES(key_type, - value_type, - nvbench::enum_type_list, - multiplicity)) - .set_name("staic_multimap_count_uniform_multiplicity") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_count, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_count_occupancy") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. 
- .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", nvbench::range(0.1, 0.9, 0.1)) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_count, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_count_matching_rate") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1}); +NVBENCH_BENCH_TYPES(static_multimap_count, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_count_uniform_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_count, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_count_uniform_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_count, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_count_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); diff --git 
a/benchmarks/hash_table/static_multimap/insert_bench.cu b/benchmarks/hash_table/static_multimap/insert_bench.cu index 17f8723df..aa41044bb 100644 --- a/benchmarks/hash_table/static_multimap/insert_bench.cu +++ b/benchmarks/hash_table/static_multimap/insert_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,87 +14,87 @@ * limitations under the License. */ -#include +#include +#include #include +#include #include #include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; /** - * @brief A benchmark evaluating multi-value `insert` performance: - * - Total number of insertions: 100'000'000 - * - CG size: 8 + * @brief A benchmark evaluating `cuco::static_multimap::insert` performance */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_insert( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_insert( + nvbench::state& state, nvbench::type_list) { - auto const num_keys = state.get_int64("NumInputs"); - auto const occupancy = state.get_float64("Occupancy"); + using pair_type = cuco::pair; - std::size_t const size = num_keys / occupancy; + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); + std::size_t const size = num_keys / occupancy; - generate_keys(h_keys.begin(), h_keys.end()); + thrust::device_vector keys(num_keys); - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } + key_generator gen; + 
gen.generate(dist_from_state(state), keys.begin(), keys.end()); - thrust::device_vector> d_pairs(h_pairs); + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); - state.add_element_count(num_keys, "NumKeys"); + state.add_element_count(num_keys); state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { cuco::static_multimap map{ - size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + size, cuco::empty_key{-1}, cuco::empty_value{-1}, launch.get_stream()}; - // Use timers to explicitly mark the target region timer.start(); - map.insert(d_pairs.begin(), d_pairs.end(), launch.get_stream()); + map.insert(pairs.begin(), pairs.end(), launch.get_stream()); timer.stop(); }); } -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_static_multimap_insert( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_insert( + nvbench::state& state, nvbench::type_list) { state.skip("Key should be the same type as Value."); } -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using d_type = - nvbench::enum_type_list; - -using multiplicity = nvbench::enum_type_list<1, 2, 4, 8, 16, 32, 64, 128, 256>; - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_insert, - NVBENCH_TYPE_AXES(key_type, - value_type, - nvbench::enum_type_list, - multiplicity)) - .set_name("staic_multimap_insert_uniform_multiplicity") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
- .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_insert, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_insert_occupancy") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", nvbench::range(0.1, 0.9, 0.1)); +NVBENCH_BENCH_TYPES(static_multimap_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_insert_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_insert_unique_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_insert_gaussian_skew") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Skew", defaults::SKEW_RANGE); diff --git a/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu b/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu deleted file mode 100644 index a4a202161..000000000 --- a/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 
2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include - -#include - -/** - * @brief Generates input keys by a given number of repetitions per key. - * - */ -template -static void generate_multikeys(OutputIt output_begin, - OutputIt output_end, - size_t const multiplicity) -{ - auto num_keys = std::distance(output_begin, output_end); - - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = (i % (num_keys / multiplicity)) + 1; - } -} - -/** - * @brief A benchmark evaluating multi-value retrieval performance by varing number of repetitions - * per key: - * - 100'000'000 keys are inserted - * - Map occupancy is fixed at 0.4 - * - Number of repetitions per key: 1, ... 
, 128, 256 - * - */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_retrieve( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) -{ - std::size_t const num_keys = state.get_int64("NumInputs"); - auto const occupancy = state.get_float64("Occupancy"); - std::size_t const size = num_keys / occupancy; - std::size_t const multiplicity = state.get_int64("Multiplicity"); - - state.add_element_count(num_keys, "NumKeys"); - state.add_global_memory_writes(num_keys * 2); - - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); - - generate_multikeys(h_keys.begin(), h_keys.end(), multiplicity); - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector> d_pairs(h_pairs); - - cuco::static_multimap, - cuco::double_hashing, - cuco::detail::MurmurHash3_32>> - map{size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - map.insert(d_pairs.begin(), d_pairs.end()); - - auto const output_size = map.count_outer(d_keys.begin(), d_keys.end()); - thrust::device_vector> d_results(output_size); - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.data().get(), launch.get_stream()); - }); -} - -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_retrieve( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) -{ - state.skip("Key should be the same type as Value."); -} - -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using cg_size = nvbench::enum_type_list<1, 2, 4, 8, 16, 32>; -using buffer_size = nvbench::enum_type_list<1, 2, 4, 8, 16>; - -NVBENCH_BENCH_TYPES(nvbench_retrieve, - NVBENCH_TYPE_AXES(key_type, value_type, cg_size, nvbench::enum_type_list<2>)) - .set_type_axes_names({"Key", "Value", "CGSize", "BufferSize"}) - 
.set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.4}) - .add_int64_power_of_two_axis("Multiplicity", nvbench::range(0, 8, 1)); - -NVBENCH_BENCH_TYPES( - nvbench_retrieve, - NVBENCH_TYPE_AXES(key_type, value_type, nvbench::enum_type_list<8>, buffer_size)) - .set_type_axes_names({"Key", "Value", "CGSize", "BufferSize"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.4}) - .add_int64_power_of_two_axis("Multiplicity", nvbench::range(0, 8, 1)); diff --git a/benchmarks/hash_table/static_multimap/pair_retrieve_bench.cu b/benchmarks/hash_table/static_multimap/pair_retrieve_bench.cu deleted file mode 100644 index b341fce76..000000000 --- a/benchmarks/hash_table/static_multimap/pair_retrieve_bench.cu +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include - -#include - -#include -#include -#include -#include -#include -#include - -namespace { -// Custom pair equal -template -struct pair_equal { - __device__ bool operator()(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) const - { - return lhs.first == rhs.first; - } -}; -} // anonymous namespace - -/** - * @brief A benchmark evaluating `pair_retrieve` performance: - * - CG size: 8 - */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_pair_retrieve( - nvbench::state& state, nvbench::type_list>) -{ - auto constexpr matching_rate = 0.5; - auto constexpr occupancy = 0.5; - auto constexpr dist = dist_type::UNIFORM; - - auto const num_input = state.get_int64("NumInputs"); - - std::size_t const size = num_input / occupancy; - - std::vector h_keys(num_input); - std::vector> h_pairs(num_input); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (auto i = 0; i < num_input; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector> d_pairs(h_pairs); - auto const pair_begin = d_pairs.begin(); - - cuco::static_multimap map{ - size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - map.insert(pair_begin, pair_begin + num_input); - - generate_probe_keys(matching_rate, h_keys.begin(), h_keys.end()); - thrust::device_vector d_keys(h_keys); - - thrust::transform( - thrust::device, d_keys.begin(), d_keys.begin() + num_input, pair_begin, [] __device__(Key i) { - return cuco::pair_type{i, i}; - }); - - state.add_element_count(num_input, "NumInputs"); - - auto const output_size = - map.pair_count(pair_begin, pair_begin + num_input, pair_equal{}); - thrust::device_vector> d_results(output_size); - - auto out1_begin = thrust::make_zip_iterator( - thrust::make_tuple(thrust::make_discard_iterator(), thrust::make_discard_iterator())); - auto out2_begin = thrust::make_zip_iterator( - 
thrust::make_tuple(thrust::make_discard_iterator(), thrust::make_discard_iterator())); - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto [out1_end, out2_end] = map.pair_retrieve( - pair_begin, pair_begin + num_input, out1_begin, out2_begin, pair_equal{}); - }); -} - -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_static_multimap_pair_retrieve( - nvbench::state& state, nvbench::type_list>) -{ - state.skip("Key should be the same type as Value."); -} - -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using d_type = - nvbench::enum_type_list; - -using multiplicity = nvbench::enum_type_list<1, 2, 4, 8, 16, 32, 64, 128, 256>; - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_pair_retrieve, - NVBENCH_TYPE_AXES(key_type, value_type, multiplicity)) - .set_name("staic_multimap_pair_retrieve_uniform_multiplicity") - .set_type_axes_names({"Key", "Value", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", - {1'000, - 100'000, - 1'000'000, - 10'000'000, - 100'000'000}); // Total number of key/value pairs: 100'000'000 diff --git a/benchmarks/hash_table/static_multimap/query_bench.cu b/benchmarks/hash_table/static_multimap/query_bench.cu index 91c3ca645..7d6202297 100644 --- a/benchmarks/hash_table/static_multimap/query_bench.cu +++ b/benchmarks/hash_table/static_multimap/query_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,106 +14,89 @@ * limitations under the License. 
*/ -#include +#include +#include #include +#include #include #include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; /** - * @brief A benchmark evaluating multi-value query (`count` + `retrieve`) performance: - * - Total number of insertions: 100'000'000 - * - CG size: 8 + * @brief A benchmark evaluating 'cuco::static_multimap::query' (`count` + `retrieve`) performance */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_query( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_query( + nvbench::state& state, nvbench::type_list) { - auto const num_keys = state.get_int64("NumInputs"); - auto const occupancy = state.get_float64("Occupancy"); - auto const matching_rate = state.get_float64("MatchingRate"); + using pair_type = cuco::pair; - std::size_t const size = num_keys / occupancy; + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); + std::size_t const size = num_keys / occupancy; - generate_keys(h_keys.begin(), h_keys.end()); + thrust::device_vector keys(num_keys); - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); - generate_probe_keys(matching_rate, h_keys.begin(), h_keys.end()); + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); - thrust::device_vector d_keys(h_keys); - thrust::device_vector> d_pairs(h_pairs); + 
gen.dropout(keys.begin(), keys.end(), matching_rate); - state.add_element_count(num_keys, "NumKeys"); + state.add_element_count(num_keys); cuco::static_multimap map{ - size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - map.insert(d_pairs.begin(), d_pairs.end()); - - auto const output_size = map.count_outer(d_keys.begin(), d_keys.end()); - thrust::device_vector> d_results(output_size); + size, cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs.begin(), pairs.end()); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto count = map.count_outer(d_keys.begin(), d_keys.end(), launch.get_stream()); - map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.data().get(), launch.get_stream()); + auto count = map.count_outer(keys.begin(), keys.end(), launch.get_stream()); + map.retrieve_outer(keys.begin(), keys.end(), pairs.begin(), launch.get_stream()); }); } -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_static_multimap_query( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_query( + nvbench::state& state, nvbench::type_list) { state.skip("Key should be the same type as Value."); } -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using d_type = - nvbench::enum_type_list; - -using multiplicity = nvbench::enum_type_list<1, 2, 4, 8, 16, 32, 64, 128, 256>; - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_query, - NVBENCH_TYPE_AXES(key_type, - value_type, - nvbench::enum_type_list, - multiplicity)) - .set_name("staic_multimap_query_uniform_multiplicity") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
- .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_query, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_query_occupancy") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", nvbench::range(0.1, 0.9, 0.1)) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_query, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_query_matching_rate") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
- .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1}); +NVBENCH_BENCH_TYPES(static_multimap_query, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_query_uniform_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_query, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_query_uniform_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_query, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_query_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); diff --git a/benchmarks/hash_table/static_multimap/retrieve_bench.cu b/benchmarks/hash_table/static_multimap/retrieve_bench.cu index d92f3528e..e30fbe547 100644 --- a/benchmarks/hash_table/static_multimap/retrieve_bench.cu +++ b/benchmarks/hash_table/static_multimap/retrieve_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,105 +14,88 @@ * limitations under the License. 
*/ -#include +#include +#include #include +#include #include #include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; /** - * @brief A benchmark evaluating multi-value `retrieve` performance: - * - Total number of insertions: 100'000'000 - * - CG size: 8 + * @brief A benchmark evaluating `cuco::static_multimap::retrieve` performance */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_retrieve( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_retrieve( + nvbench::state& state, nvbench::type_list) { - auto const num_keys = state.get_int64("NumInputs"); - auto const occupancy = state.get_float64("Occupancy"); - auto const matching_rate = state.get_float64("MatchingRate"); + using pair_type = cuco::pair; - std::size_t const size = num_keys / occupancy; + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); + std::size_t const size = num_keys / occupancy; - generate_keys(h_keys.begin(), h_keys.end()); + thrust::device_vector keys(num_keys); - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); - generate_probe_keys(matching_rate, h_keys.begin(), h_keys.end()); + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); - thrust::device_vector d_keys(h_keys); - thrust::device_vector> d_pairs(h_pairs); + gen.dropout(keys.begin(), keys.end(), 
matching_rate); - state.add_element_count(num_keys, "NumKeys"); + state.add_element_count(num_keys); cuco::static_multimap map{ - size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - map.insert(d_pairs.begin(), d_pairs.end()); - - auto const output_size = map.count_outer(d_keys.begin(), d_keys.end()); - thrust::device_vector> d_results(output_size); + size, cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs.begin(), pairs.end()); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.data().get(), launch.get_stream()); + map.retrieve_outer(keys.begin(), keys.end(), pairs.begin(), launch.get_stream()); }); } -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_static_multimap_retrieve( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_retrieve( + nvbench::state& state, nvbench::type_list) { state.skip("Key should be the same type as Value."); } -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using d_type = - nvbench::enum_type_list; - -using multiplicity = nvbench::enum_type_list<1, 2, 4, 8, 16, 32, 64, 128, 256>; - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_retrieve, - NVBENCH_TYPE_AXES(key_type, - value_type, - nvbench::enum_type_list, - multiplicity)) - .set_name("staic_multimap_retrieve_uniform_multiplicity") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
- .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_retrieve, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_retrieve_occupancy") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", nvbench::range(0.1, 0.9, 0.1)) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_retrieve, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_retrieve_matching_rate") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
- .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1}); +NVBENCH_BENCH_TYPES(static_multimap_retrieve, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_retrieve_uniform_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_retrieve, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_retrieve_uniform_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_retrieve, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_retrieve_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); diff --git a/benchmarks/hash_table/static_set/contains_bench.cu b/benchmarks/hash_table/static_set/contains_bench.cu new file mode 100644 index 000000000..35362ed9e --- /dev/null +++ b/benchmarks/hash_table/static_set/contains_bench.cu @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::contains` performance + */ +template +void static_set_contains(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::static_set set{size, cuco::empty_key{-1}}; + set.insert(keys.begin(), keys.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + set.contains(keys.begin(), keys.end(), result.begin(), {launch.get_stream()}); + }); +} + +NVBENCH_BENCH_TYPES(static_set_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_contains_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + 
nvbench::type_list)) + .set_name("static_set_contains_unique_matching_rate") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_set_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_constains_unique_capacity") + .set_type_axes_names({"Key", "Distribution"}) + .add_int64_axis("NumInputs", defaults::N_RANGE_CACHE); diff --git a/benchmarks/hash_table/static_set/find_bench.cu b/benchmarks/hash_table/static_set/find_bench.cu new file mode 100644 index 000000000..e0ab9111c --- /dev/null +++ b/benchmarks/hash_table/static_set/find_bench.cu @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::find` performance + */ +template +void static_set_find(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::static_set set{size, cuco::empty_key{-1}}; + set.insert(keys.begin(), keys.end()); + + // TODO: would crash if not passing nullptr, why? + gen.dropout(keys.begin(), keys.end(), matching_rate, nullptr); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + set.find(keys.begin(), keys.end(), result.begin(), {launch.get_stream()}); + }); +} + +NVBENCH_BENCH_TYPES(static_set_find, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_find_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_find, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_find_unique_matching_rate") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_set_find, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + 
.set_name("static_set_find_unique_capacity") + .set_type_axes_names({"Key", "Distribution"}) + .add_int64_axis("NumInputs", defaults::N_RANGE_CACHE); diff --git a/benchmarks/hash_table/static_set/insert_bench.cu b/benchmarks/hash_table/static_set/insert_bench.cu new file mode 100644 index 000000000..48bc37fa4 --- /dev/null +++ b/benchmarks/hash_table/static_set/insert_bench.cu @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::insert` performance + */ +template +void static_set_insert(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + cuco::experimental::static_set set{ + size, cuco::empty_key{-1}, {}, {}, {}, {launch.get_stream()}}; + + timer.start(); + set.insert(keys.begin(), keys.end(), {launch.get_stream()}); + timer.stop(); + }); +} + +NVBENCH_BENCH_TYPES(static_set_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_insert_uniform_multiplicity") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_insert_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_insert_gaussian_skew") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Skew", defaults::SKEW_RANGE); diff --git 
a/benchmarks/hash_table/static_set/retrieve_all_bench.cu b/benchmarks/hash_table/static_set/retrieve_all_bench.cu new file mode 100644 index 000000000..17ea66384 --- /dev/null +++ b/benchmarks/hash_table/static_set/retrieve_all_bench.cu @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::retrieve_all` performance + */ +template +void static_set_retrieve_all(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::static_set set{size, cuco::empty_key{-1}}; + set.insert(keys.begin(), keys.end()); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto end = set.retrieve_all(result.begin(), {launch.get_stream()}); + }); +} + +NVBENCH_BENCH_TYPES(static_set_retrieve_all, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + 
.set_name("static_set_retrieve_all_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); diff --git a/benchmarks/hash_table/static_set/size_bench.cu b/benchmarks/hash_table/static_set/size_bench.cu new file mode 100644 index 000000000..fbddc3951 --- /dev/null +++ b/benchmarks/hash_table/static_set/size_bench.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::size` performance + */ +template +void static_set_size(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + state.add_element_count(num_keys); + + cuco::experimental::static_set set{size, cuco::empty_key{-1}}; + + set.insert(keys.begin(), keys.end()); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto const size = set.size({launch.get_stream()}); }); +} + +NVBENCH_BENCH_TYPES(static_set_size, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_size_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); diff --git a/benchmarks/key_generator.hpp b/benchmarks/key_generator.hpp deleted file mode 100644 index bd90e6caa..000000000 --- a/benchmarks/key_generator.hpp +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include - -enum class dist_type { GAUSSIAN, GEOMETRIC, UNIFORM }; - -NVBENCH_DECLARE_ENUM_TYPE_STRINGS( - // Enum type: - dist_type, - // Callable to generate input strings: - // Short identifier used for tables, command-line args, etc. - // Used when context is available to figure out the enum type. - [](dist_type d) { - switch (d) { - case dist_type::GAUSSIAN: return "GAUSSIAN"; - case dist_type::GEOMETRIC: return "GEOMETRIC"; - case dist_type::UNIFORM: return "UNIFORM"; - default: return "ERROR"; - } - }, - // Callable to generate descriptions: - // If non-empty, these are used in `--list` to describe values. - // Used when context may not be available to figure out the type from the - // input string. - // Just use `[](auto) { return std::string{}; }` if you don't want these. - [](auto) { return std::string{}; }) - -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) -{ - auto const num_keys = std::distance(output_begin, output_end); - - std::random_device rd; - std::mt19937 gen{rd()}; - - switch (Dist) { - case dist_type::GAUSSIAN: { - auto const mean = static_cast(num_keys / 2); - auto const dev = static_cast(num_keys / 5); - - std::normal_distribution<> distribution{mean, dev}; - - for (auto i = 0; i < num_keys; ++i) { - auto k = distribution(gen); - while (k >= num_keys) { - k = distribution(gen); - } - output_begin[i] = k; - } - break; - } - case dist_type::GEOMETRIC: { - auto const max = std::numeric_limits::max(); - auto const coeff = static_cast(num_keys) / static_cast(max); - // Random sampling in range [0, INT32_MAX] - std::geometric_distribution distribution{1e-9}; - - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = distribution(gen) * coeff; - } - break; - } - case dist_type::UNIFORM: { - std::uniform_int_distribution distribution{1, static_cast(num_keys / 
Multiplicity)}; - - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = distribution(gen); - } - break; - } - } // switch -} - -template -static void generate_probe_keys(double const matching_rate, - OutputIt output_begin, - OutputIt output_end) -{ - auto const num_keys = std::distance(output_begin, output_end); - auto const max = std::numeric_limits::max(); - - std::random_device rd; - std::mt19937 gen{rd()}; - - std::uniform_real_distribution rate_dist(0.0, 1.0); - std::uniform_int_distribution non_match_dist{static_cast(num_keys), max}; - - for (auto i = 0; i < num_keys; ++i) { - auto const tmp_rate = rate_dist(gen); - - if (tmp_rate > matching_rate) { output_begin[i] = non_match_dist(gen); } - } - - std::random_shuffle(output_begin, output_end); -} diff --git a/benchmarks/reduce_by_key/reduce_by_key.cu b/benchmarks/reduce_by_key/reduce_by_key.cu deleted file mode 100644 index 1de05a42f..000000000 --- a/benchmarks/reduce_by_key/reduce_by_key.cu +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/** - * @brief Generates input sizes and number of unique keys - * - */ -static void generate_size_and_num_unique(benchmark::internal::Benchmark* b) -{ - for (auto num_unique = 64; num_unique <= 1 << 20; num_unique <<= 1) { - for (auto size = 10'000'000; size <= 10'000'000; size *= 10) { - b->Args({size, num_unique}); - } - } -} - -template -void thrust_reduce_by_key(KeyRandomIterator keys_begin, - KeyRandomIterator keys_end, - ValueRandomIterator values_begin) -{ - using Key = typename thrust::iterator_traits::value_type; - using Value = typename thrust::iterator_traits::value_type; - - // Exact size of output is unknown (number of unique keys), but upper bounded - // by the number of keys - auto maximum_output_size = thrust::distance(keys_begin, keys_end); - thrust::device_vector output_keys(maximum_output_size); - thrust::device_vector output_values(maximum_output_size); - - thrust::sort_by_key(thrust::device, keys_begin, keys_end, values_begin); - thrust::reduce_by_key( - thrust::device, keys_begin, keys_end, values_begin, output_keys.begin(), output_values.end()); -} - -template -static void BM_thrust(::benchmark::State& state) -{ - auto const num_unique_keys = state.range(1); - for (auto _ : state) { - state.PauseTiming(); - thrust::device_vector keys(state.range(0)); - auto begin = thrust::make_counting_iterator(0); - thrust::transform( - begin, begin + state.range(0), keys.begin(), [num_unique_keys] __device__(auto i) { - return i % num_unique_keys; - }); - - thrust::device_vector values(state.range(0)); - state.ResumeTiming(); - thrust_reduce_by_key(keys.begin(), keys.end(), values.begin()); - cudaDeviceSynchronize(); - } -} -BENCHMARK_TEMPLATE(BM_thrust, int32_t, int32_t) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_num_unique); - -BENCHMARK_TEMPLATE(BM_thrust, int64_t, int64_t) - ->Unit(benchmark::kMillisecond) - 
->Apply(generate_size_and_num_unique); - -// TODO: Hash based reduce by key benchmark diff --git a/benchmarks/synchronization.hpp b/benchmarks/synchronization.hpp deleted file mode 100644 index f0d7807be..000000000 --- a/benchmarks/synchronization.hpp +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -// Google Benchmark library -#include - -#include - -#include - -#define BENCH_CUDA_TRY(call) \ - do { \ - auto const status = (call); \ - if (cudaSuccess != status) { throw std::runtime_error("CUDA error detected."); } \ - } while (0) - -#define BENCH_ASSERT_CUDA_SUCCESS(expr) \ - do { \ - cudaError_t const status = (expr); \ - assert(cudaSuccess == status); \ - } while (0) -/** - * @brief This class serves as a wrapper for using `cudaEvent_t` as the user - * defined timer within the framework of google benchmark - * (https://github.com/google/benchmark). - * - * It is built on top of the idea of Resource acquisition is initialization - * (RAII). In the following we show a minimal example of how to use this class. - * - * \code{cpp} - * #include - * - * static void sample_cuda_benchmark(benchmark::State& state) { - * - * for (auto _ : state){ - * cudaStream_t stream = 0; - * - * // Create (Construct) an object of this class. You HAVE to pass in the - * // benchmark::State object you are using. 
It measures the time from its - * // creation to its destruction that is spent on the specified CUDA stream. - * // It also clears the L2 cache by cudaMemset'ing a device buffer that is of - * // the size of the L2 cache (if flush_l2_cache is set to true and there is - * // an L2 cache on the current device). - * cuda_event_timer raii(state, true, stream); // flush_l2_cache = true - * - * // Now perform the operations that is to be benchmarked - * sample_kernel<<<1, 256, 0, stream>>>(); // Possibly launching a CUDA kernel - * - * } - * } - * - * // Register the function as a benchmark. You will need to set the `UseManualTime()` - * // flag in order to use the timer embeded in this class. - * BENCHMARK(sample_cuda_benchmark)->UseManualTime(); - * \endcode - * - * - */ -class cuda_event_timer { - public: - /** - * @brief Constructs a `cuda_event_timer` beginning a manual timing range. - * - * Optionally flushes L2 cache. - * - * @param[in,out] state This is the benchmark::State whose timer we are going - * to update. - * @param[in] flush_l2_cache_ whether or not to flush the L2 cache before - * every iteration. - * @param[in] stream_ The CUDA stream we are measuring time on. 
- */ - cuda_event_timer(benchmark::State& state, bool flush_l2_cache = false, cudaStream_t stream = 0) - : p_state(&state), stream_(stream) - { - // flush all of L2$ - if (flush_l2_cache) { - int current_device = 0; - BENCH_CUDA_TRY(cudaGetDevice(¤t_device)); - - int l2_cache_bytes = 0; - BENCH_CUDA_TRY( - cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device)); - - if (l2_cache_bytes > 0) { - const int memset_value = 0; - int* l2_cache_buffer = nullptr; - BENCH_CUDA_TRY(cudaMalloc(&l2_cache_buffer, l2_cache_bytes)); - BENCH_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer, memset_value, l2_cache_bytes, stream_)); - BENCH_CUDA_TRY(cudaFree(l2_cache_buffer)); - } - } - - BENCH_CUDA_TRY(cudaEventCreate(&start_)); - BENCH_CUDA_TRY(cudaEventCreate(&stop_)); - BENCH_CUDA_TRY(cudaEventRecord(start_, stream_)); - } - - cuda_event_timer() = delete; - - /** - * @brief Destroy the `cuda_event_timer` and ending the manual time range. - * - */ - ~cuda_event_timer() - { - BENCH_ASSERT_CUDA_SUCCESS(cudaEventRecord(stop_, stream_)); - BENCH_ASSERT_CUDA_SUCCESS(cudaEventSynchronize(stop_)); - float milliseconds = 0.0f; - BENCH_ASSERT_CUDA_SUCCESS(cudaEventElapsedTime(&milliseconds, start_, stop_)); - p_state->SetIterationTime(milliseconds / (1000.0f)); - BENCH_ASSERT_CUDA_SUCCESS(cudaEventDestroy(start_)); - BENCH_ASSERT_CUDA_SUCCESS(cudaEventDestroy(stop_)); - } - - private: - cudaEvent_t start_; - cudaEvent_t stop_; - cudaStream_t stream_; - benchmark::State* p_state; -}; diff --git a/benchmarks/utils.hpp b/benchmarks/utils.hpp new file mode 100644 index 000000000..a8a84a3b6 --- /dev/null +++ b/benchmarks/utils.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +namespace cuco::benchmark { + +template +auto dist_from_state(nvbench::state const& state) +{ + if constexpr (std::is_same_v) { + return Dist{}; + } else if constexpr (std::is_same_v) { + auto const multiplicity = state.get_int64_or_default("Multiplicity", defaults::MULTIPLICITY); + return Dist{multiplicity}; + } else if constexpr (std::is_same_v) { + auto const skew = state.get_float64_or_default("Skew", defaults::SKEW); + return Dist{skew}; + } else { + CUCO_FAIL("Unexpected distribution type"); + } +} + +} // namespace cuco::benchmark + +NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::unique, "UNIQUE", "distribution::unique"); +NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::uniform, + "UNIFORM", + "distribution::uniform"); +NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::gaussian, + "GAUSSIAN", + "distribution::gaussian"); diff --git a/ci/build.sh b/ci/build.sh new file mode 100755 index 000000000..0baeaa68c --- /dev/null +++ b/ci/build.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -eo pipefail + +# Ensure the script is being executed in its containing directory +cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"; + +# Script defaults +CUDA_COMPILER=nvcc + +# Check if the correct number of arguments has been provided +function usage { + echo "Usage: $0 [OPTIONS] " + echo "The PARALLEL_LEVEL environment variable controls the amount of build parallelism. Default is the number of cores." + echo "Example: PARALLEL_LEVEL=8 $0 g++-8 14 \"70\" " + echo "Example: $0 clang++-8 17 \"70;75;80-virtual\" " + echo "Possible options: " + echo " -nvcc: path/to/nvcc" + echo " -v/--verbose: enable shell echo for debugging" + exit 1 +} + +# Check for extra options +# While there are more than 3 arguments, parse switches/options +while [ "$#" -gt 3 ] +do + case "${1}" in + -h) usage ;; + -help) usage ;; + --help) usage ;; + --verbose) VERBOSE=1; shift ;; + -v) VERBOSE=1; shift ;; + -nvcc) CUDA_COMPILER="${2}"; shift 2;; + *) usage ;; + esac +done + +if [ $VERBOSE ]; then + set -x +fi + +if [ "$#" -ne 3 ]; then + echo "Invalid number of arguments" + usage +fi + +# Begin processing unsets after option parsing +set -u + +# Assign command line arguments to variables +readonly HOST_COMPILER=$(which $1) +readonly CXX_STANDARD=$2 + +# Replace spaces, commas and semicolons with semicolons for CMake list +readonly GPU_ARCHS=$(echo $3 | tr ' ,' ';') + +readonly PARALLEL_LEVEL=${PARALLEL_LEVEL:=$(nproc)} +readonly NVCC_VERSION=$($CUDA_COMPILER --version | grep release | awk '{print $6}' | cut -c2-) + +if [ -z ${DEVCONTAINER_NAME+x} ]; then + 
BUILD_DIR=../build/local +else + BUILD_DIR=../build/${DEVCONTAINER_NAME} +fi + +# The most recent build will always be symlinked to cuCollections/build/latest +mkdir -p $BUILD_DIR +rm -f ../build/latest +ln -sf $BUILD_DIR ../build/latest +export BUILD_DIR +echo $BUILD_DIR + +CMAKE_OPTIONS=" + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_STANDARD=${CXX_STANDARD} \ + -DCMAKE_CUDA_STANDARD=${CXX_STANDARD} \ + -DCMAKE_CXX_COMPILER=${HOST_COMPILER} \ + -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} \ + -DCMAKE_CUDA_HOST_COMPILER=${HOST_COMPILER} \ + -DCMAKE_CUDA_ARCHITECTURES=${GPU_ARCHS} \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ +" + +echo "========================================" +echo "Begin build" +echo "pwd=$(pwd)" +echo "NVCC_VERSION=$NVCC_VERSION" +echo "HOST_COMPILER=$HOST_COMPILER" +echo "CXX_STANDARD=$CXX_STANDARD" +echo "GPU_ARCHS=$GPU_ARCHS" +echo "PARALLEL_LEVEL=$PARALLEL_LEVEL" +echo "BUILD_DIR=$BUILD_DIR" +echo "========================================" + +function configure(){ + cmake -S .. -B $BUILD_DIR $CMAKE_OPTIONS +} + +function build(){ + source "./sccache_stats.sh" start + cmake --build $BUILD_DIR --parallel $PARALLEL_LEVEL + echo "Build complete" + source "./sccache_stats.sh" end +} + +configure +build \ No newline at end of file diff --git a/ci/checks/style.sh b/ci/checks/style.sh deleted file mode 100755 index fbbe1d120..000000000 --- a/ci/checks/style.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018-2022, NVIDIA CORPORATION. -############################## -# cuCollections Style Tester # -############################## - -# Ignore errors and set path -set +e -PATH=/conda/bin:$PATH -# LC_ALL=C.UTF-8 -# LANG=C.UTF-8 - -# Activate common conda env -. /opt/conda/etc/profile.d/conda.sh -conda activate rapids - -# Run clang-format and check for a consistent code format -CLANG_FORMAT=`pre-commit run clang-format --all-files 2>&1` -CLANG_FORMAT_RETVAL=$? - -# Run doxygen check -DOXYGEN_CHECK=`ci/checks/doxygen.sh` -DOXYGEN_CHECK_RETVAL=$? 
- -echo -e "$DOXYGEN_CHECK" - -RETVALS=( - $CLANG_FORMAT_RETVAL -) -IFS=$'\n' -RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1` - -exit $RETVAL diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh deleted file mode 100644 index 8ae26bcf4..000000000 --- a/ci/gpu/build.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. -##############################################i### -# cuCollections GPU build and test script for CI # -################################################## -set -e -NUMARGS=$# -ARGS=$* - -# Arg parsing function -function hasArg { - (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") -} - -# Set path and build parallel level -export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH -export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} -export CUDA_REL=${CUDA_VERSION%.*} - -# Set home to the job's workspace -export HOME=$WORKSPACE - -################################################################################ -# SETUP - Check environment -################################################################################ - -gpuci_logger "Check environment" -env - -gpuci_logger "Check GPU usage" -nvidia-smi - -gpuci_logger "Install Dependencies" -. /opt/conda/etc/profile.d/conda.sh -conda create -y -n cuda -c nvidia -c conda-forge "cudatoolkit=${CUDA_VER}" "cmake>=3.18.*" -conda activate cuda - -gpuci_logger "Check versions" -python --version - -gpuci_logger "Check conda environment" -conda info -conda config --show-sources -conda list --show-channel-urls - -################################################################################ -# BUILD - Build from Source -################################################################################ - -gpuci_logger "Build Tests/Examples" -cd ${WORKSPACE} -mkdir -p build -cd build -cmake .. 
-make - -################################################################################ -# TEST - Run Tests -################################################################################ - -if hasArg --skip-tests; then - gpuci_logger "Skipping Tests" -else - gpuci_logger "Check GPU usage" - nvidia-smi - cd ${WORKSPACE}/build/tests - ctest . - - # This block may provide more verbose testing output since each test is ran individually - #cd ${WORKSPACE}/build/tests - #for gt in "$WORKSPACE/build/tests"* ; do - # test_name=$(basename ${gt}) - # echo "Running $test_name" - # ${gt} - #done -fi diff --git a/ci/matrix.yml b/ci/matrix.yml new file mode 100644 index 000000000..5916dd113 --- /dev/null +++ b/ci/matrix.yml @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +cuda_oldest: &cuda_oldest '11.8' +cuda_newest: &cuda_newest '12.2' + +# The GPUs to test on +# Note: This assumes that the appropriate gpu_build_archs are set to include building for the GPUs listed here +gpus: + - 'v100' + +# The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers +devcontainer_version: '23.08' + +# Each environment below will generate a unique build/test job +# See the "compute-matrix" job in the workflow for how this is parsed and used +# cuda: The CUDA Toolkit version +# os: The operating system used +# cpu: The CPU architecture +# compiler: The compiler to use +# name: The compiler name +# version: The compiler version +# exe: The unverionsed compiler binary name +# To use the system's default compiler set "exe: 'c++'" or "name: 'cc'" +# gpu_build_archs: The GPU architectures to build for (comma-separated list) +# std: The C++ standards to build for +# This field is unique as it will generate an independent build/test job for each value + +# Configurations that will run for every PR +pull_request: + nvcc: + # There is currently only one CUDA 11.8 image available which comes with the system's default C++ compiler. For ubuntu22.04, we know that the default CC is gcc11.3 + - {cuda: *cuda_oldest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'gcc', version: '11', exe: 'c++'}, gpu_build_archs: '60', std: [17], jobs: ['build', 'test']} + - {cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'gcc', version: '12', exe: 'g++'}, gpu_build_archs: '70', std: [17], jobs: ['build', 'test']} \ No newline at end of file diff --git a/ci/checks/doxygen.sh b/ci/pre-commit/doxygen.sh similarity index 59% rename from ci/checks/doxygen.sh rename to ci/pre-commit/doxygen.sh index b9a243cd1..8f387c6ea 100755 --- a/ci/checks/doxygen.sh +++ b/ci/pre-commit/doxygen.sh @@ -1,8 +1,18 @@ #!/bin/bash -# Copyright (c) 2022, NVIDIA CORPORATION. 
-######################################## -# cuCollections doxygen warnings check # -######################################## +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # skip if doxygen is not installed if ! [ -x "$(command -v doxygen)" ]; then @@ -16,9 +26,9 @@ function version { echo "$@" | awk -F. '{ printf("%d%03d%03d%03d\n", $1,$2,$3,$4 # Doxygen supported version 1.8.20 to 1.9.1 DOXYGEN_VERSION=$(doxygen --version) if [ $(version "$DOXYGEN_VERSION") -lt $(version "1.8.20") ] || [ $(version $DOXYGEN_VERSION) -gt $(version "1.9.1") ]; then - echo -e "Warning: Unsupported Doxygen version $DOXYGEN_VERSION" - echo -e "Expecting Doxygen version from 1.8.20 to 1.9.1" - exit 0 + echo -e "Warning: Unsupported Doxygen version $DOXYGEN_VERSION" + echo -e "Expecting Doxygen version from 1.8.20 to 1.9.1" + exit 0 fi # Run doxygen, ignore missing tag files error diff --git a/ci/sccache_hit_rate.sh b/ci/sccache_hit_rate.sh new file mode 100755 index 000000000..8b6d2d3f5 --- /dev/null +++ b/ci/sccache_hit_rate.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +# Ensure two arguments are provided +if [ $# -ne 2 ]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +# Print the contents of the before file +echo "=== Contents of $1 ===" >&2 +cat $1 >&2 +echo "=== End of $1 ===" >&2 + +# Print the contents of the after file +echo "=== Contents of $2 ===" >&2 +cat $2 >&2 +echo "=== End of $2 ===" >&2 + +# Extract compile requests and cache hits from the before and after files +requests_before=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$1") +hits_before=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$1") +requests_after=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$2") +hits_after=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$2") + +# Calculate the differences to find out how many new requests and hits +requests_diff=$((requests_after - requests_before)) +hits_diff=$((hits_after - hits_before)) + +echo "New Compile Requests: $requests_diff" >&2 +echo "New Hits: $hits_diff" >&2 + +# Calculate and print the hit rate +if [ $requests_diff -eq 0 ]; then + echo "No new compile requests, hit rate is not applicable" +else + hit_rate=$(awk -v hits=$hits_diff -v requests=$requests_diff 'BEGIN {printf "%.2f", hits/requests * 100}') + echo "sccache hit rate: $hit_rate%" >&2 + echo "$hit_rate" +fi \ No newline at end of file diff --git a/ci/sccache_stats.sh b/ci/sccache_stats.sh new file mode 100755 index 000000000..a834347cb --- /dev/null +++ b/ci/sccache_stats.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script prints the sccache hit rate between two calls to sccache --show-stats. +# It should be sourced in your script before and after the operations you want to profile, +# with the 'start' or 'end' argument respectively. + +mode=$1 + +if [[ "$mode" != "start" && "$mode" != "end" ]]; then + echo "Invalid mode: $mode" + echo "Usage: $0 {start|end}" + exit 1 +fi + +case $mode in + start) + export SCCACHE_START_HITS=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}') + export SCCACHE_START_MISSES=$(sccache --show-stats | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}') + ;; + end) + if [[ -z ${SCCACHE_START_HITS+x} || -z ${SCCACHE_START_MISSES+x} ]]; then + echo "Error: start stats not collected. Did you call this script with 'start' before your operations?" 
+ exit 1 + fi + + final_hits=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}') + final_misses=$(sccache --show-stats | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}') + hits=$((final_hits - SCCACHE_START_HITS)) + misses=$((final_misses - SCCACHE_START_MISSES)) + total=$((hits + misses)) + + prefix="" + if [ ${GITHUB_ACTIONS:-false} = "true" ]; then + prefix="::notice::" + fi + + if (( total > 0 )); then + hit_rate=$(awk -v hits="$hits" -v total="$total" 'BEGIN { printf "%.2f", (hits / total) * 100 }') + echo ${prefix}"sccache hits: $hits | misses: $misses | hit rate: $hit_rate%" + else + echo ${prefix}"sccache stats: N/A No new compilation requests" + fi + unset SCCACHE_START_HITS + unset SCCACHE_START_MISSES + ;; +esac \ No newline at end of file diff --git a/ci/test.sh b/ci/test.sh new file mode 100755 index 000000000..cfcce2acd --- /dev/null +++ b/ci/test.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Ensure the script is being executed in its containing directory +cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"; + +source ./build.sh "$@" + +ctest --test-dir ${BUILD_DIR}/tests --output-on-failure --timeout 60 + +echo "Test complete" \ No newline at end of file diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 0a83a3cb1..91e1417aa 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. #============================================================================= -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) ################################################################################################### # - compiler function ----------------------------------------------------------------------------- @@ -33,7 +33,11 @@ endfunction(ConfigureExample) ### Example sources ############################################################################### ################################################################################################### +ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu") +ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu") +ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu") ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu") 
ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu") ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") +ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu") ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu") diff --git a/examples/static_map/count_by_key_example.cu b/examples/static_map/count_by_key_example.cu new file mode 100644 index 000000000..4c8cfdb11 --- /dev/null +++ b/examples/static_map/count_by_key_example.cu @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include + +/** + * @file count_by_key_example.cu + * @brief Demonstrates usage of the device side APIs for individual operations like insert/find in + * the context of a count-by-key operation, i.e. for a histogram over keys. + * + * Individual operations like a single insert or find can be performed in device code via the + * static_map "device_view" types. + * + * @note This example is for demonstration purposes only. It is not intended to show the most + * performant way to do the example algorithm. 
+ * + */ + +/** + * @brief Inserts keys and counts how often they occur in the input sequence. + * + * @tparam BlockSize CUDA block size + * @tparam Map Type of the map returned from static_map::get_device_mutable_view + * @tparam KeyIter Input iterator whose value_type convertible to Map::key_type + * @tparam UniqueIter Output iterator whose value_type is convertible to uint64_t + * + * @param[in] map_view View of the map into which inserts will be performed + * @param[in] key_begin The beginning of the range of keys to insert + * @param[in] num_keys The total number of keys and values + * @param[out] num_unique_keys The total number of distinct keys inserted + */ +template +__global__ void count_by_key(Map map_view, + KeyIter keys, + uint64_t num_keys, + UniqueIter num_unique_keys) +{ + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + int64_t const loop_stride = gridDim.x * BlockSize; + int64_t idx = BlockSize * blockIdx.x + threadIdx.x; + + uint64_t thread_unique_keys = 0; + while (idx < num_keys) { + // insert key into the map with a count of 1 + auto [slot, is_new_key] = map_view.insert_and_find({keys[idx], 1}); + if (is_new_key) { + // first occurrence of the key + thread_unique_keys++; + } else { + // key is already in the map -> increment count + slot->second.fetch_add(1, cuda::memory_order_relaxed); + } + idx += loop_stride; + } + + // compute number of successfully inserted new keys for each block + // and atomically add to the grand total + uint64_t block_unique_keys = BlockReduce(temp_storage).Sum(thread_unique_keys); + if (threadIdx.x == 0) { + cuda::atomic_ref grid_unique_keys( + *thrust::raw_pointer_cast(num_unique_keys)); + grid_unique_keys.fetch_add(block_unique_keys, cuda::memory_order_relaxed); + } +} + +int main(void) +{ + // Note that if (sizeof(Key)+sizeof(Count))>8 then the minimum required CUDA architecture is sm_70 + using Key = uint32_t; + using Count = uint32_t; + + // Empty slots are 
represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + Key constexpr empty_key_sentinel = static_cast(-1); + Count constexpr empty_value_sentinel = static_cast(-1); + + // Number of keys to be inserted + auto constexpr num_keys = 50'000; + // How often each distinct key occurs in the example input + auto constexpr key_duplicates = 5; + static_assert((num_keys % key_duplicates) == 0, + "For this example, num_keys must be divisible by key_duplicates in order to pass " + "the unit test."); + + thrust::device_vector insert_keys(num_keys); + // Create a sequence of keys. Eeach distinct key has key_duplicates many matches. + thrust::transform( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(insert_keys.size()), + insert_keys.begin(), + [] __device__(auto i) { return static_cast(i % (num_keys / key_duplicates)); }); + + // Allocate storage for count of number of unique keys + thrust::device_vector num_unique_keys(1); + + // Compute capacity based on a 50% load factor + auto constexpr load_factor = 0.5; + + // If the number of unique keys is known in advance, we can use it to calculate the map capacity + std::size_t const capacity = std::ceil((num_keys / key_duplicates) / load_factor); + // If we can't give an estimated upper bound on the number of unique keys + // we conservatively assume each key in the input is distinct + // std::size_t const capacity = std::ceil(num_keys / load_factor); + + // Constructs a map with "capacity" slots. 
+ cuco::static_map map{ + capacity, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; + + // Get a non-owning, mutable view of the map that allows inserts to pass by value into the kernel + auto device_insert_view = map.get_device_mutable_view(); + + auto constexpr block_size = 256; + auto const grid_size = (num_keys + block_size - 1) / block_size; + count_by_key<<>>( + device_insert_view, insert_keys.begin(), num_keys, num_unique_keys.data()); + + // Retrieve contents of all the non-empty slots in the map + thrust::device_vector result_keys(num_unique_keys[0]); + thrust::device_vector result_counts(num_unique_keys[0]); + map.retrieve_all(result_keys.begin(), result_counts.begin()); + + // Check if the number of result keys is correct + auto num_keys_check = num_unique_keys[0] == (num_keys / key_duplicates); + + // Iterate over all result counts and verify that they are correct + auto counts_check = thrust::all_of( + result_counts.begin(), result_counts.end(), [] __host__ __device__(Count const count) { + return count == key_duplicates; + }); + + if (num_keys_check and counts_check) { std::cout << "Success!\n"; } + + return 0; +} diff --git a/examples/static_map/custom_type_example.cu b/examples/static_map/custom_type_example.cu index efc04e0c8..e150a858e 100644 --- a/examples/static_map/custom_type_example.cu +++ b/examples/static_map/custom_type_example.cu @@ -93,9 +93,7 @@ int main(void) // Construct a map with 100,000 slots using the given empty key/value sentinels. Note the // capacity is chosen knowing we will insert 80,000 keys, for an load factor of 80%. 
cuco::static_map map{ - 100'000, - cuco::sentinel::empty_key{empty_key_sentinel}, - cuco::sentinel::empty_value{empty_value_sentinel}}; + 100'000, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; // Inserts 80,000 pairs into the map by using the custom hasher and custom equality callable map.insert(pairs_begin, pairs_begin + num_pairs, custom_hash{}, custom_key_equals{}); diff --git a/examples/static_map/device_view_example.cu b/examples/static_map/device_view_example.cu index a65e12162..f3414e3ff 100644 --- a/examples/static_map/device_view_example.cu +++ b/examples/static_map/device_view_example.cu @@ -135,9 +135,8 @@ int main(void) std::size_t const capacity = std::ceil(num_keys / load_factor); // Constructs a map with "capacity" slots using -1 and -1 as the empty key/value sentinels. - cuco::static_map map{capacity, - cuco::sentinel::empty_key{empty_key_sentinel}, - cuco::sentinel::empty_value{empty_value_sentinel}}; + cuco::static_map map{ + capacity, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; // Get a non-owning, mutable view of the map that allows inserts to pass by value into the kernel auto device_insert_view = map.get_device_mutable_view(); diff --git a/examples/static_map/host_bulk_example.cu b/examples/static_map/host_bulk_example.cu index d682442fb..746857511 100644 --- a/examples/static_map/host_bulk_example.cu +++ b/examples/static_map/host_bulk_example.cu @@ -54,9 +54,8 @@ int main(void) std::size_t const capacity = std::ceil(num_keys / load_factor); // Constructs a map with "capacity" slots using -1 and -1 as the empty key/value sentinels. - cuco::static_map map{capacity, - cuco::sentinel::empty_key{empty_key_sentinel}, - cuco::sentinel::empty_value{empty_value_sentinel}}; + cuco::static_map map{ + capacity, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; // Create a sequence of keys and values {{0,0}, {1,1}, ... 
{i,i}} thrust::device_vector insert_keys(num_keys); diff --git a/examples/static_multimap/host_bulk_example.cu b/examples/static_multimap/host_bulk_example.cu index 149abd112..d1fe5589a 100644 --- a/examples/static_multimap/host_bulk_example.cu +++ b/examples/static_multimap/host_bulk_example.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,9 +38,7 @@ int main(void) // sentinels. Note the capacity is chosen knowing we will insert 50,000 keys, // for an load factor of 50%. cuco::static_multimap map{ - N * 2, - cuco::sentinel::empty_key{empty_key_sentinel}, - cuco::sentinel::empty_value{empty_value_sentinel}}; + N * 2, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; thrust::device_vector> pairs(N); @@ -62,13 +60,12 @@ int main(void) // The `_outer` suffix indicates that the occurrence of a non-match is 1. 
auto const output_size = map.count_outer(keys_to_find.begin(), keys_to_find.end()); - thrust::device_vector> d_results(output_size); + thrust::device_vector> d_results(output_size); // Finds all keys {0, 1, 2, ...} and stores associated key/value pairs into `d_results` // If a key `keys_to_find[i]` doesn't exist, `d_results[i].second == empty_value_sentinel` - auto output_end = - map.retrieve_outer(keys_to_find.begin(), keys_to_find.end(), d_results.data().get()); - auto retrieve_size = output_end - d_results.data().get(); + auto output_end = map.retrieve_outer(keys_to_find.begin(), keys_to_find.end(), d_results.begin()); + auto retrieve_size = output_end - d_results.begin(); // The total number of outer matches should be `N + N / 2` assert(not(output_size == retrieve_size == N + N / 2)); diff --git a/examples/static_set/device_ref_example.cu b/examples/static_set/device_ref_example.cu new file mode 100644 index 000000000..52e41cf45 --- /dev/null +++ b/examples/static_set/device_ref_example.cu @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include + +#include +#include + +/** + * @file device_reference_example.cu + * @brief Demonstrates usage of the static_set device-side APIs. + * + * static_set provides a non-owning reference which can be used to interact with + * the container from within device code. 
+ */ + +// insert a set of keys into a hash set using one cooperative group for each task +template +__global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::size_t n) +{ + namespace cg = cooperative_groups; + + constexpr auto cg_size = SetRef::cg_size; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + + int64_t const loop_stride = gridDim.x * blockDim.x / cg_size; + int64_t idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; + + while (idx < n) { + set.insert(tile, *(keys + idx)); + idx += loop_stride; + } +} + +template +__global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, OutputIterator found) +{ + int64_t const loop_stride = gridDim.x * blockDim.x; + int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; + + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + + while (idx < n) { + found[idx] = set.contains(tile, *(keys + idx)); + idx += loop_stride; + } +} + +int main(void) +{ + using Key = int; + + // Empty slots are represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + Key constexpr empty_key_sentinel = -1; + + // Number of keys to be inserted + std::size_t constexpr num_keys = 50'000; + + // Compute capacity based on a 50% load factor + auto constexpr load_factor = 0.5; + std::size_t const capacity = std::ceil(num_keys / load_factor); + + using set_type = cuco::experimental::static_set; + + // Constructs a hash set with at least "capacity" slots using -1 as the empty key sentinel. + set_type set{capacity, cuco::empty_key{empty_key_sentinel}}; + + // Create a sequence of keys {0, 1, 2, .., i} + thrust::device_vector keys(num_keys); + thrust::sequence(keys.begin(), keys.end(), 0); + + // Insert the first half of the keys into the set + set.insert(keys.begin(), keys.begin() + num_keys / 2); + + // Insert the second half of keys using a custom CUDA kernel. 
+ custom_cooperative_insert<<<128, 128>>>( + set.ref(cuco::experimental::insert), keys.begin() + num_keys / 2, num_keys / 2); + + // Storage for result + thrust::device_vector found(num_keys); + + // Check if all keys are now contained in the set. Note that we pass a reference that already has + // the `contains` operator. + // In general, using two or more reference objects to the same container but with + // a different set of operators concurrently is undefined behavior. + // This does not apply here since the two kernels do not overlap. + custom_contains<<<128, 128>>>( + set.ref(cuco::experimental::contains), keys.begin(), num_keys, found.begin()); + + // Verify that all keys have been found + bool const all_keys_found = thrust::all_of(found.begin(), found.end(), thrust::identity()); + + if (all_keys_found) { std::cout << "Success! Found all keys.\n"; } + + return 0; +} diff --git a/examples/static_set/device_subsets_example.cu b/examples/static_set/device_subsets_example.cu new file mode 100644 index 000000000..827342f95 --- /dev/null +++ b/examples/static_set/device_subsets_example.cu @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +/** + * @file device_subsets_example.cu + * @brief Demonstrates how to use one bulk set storage to create multiple subsets and perform + * individual operations via device-side ref APIs. + * + * To optimize memory usage, especially when dealing with expensive data allocation and multiple + * hashsets, a practical solution involves employing a single bulk storage for generating subsets. + * This eliminates the need for separate memory allocation and deallocation for each container. This + * can be achieved by using the lightweight non-owning ref type. + * + * @note This example is for demonstration purposes only. It is not intended to show the most + * performant way to do the example algorithm. + */ + +auto constexpr cg_size = 8; ///< A CUDA Cooperative Group of 8 threads to handle each subset +auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread +auto constexpr N = 10; ///< Number of elements to insert and query + +using key_type = int; ///< Key type +using probing_scheme_type = cuco::experimental::linear_probing< + cg_size, + cuco::default_hash_function>; ///< Type controls CG granularity and probing scheme + ///< (linear probing v.s. double hashing) +/// Type of bulk allocation storage +using storage_type = cuco::experimental::aow_storage; +/// Lightweight non-owning storage ref type +using storage_ref_type = typename storage_type::ref_type; +using ref_type = cuco::experimental::static_set_ref, + probing_scheme_type, + storage_ref_type>; ///< Set ref type + +/// Sample data to insert and query +__device__ constexpr std::array data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19}; +/// Empty slots are represented by reserved "sentinel" values. These values should be selected such +/// that they never occur in your input data. 
+key_type constexpr empty_key_sentinel = -1; + +/** + * @brief Inserts sample data into subsets by using cooperative group + * + * Each Cooperative Group creates its own subset and inserts `N` sample data. + * + * @param set_refs Pointer to the array of subset objects + */ +__global__ void insert(ref_type* set_refs) +{ + namespace cg = cooperative_groups; + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + // Get subset (or CG) index + auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; + + auto raw_set_ref = *(set_refs + idx); + auto insert_set_ref = std::move(raw_set_ref).with(cuco::experimental::insert); + + // Insert `N` elemtns into the set with CG insert + for (int i = 0; i < N; i++) { + insert_set_ref.insert(tile, data[i]); + } +} + +/** + * @brief All inserted data can be found + * + * Each Cooperative Group reconstructs its own subset ref based on the storage parameters and + * verifies all inserted data can be found. + * + * @param set_refs Pointer to the array of subset objects + */ +__global__ void find(ref_type* set_refs) +{ + namespace cg = cooperative_groups; + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; + + auto raw_set_ref = *(set_refs + idx); + auto find_set_ref = std::move(raw_set_ref).with(cuco::experimental::find); + + // Result denoting if any of the inserted data is not found + __shared__ int result; + if (threadIdx.x == 0) { result = 0; } + __syncthreads(); + + for (int i = 0; i < N; i++) { + // Query the set with inserted data + auto const found = find_set_ref.find(tile, data[i]); + // Record if the inserted data has been found + atomicOr(&result, *found != data[i]); + } + __syncthreads(); + + if (threadIdx.x == 0) { + // If the result is still 0, all inserted data are found. + if (result == 0) { printf("Success! 
Found all inserted elements.\n"); } + } +} + +int main() +{ + // Number of subsets to be created + auto constexpr num = 16; + // Each subset may have a different requested size + auto constexpr subset_sizes = + std::array{20, 20, 20, 20, 30, 30, 30, 30, 40, 40, 40, 40, 50, 50, 50, 50}; + + auto valid_sizes = std::vector(); + valid_sizes.reserve(num); + + for (size_t i = 0; i < num; ++i) { + valid_sizes.emplace_back( + static_cast(cuco::experimental::make_window_extent(subset_sizes[i]))); + } + + std::vector offsets(num + 1, 0); + + // prefix sum to compute offsets and total number of windows + std::size_t current_sum = 0; + for (std::size_t i = 0; i < valid_sizes.size(); ++i) { + current_sum += valid_sizes[i]; + offsets[i + 1] = current_sum; + } + + // total number of windows is located at the back of the offsets array + auto const total_num_windows = offsets.back(); + + // Create a single bulk storage used by all subsets + auto set_storage = storage_type{total_num_windows}; + // Initializes the storage with the given sentinel + set_storage.initialize(empty_key_sentinel); + + std::vector set_refs; + + // create subsets + for (std::size_t i = 0; i < num; ++i) { + storage_ref_type storage_ref{valid_sizes[i], set_storage.data() + offsets[i]}; + set_refs.emplace_back( + ref_type{cuco::empty_key{empty_key_sentinel}, {}, {}, storage_ref}); + } + + thrust::device_vector d_set_refs(set_refs); + + // Insert sample data + insert<<<1, 128>>>(d_set_refs.data().get()); + // Find all inserted data + find<<<1, 128>>>(d_set_refs.data().get()); + + return 0; +} diff --git a/examples/static_set/host_bulk_example.cu b/examples/static_set/host_bulk_example.cu new file mode 100644 index 000000000..3b8c4deb4 --- /dev/null +++ b/examples/static_set/host_bulk_example.cu @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include +#include + +/** + * @file host_bulk_example.cu + * @brief Demonstrates usage of the static_set "bulk" host APIs. + * + * The bulk APIs are only invocable from the host and are used for doing operations like `insert` or + * `contains` on a set of keys. + * + */ +int main(void) +{ + using Key = int; + + // Empty slots are represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + Key constexpr empty_key_sentinel = -1; + + // Number of keys to be inserted + std::size_t constexpr num_keys = 50'000; + + // Compute capacity based on a 50% load factor + auto constexpr load_factor = 0.5; + std::size_t const capacity = std::ceil(num_keys / load_factor); + + // Constructs a set with at least `capacity` slots using -1 as the empty keys sentinel. + cuco::experimental::static_set set{capacity, cuco::empty_key{empty_key_sentinel}}; + + // Create a sequence of keys {0, 1, 2, .., i} + thrust::device_vector keys(num_keys); + thrust::sequence(keys.begin(), keys.end(), 0); + + // Inserts all keys into the hash set + set.insert(keys.begin(), keys.end()); + + // Storage for result + thrust::device_vector found(num_keys); + + // Check if all keys are contained in the set + set.contains(keys.begin(), keys.end(), found.begin()); + + // Verify that all keys have been found + bool const all_keys_found = thrust::all_of(found.begin(), found.end(), thrust::identity()); + + if (all_keys_found) { std::cout << "Success! 
Found all keys.\n"; } + + return 0; +} diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh new file mode 100644 index 000000000..479246fac --- /dev/null +++ b/include/cuco/aow_storage.cuh @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace cuco { +namespace experimental { + +/// Window type alias +template +using window = detail::window; + +/// forward declaration +template +class aow_storage_ref; + +/** + * @brief Array of Window open addressing storage class. 
+ * + * @tparam T Slot type + * @tparam WindowSize Number of slots in each window + * @tparam Extent Type of extent denoting number of windows + * @tparam Allocator Type of allocator used for device storage (de)allocation + */ +template , + typename Allocator = cuco::cuda_allocator>> +class aow_storage : public detail::aow_storage_base { + public: + using base_type = detail::aow_storage_base; ///< AoW base class type + + using base_type::window_size; ///< Number of elements processed per window + + using extent_type = typename base_type::extent_type; ///< Storage extent type + using size_type = typename base_type::size_type; ///< Storage size type + using value_type = typename base_type::value_type; ///< Slot type + using window_type = typename base_type::window_type; ///< Slot window type + + using base_type::capacity; + using base_type::num_windows; + + /// Type of the allocator to (de)allocate windows + using allocator_type = typename std::allocator_traits::rebind_alloc; + using window_deleter_type = + detail::custom_deleter; ///< Type of window deleter + using ref_type = aow_storage_ref; ///< Storage ref type + + /** + * @brief Constructor of AoW storage. + * + * @note The input `size` should be exclusively determined by the return value of + * `make_window_extent` since it depends on the requested low-bound value, the probing scheme, and + * the storage. + * + * @param size Number of windows to (de)allocate + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept; + + aow_storage(aow_storage&&) = default; ///< Move constructor + /** + * @brief Replaces the contents of the storage with another storage. 
+ * + * @return Reference of the current storage object + */ + aow_storage& operator=(aow_storage&&) = default; + ~aow_storage() = default; ///< Destructor + + aow_storage(aow_storage const&) = delete; + aow_storage& operator=(aow_storage const&) = delete; + + /** + * @brief Gets windows array. + * + * @return Pointer to the first window + */ + [[nodiscard]] constexpr window_type* data() const noexcept; + + /** + * @brief Gets the storage allocator. + * + * @return The storage allocator + */ + [[nodiscard]] constexpr allocator_type allocator() const noexcept; + + /** + * @brief Gets window storage reference. + * + * @return Reference of window storage + */ + [[nodiscard]] constexpr ref_type ref() const noexcept; + + /** + * @brief Initializes each slot in the AoW storage to contain `key`. + * + * @param key Key to which all keys in `slots` are initialized + * @param stream Stream used for executing the kernel + */ + void initialize(value_type key, cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Asynchronously initializes each slot in the AoW storage to contain `key`. + * + * @param key Key to which all keys in `slots` are initialized + * @param stream Stream used for executing the kernel + */ + void initialize_async(value_type key, cuda_stream_ref stream = {}) noexcept; + + private: + allocator_type allocator_; ///< Allocator used to (de)allocate windows + window_deleter_type window_deleter_; ///< Custom windows deleter + std::unique_ptr windows_; ///< Pointer to AoW storage +}; + +/** + * @brief Non-owning AoW storage reference type. 
+ * + * @tparam T Storage element type + * @tparam WindowSize Number of slots in each window + * @tparam Extent Type of extent denoting storage capacity + */ +template > +class aow_storage_ref : public detail::aow_storage_base { + public: + using base_type = detail::aow_storage_base; ///< AoW base class type + + using base_type::window_size; ///< Number of elements processed per window + + using extent_type = typename base_type::extent_type; ///< Storage extent type + using size_type = typename base_type::size_type; ///< Storage size type + using value_type = typename base_type::value_type; ///< Slot type + using window_type = typename base_type::window_type; ///< Slot window type + + using base_type::capacity; + using base_type::num_windows; + + /** + * @brief Constructor of AoS storage ref. + * + * @param size Number of windows + * @param windows Pointer to the windows array + */ + __host__ __device__ explicit constexpr aow_storage_ref(Extent size, + window_type* windows) noexcept; + + /** + * @brief Custom un-incrementable input iterator for the convenience of `find` operations. + * + * @note This iterator is for read only and NOT incrementable. + */ + struct iterator; + using const_iterator = iterator const; ///< Const forward iterator type + + /** + * @brief Returns an iterator to one past the last slot. + * + * This is provided for convenience for those familiar with checking + * an iterator returned from `find()` against the `end()` iterator. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __device__ constexpr iterator end() noexcept; + + /** + * @brief Returns a const_iterator to one past the last slot. + * + * This is provided for convenience for those familiar with checking + * an iterator returned from `find()` against the `end()` iterator. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __device__ constexpr const_iterator end() const noexcept; + + /** + * @brief Gets windows array. 
+ * + * @return Pointer to the first window + */ + [[nodiscard]] __device__ constexpr window_type* data() noexcept; + + /** + * @brief Gets windows array. + * + * @return Pointer to the first window + */ + [[nodiscard]] __device__ constexpr window_type* data() const noexcept; + + /** + * @brief Returns an array of slots (or a window) for a given index. + * + * @param index Index of the window + * @return An array of slots + */ + [[nodiscard]] __device__ constexpr window_type operator[](size_type index) const noexcept; + + private: + window_type* windows_; ///< Pointer to the windows array +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/cuda_stream_ref.hpp b/include/cuco/cuda_stream_ref.hpp new file mode 100644 index 000000000..bf0a5dea9 --- /dev/null +++ b/include/cuco/cuda_stream_ref.hpp @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { + +/** + * @brief Strongly-typed non-owning wrapper for CUDA streams with default constructor. + * + * This wrapper is simply a "view": it does not own the lifetime of the stream it wraps. 
+ */ +class cuda_stream_ref { + public: + constexpr cuda_stream_ref() = default; ///< Default constructor + constexpr cuda_stream_ref(cuda_stream_ref const&) = default; ///< Copy constructor + constexpr cuda_stream_ref(cuda_stream_ref&&) = default; ///< Move constructor + + /** + * @brief Copy-assignment operator. + * + * @return Copy of this stream reference. + */ + constexpr cuda_stream_ref& operator=(cuda_stream_ref const&) = default; + + /** + * @brief Move-assignment operator. + * + * @return New location of this stream reference. + */ + constexpr cuda_stream_ref& operator=(cuda_stream_ref&&) = default; ///< Move-assignment operator + + ~cuda_stream_ref() = default; + + constexpr cuda_stream_ref(int) = delete; //< Prevent cast from literal 0 + constexpr cuda_stream_ref(std::nullptr_t) = delete; //< Prevent cast from nullptr + + /** + * @brief Implicit conversion from `cudaStream_t`. + * + * @param stream The CUDA stream to reference. + */ + constexpr cuda_stream_ref(cudaStream_t stream) noexcept : stream_{stream} {} + + /** + * @brief Get the wrapped stream. + * + * @return The wrapped stream. + */ + [[nodiscard]] constexpr cudaStream_t value() const noexcept { return stream_; } + + /** + * @brief Implicit conversion to `cudaStream_t`. + * + * @return The underlying `cudaStream_t`. + */ + constexpr operator cudaStream_t() const noexcept { return value(); } + + /** + * @brief Return true if the wrapped stream is the CUDA per-thread default stream. + * + * @return True if the wrapped stream is the per-thread default stream; else false. + */ + [[nodiscard]] inline bool is_per_thread_default() const noexcept; + + /** + * @brief Return true if the wrapped stream is explicitly the CUDA legacy default stream. + * + * @return True if the wrapped stream is the default stream; else false. + */ + [[nodiscard]] inline bool is_default() const noexcept; + + /** + * @brief Synchronize the viewed CUDA stream. + * + * Calls `cudaStreamSynchronize()`. 
+ * + * @throw cuco::cuda_error if stream synchronization fails + */ + void synchronize() const; + + private: + cudaStream_t stream_{}; +}; + +/** + * @brief Static `cuda_stream_ref` of the default stream (stream 0), for convenience + */ +static constexpr cuda_stream_ref cuda_stream_default{}; + +/** + * @brief Static `cuda_stream_ref` of cudaStreamLegacy, for convenience + */ +static const cuda_stream_ref cuda_stream_legacy{cudaStreamLegacy}; + +/** + * @brief Static `cuda_stream_ref` of cudaStreamPerThread, for convenience + */ +static const cuda_stream_ref cuda_stream_per_thread{cudaStreamPerThread}; + +// /** +// * @brief Equality comparison operator for streams +// * +// * @param lhs The first stream view to compare +// * @param rhs The second stream view to compare +// * @return true if equal, false if unequal +// */ +// inline bool operator==(cuda_stream_ref lhs, cuda_stream_ref rhs) +// { +// return lhs.value() == rhs.value(); +// } + +// /** +// * @brief Inequality comparison operator for streams +// * +// * @param lhs The first stream view to compare +// * @param rhs The second stream view to compare +// * @return true if unequal, false if equal +// */ +// inline bool operator!=(cuda_stream_ref lhs, cuda_stream_ref rhs) { return not(lhs == rhs); } + +} // namespace experimental +} // namespace cuco + +#include \ No newline at end of file diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config index 40eb75aa2..07dec5e50 100644 --- a/include/cuco/detail/__config +++ b/include/cuco/detail/__config @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,23 @@ * limitations under the License. 
*/ - #pragma once +#pragma once - #include +#include + +#if !defined(__CUDACC_VER_MAJOR__) || !defined(__CUDACC_VER_MINOR__) +#error "NVCC version not found" +#elif __CUDACC_VER_MAJOR__ < 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ < 5) +#error "NVCC version 11.5 or later is required" +#endif + +#if !defined(__CUDACC_RELAXED_CONSTEXPR__) +#error "Support for relaxed constexpr is required" +#endif + +#if !defined(__CUDACC_EXTENDED_LAMBDA__) +#error "Support for extended device lambdas is required" +#endif // WAR for libcudacxx/296 #define CUCO_CUDA_MINIMUM_ARCH _NV_FIRST_ARG(__CUDA_ARCH_LIST__) @@ -25,10 +39,14 @@ #define CUCO_HAS_CUDA_BARRIER #endif -#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11100) +#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11010) #define CUCO_HAS_CG_MEMCPY_ASYNC #endif #if (CUCO_CUDA_MINIMUM_ARCH >= 700) #define CUCO_HAS_INDEPENDENT_THREADS #endif + +#if defined(__SIZEOF_INT128__) +#define CUCO_HAS_INT128 +#endif \ No newline at end of file diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh index 3038943a0..a8a5a69d1 100644 --- a/include/cuco/detail/bitwise_compare.cuh +++ b/include/cuco/detail/bitwise_compare.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,10 @@ #pragma once +#include + +#include + #include #include @@ -58,6 +62,16 @@ struct bitwise_compare_impl<8> { } }; +/** + * @brief Gives value to use as alignment for a type that is at least the + * size of type, or 16, whichever is smaller. 
+ */
+template 
+constexpr std::size_t alignment()
+{
+  return std::min(std::size_t{16}, cuda::std::bit_ceil(sizeof(T)));
+}
+
 /**
  * @brief Performs a bitwise equality comparison between the two specified objects
  *
@@ -73,8 +87,11 @@ __host__ __device__ constexpr bool bitwise_compare(T const& lhs, T const& rhs)
     cuco::is_bitwise_comparable_v,
     "Bitwise compared objects must have unique object representations or be explicitly declared as "
     "safe for bitwise comparison via specialization of cuco::is_bitwise_comparable_v.");
-  return detail::bitwise_compare_impl::compare(reinterpret_cast(&lhs),
-                                               reinterpret_cast(&rhs));
+
+  alignas(detail::alignment()) T lhs_copy{lhs};
+  alignas(detail::alignment()) T rhs_copy{rhs};
+  return detail::bitwise_compare_impl::compare(reinterpret_cast(&lhs_copy),
+                                               reinterpret_cast(&rhs_copy));
 }
 
 }  // namespace detail
diff --git a/include/cuco/detail/common_functors.cuh b/include/cuco/detail/common_functors.cuh
new file mode 100644
index 000000000..12fe14e0a
--- /dev/null
+++ b/include/cuco/detail/common_functors.cuh
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace cuco {
+namespace experimental {
+namespace detail {
+
+/**
+ * @brief Device functor returning the content of the slot indexed by `idx`.
+ * + * @tparam StorageRef Storage ref type + */ +template +struct get_slot { + StorageRef storage_; ///< Storage ref + + /** + * @brief Constructs `get_slot` functor with the given storage ref. + * + * @param s Input storage ref + */ + explicit constexpr get_slot(StorageRef s) noexcept : storage_{s} {} + + /** + * @brief Accesses the slot content with the given index. + * + * @param idx The slot index + * @return The slot content + */ + __device__ constexpr auto operator()(typename StorageRef::size_type idx) const noexcept + { + auto const window_idx = idx / StorageRef::window_size; + auto const intra_idx = idx % StorageRef::window_size; + return storage_[window_idx][intra_idx]; + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/common_kernels.cuh b/include/cuco/detail/common_kernels.cuh new file mode 100644 index 000000000..759041bad --- /dev/null +++ b/include/cuco/detail/common_kernels.cuh @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Inserts all elements in the range `[first, first + n)` and returns the number of + * successful insertions if `pred` of the corresponding stencil returns true. 
+ * + * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element + * is inserted. + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` + * and argument type is convertible from `std::iterator_traits::value_type` + * @tparam AtomicT Atomic counter type + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` + * @param num_successes Number of successful inserted elements + * @param ref Non-owning container device ref used to access the slot storage + */ +template +__global__ void insert_if_n(InputIterator first, + cuco::detail::index_type n, + StencilIt stencil, + Predicate pred, + AtomicT* num_successes, + Ref ref) +{ + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + typename Ref::size_type thread_num_successes = 0; + + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + while (idx < n) { + if (pred(*(stencil + idx))) { + typename Ref::value_type const insert_element{*(first + idx)}; + if constexpr (CGSize == 1) { + if (ref.insert(insert_element)) { thread_num_successes++; }; + } else { + auto const tile = + 
cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + if (ref.insert(tile, insert_element) && tile.thread_rank() == 0) { thread_num_successes++; } + } + } + idx += loop_stride; + } + + // compute number of successfully inserted elements for each block + // and atomically add to the grand total + auto const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } +} + +/** + * @brief Inserts all elements in the range `[first, first + n)` if `pred` of the corresponding + * stencil returns true. + * + * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element + * is inserted. + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` + * and argument type is convertible from `std::iterator_traits::value_type` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` + * @param ref Non-owning container device ref used to access the slot storage + */ +template +__global__ void insert_if_n( + InputIterator first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref) +{ + auto const loop_stride = 
cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + while (idx < n) { + if (pred(*(stencil + idx))) { + typename Ref::value_type const insert_element{*(first + idx)}; + if constexpr (CGSize == 1) { + ref.insert(insert_element); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + ref.insert(tile, insert_element); + } + } + idx += loop_stride; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data + * structure if `pred` of the corresponding stencil returns true. + * + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present in the container. If `pred( *(stencil + i) )` is + * false, stores false to `(output_begin + i)`. + * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize The size of the thread block + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` + * and argument type is convertible from `std::iterator_traits::value_type` + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param n Number of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param ref Non-owning container device ref used to access the slot storage + */ +template +__global__ void contains_if_n(InputIt first, + cuco::detail::index_type n, + StencilIt stencil, + 
Predicate pred, + OutputIt output_begin, + Ref ref) +{ + namespace cg = cooperative_groups; + + auto const block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + __shared__ bool output_buffer[BlockSize / CGSize]; + + while (idx - thread_idx < n) { // the whole thread block falls into the same iteration + if constexpr (CGSize == 1) { + if (idx < n) { + auto const key = *(first + idx); + /* + * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased + * sector stores from L2 to global memory. By writing results to shared memory and then + * synchronizing before writing back to global, we no longer rely on L1, preventing the + * increase in sector stores from L2 to global and improving performance. + */ + output_buffer[thread_idx] = pred(*(stencil + idx)) ? ref.contains(key) : false; + } + block.sync(); + if (idx < n) { *(output_begin + idx) = output_buffer[thread_idx]; } + } else { + auto const tile = cg::tiled_partition(cg::this_thread_block()); + if (idx < n) { + auto const key = *(first + idx); + auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false; + if (tile.thread_rank() == 0) { *(output_begin + idx) = found; } + } + } + idx += loop_stride; + } +} + +/** + * @brief Calculates the number of filled slots for the given window storage. 
+ * + * @tparam BlockSize Number of threads in each block + * @tparam StorageRef Type of non-owning ref allowing access to storage + * @tparam Predicate Type of predicate indicating if the given slot is filled + * @tparam AtomicT Atomic counter type + * + * @param storage Non-owning device ref used to access the slot storage + * @param is_filled Predicate indicating if the given slot is filled + * @param count Number of filled slots + */ +template +__global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count) +{ + using size_type = typename StorageRef::size_type; + + auto const loop_stride = cuco::detail::grid_stride(); + auto idx = cuco::detail::global_thread_id(); + + size_type thread_count = 0; + auto const n = storage.num_windows(); + + while (idx < n) { + auto const window = storage[idx]; +#pragma unroll + for (auto const& it : window) { + thread_count += static_cast(is_filled(it)); + } + idx += loop_stride; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + auto const block_count = BlockReduce(temp_storage).Sum(thread_count); + if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); } +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/cuda_stream_ref.inl b/include/cuco/detail/cuda_stream_ref.inl new file mode 100644 index 000000000..64aa078aa --- /dev/null +++ b/include/cuco/detail/cuda_stream_ref.inl @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace cuco { +namespace experimental { + +[[nodiscard]] inline bool cuda_stream_ref::is_per_thread_default() const noexcept +{ +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + return value() == cuda_stream_per_thread || value() == nullptr; +#else + return value() == cuda_stream_per_thread; +#endif +} + +[[nodiscard]] inline bool cuda_stream_ref::is_default() const noexcept +{ +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + return value() == cuda_stream_legacy; +#else + return value() == cuda_stream_legacy || value() == nullptr; +#endif +} + +inline void cuda_stream_ref::synchronize() const +{ + CUCO_CUDA_TRY(cudaStreamSynchronize(this->stream_)); +} + +} // namespace experimental +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 0c1d2e377..7b5145190 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,13 +17,14 @@ namespace cuco { template -dynamic_map::dynamic_map( - std::size_t initial_capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - Allocator const& alloc) +dynamic_map::dynamic_map(std::size_t initial_capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + Allocator const& alloc, + cudaStream_t stream) : empty_key_sentinel_(empty_key_sentinel.value), empty_value_sentinel_(empty_value_sentinel.value), + erased_key_sentinel_(empty_key_sentinel.value), size_(0), capacity_(initial_capacity), min_insert_size_(1E4), @@ -32,23 +33,49 @@ dynamic_map::dynamic_map( { submaps_.push_back(std::make_unique>( initial_capacity, - sentinel::empty_key{empty_key_sentinel}, - sentinel::empty_value{empty_value_sentinel}, - alloc)); + empty_key{empty_key_sentinel}, + empty_value{empty_value_sentinel}, + alloc, + stream)); submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); - - CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); -} // namespace cuco + submap_num_successes_.push_back(submaps_[0]->num_successes_); +} template -dynamic_map::~dynamic_map() +dynamic_map::dynamic_map(std::size_t initial_capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel, + Allocator const& alloc, + cudaStream_t stream) + : empty_key_sentinel_(empty_key_sentinel.value), + empty_value_sentinel_(empty_value_sentinel.value), + erased_key_sentinel_(erased_key_sentinel.value), + size_(0), + capacity_(initial_capacity), + min_insert_size_(1E4), + max_load_factor_(0.60), + alloc_{alloc} { - CUCO_ASSERT_CUDA_SUCCESS(cudaFree(num_successes_)); + CUCO_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_, + "The empty key sentinel and erased key sentinel cannot be the same value.", + std::runtime_error); + + submaps_.push_back(std::make_unique>( + initial_capacity, + 
empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}, + erased_key{erased_key_sentinel_}, + alloc, + stream)); + submap_views_.push_back(submaps_[0]->get_device_view()); + submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); + submap_num_successes_.push_back(submaps_[0]->num_successes_); } template -void dynamic_map::reserve(std::size_t n) +void dynamic_map::reserve(std::size_t n, cudaStream_t stream) { int64_t num_elements_remaining = n; uint32_t submap_idx = 0; @@ -62,14 +89,25 @@ void dynamic_map::reserve(std::size_t n) // if the submap does not exist yet, create it else { submap_capacity = capacity_; - submaps_.push_back(std::make_unique>( - submap_capacity, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}, - alloc_)); + if (erased_key_sentinel_ != empty_key_sentinel_) { + submaps_.push_back(std::make_unique>( + submap_capacity, + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}, + erased_key{erased_key_sentinel_}, + alloc_, + stream)); + } else { + submaps_.push_back(std::make_unique>( + submap_capacity, + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}, + alloc_, + stream)); + } + submap_num_successes_.push_back(submaps_[submap_idx]->num_successes_); submap_views_.push_back(submaps_[submap_idx]->get_device_view()); submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view()); - capacity_ *= 2; } @@ -80,13 +118,20 @@ void dynamic_map::reserve(std::size_t n) template template -void dynamic_map::insert(InputIt first, - InputIt last, - Hash hash, - KeyEqual key_equal) +void dynamic_map::insert( + InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { + // TODO: memset an atomic variable is unsafe + static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type), + "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t)."); + + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto 
 constexpr tile_size = 4;
+
   std::size_t num_to_insert = std::distance(first, last);
-  reserve(size_ + num_to_insert);
+
+  reserve(size_ + num_to_insert, stream);
 
   uint32_t submap_idx = 0;
   while (num_to_insert > 0) {
@@ -95,30 +140,30 @@
     // If we are tying to insert some of the remaining keys into this submap, we can insert
     // only if we meet the minimum insert size.
     if (capacity_remaining >= min_insert_size_) {
-      *num_successes_ = 0;
-      int device_id;
-      CUCO_CUDA_TRY(cudaGetDevice(&device_id));
-      CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id));
-
-      auto n = std::min(capacity_remaining, num_to_insert);
-      auto const block_size = 128;
-      auto const stride = 1;
-      auto const tile_size = 4;
-      auto const grid_size = (tile_size * n + stride * block_size - 1) / (stride * block_size);
-
-      detail::insert>
-        <<>>(first,
-                                                        first + n,
-                                                        submap_views_.data().get(),
-                                                        submap_mutable_views_.data().get(),
-                                                        num_successes_,
-                                                        submap_idx,
-                                                        submaps_.size(),
-                                                        hash,
-                                                        key_equal);
-      CUCO_CUDA_TRY(cudaDeviceSynchronize());
-
-      std::size_t h_num_successes = num_successes_->load(cuda::std::memory_order_relaxed);
+      CUCO_CUDA_TRY(
+        cudaMemsetAsync(submap_num_successes_[submap_idx], 0, sizeof(atomic_ctr_type), stream));
+
+      auto const n = std::min(capacity_remaining, num_to_insert);
+      auto const grid_size = (tile_size * n + stride * block_size - 1) / (stride * block_size);
+
+      detail::insert>
+        <<>>(first,
+                                                first + n,
+                                                submap_views_.data().get(),
+                                                submap_mutable_views_.data().get(),
+                                                submap_num_successes_.data().get(),
+                                                submap_idx,
+                                                submaps_.size(),
+                                                hash,
+                                                key_equal);
+
+      std::size_t h_num_successes;
+      CUCO_CUDA_TRY(cudaMemcpyAsync(&h_num_successes,
+                                    submap_num_successes_[submap_idx],
+                                    sizeof(atomic_ctr_type),
+                                    cudaMemcpyDeviceToHost,
+                                    stream));
+      CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
       submaps_[submap_idx]->size_ += h_num_successes;
       size_ += h_num_successes;
       first += n;
@@ -128,34 +173,88 @@
   }
 }
 
+template 
+template +void dynamic_map::erase( + InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) +{ + // TODO: memset an atomic variable is unsafe + static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type), + "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t)."); + + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; + + auto const num_keys = std::distance(first, last); + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + + // zero out submap success counters + for (uint32_t i = 0; i < submaps_.size(); ++i) { + CUCO_CUDA_TRY(cudaMemsetAsync(submap_num_successes_[i], 0, sizeof(atomic_ctr_type), stream)); + } + + auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long); + + detail::erase + <<>>(first, + first + num_keys, + submap_mutable_views_.data().get(), + submap_num_successes_.data().get(), + submaps_.size(), + hash, + key_equal); + + for (uint32_t i = 0; i < submaps_.size(); ++i) { + std::size_t h_submap_num_successes; + CUCO_CUDA_TRY(cudaMemcpyAsync(&h_submap_num_successes, + submap_num_successes_[i], + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost, + stream)); + submaps_[i]->size_ -= h_submap_num_successes; + size_ -= h_submap_num_successes; + } +} + template template -void dynamic_map::find( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) +void dynamic_map::find(InputIt first, + InputIt last, + OutputIt output_begin, + Hash hash, + KeyEqual key_equal, + cudaStream_t stream) { - auto num_keys = std::distance(first, last); - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; - detail::find<<>>( + auto const num_keys = std::distance(first, last); + auto 
const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + + detail::find<<>>( first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal); CUCO_CUDA_TRY(cudaDeviceSynchronize()); } template template -void dynamic_map::contains( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) +void dynamic_map::contains(InputIt first, + InputIt last, + OutputIt output_begin, + Hash hash, + KeyEqual key_equal, + cudaStream_t stream) { - auto num_keys = std::distance(first, last); - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; + + auto const num_keys = std::distance(first, last); + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - detail::contains<<>>( + detail::contains<<>>( first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal); CUCO_CUDA_TRY(cudaDeviceSynchronize()); } diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index f261b49aa..566576e1e 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -41,6 +41,7 @@ namespace cg = cooperative_groups; * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs * @param last End of the sequence of key/value pairs * @param submap_views Array of `static_map::device_view` objects used to @@ -71,7 +72,7 @@ __global__ void insert(InputIt first, Hash hash, KeyEqual key_equal) { - typedef cub::BlockReduce BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename 
BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; @@ -97,8 +98,10 @@ __global__ void insert(InputIt first, tid += gridDim.x * blockDim.x; } - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { *num_successes += block_num_successes; } + std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } } /** @@ -122,13 +125,14 @@ __global__ void insert(InputIt first, * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs * @param last End of the sequence of key/value pairs * @param submap_views Array of `static_map::device_view` objects used to * perform `contains` operations on each underlying `static_map` * @param submap_mutable_views Array of `static_map::device_mutable_view` objects * used to perform an `insert` into the target `static_map` submap - * @param num_successes The number of successfully inserted key/value pairs + * @param submap_num_successes The number of successfully inserted key/value pairs for each submap * @param insert_idx The index of the submap we are inserting into * @param num_submaps The total number of submaps in the map * @param hash The unary function to apply to hash each key @@ -147,13 +151,13 @@ __global__ void insert(InputIt first, InputIt last, viewT* submap_views, mutableViewT* submap_mutable_views, - atomicT* num_successes, + atomicT** submap_num_successes, uint32_t insert_idx, uint32_t num_submaps, Hash hash, KeyEqual key_equal) { - typedef cub::BlockReduce BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; @@ -182,8 +186,154 @@ __global__ void 
insert(InputIt first, it += (gridDim.x * blockDim.x) / tile_size; } - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { *num_successes += block_num_successes; } + std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + submap_num_successes[insert_idx]->fetch_add(block_num_successes, + cuda::std::memory_order_relaxed); + } +} + +/** + * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. + * + * If the key `*(first + i)` exists in the map, its slot is erased and made available for future + insertions. + * Else, no effect. + * + * @tparam block_size The size of the thread block + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam mutableViewT Type of device view allowing modification of hash map storage + * @tparam atomicT Type of atomic storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to + * perform `erase` operations on each underlying `static_map` + * @param num_successes The number of successfully erased key/value pairs + * @param submap_num_successes The number of successfully erased key/value pairs + * in each submap + * @param num_submaps The number of submaps in the map + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void erase(InputIt first, + InputIt last, + mutableViewT* submap_mutable_views, + atomicT** submap_num_successes, + uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) +{ + extern __shared__ unsigned long long submap_block_num_successes[]; + + auto tid = block_size * 
blockIdx.x + threadIdx.x; + auto it = first + tid; + + for (auto i = threadIdx.x; i < num_submaps; i += block_size) { + submap_block_num_successes[i] = 0; + } + __syncthreads(); + + while (it < last) { + for (auto i = 0; i < num_submaps; ++i) { + if (submap_mutable_views[i].erase(*it, hash, key_equal)) { + atomicAdd(&submap_block_num_successes[i], 1); + break; + } + } + it += gridDim.x * blockDim.x; + } + __syncthreads(); + + for (auto i = 0; i < num_submaps; ++i) { + if (threadIdx.x == 0) { + submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), + cuda::std::memory_order_relaxed); + } + } +} + +/** + * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. + * + * If the key `*(first + i)` exists in the map, its slot is erased and made available for future + * insertions. + * Else, no effect. + * + * @tparam block_size The size of the thread block + * @tparam tile_size The number of threads in the Cooperative Groups used to perform erase + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam mutableViewT Type of device view allowing modification of hash map storage + * @tparam atomicT Type of atomic storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to + * perform `erase` operations on each underlying `static_map` + * @param num_successes The number of successfully erased key/value pairs + * @param submap_num_successes The number of successfully erased key/value pairs + * in each submap + * @param num_submaps The number of submaps in the map + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void 
erase(InputIt first, + InputIt last, + mutableViewT* submap_mutable_views, + atomicT** submap_num_successes, + uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) +{ + extern __shared__ unsigned long long submap_block_num_successes[]; + + auto block = cg::this_thread_block(); + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = block_size * block.group_index().x + block.thread_rank(); + auto it = first + tid / tile_size; + + for (auto i = threadIdx.x; i < num_submaps; i += block_size) { + submap_block_num_successes[i] = 0; + } + block.sync(); + + while (it < last) { + auto erased = false; + int i = 0; + for (i = 0; i < num_submaps; ++i) { + erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal); + if (erased) { break; } + } + if (erased && tile.thread_rank() == 0) { atomicAdd(&submap_block_num_successes[i], 1); } + it += (gridDim.x * blockDim.x) / tile_size; + } + block.sync(); + + for (auto i = 0; i < num_submaps; ++i) { + if (threadIdx.x == 0) { + submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), + cuda::std::memory_order_relaxed); + } + } } /** @@ -191,6 +341,7 @@ __global__ void insert(InputIt first, * * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. * Else, copies the empty value sentinel. 
+ * * @tparam block_size The number of threads in the thread block * @tparam Value The mapped value type for the map * @tparam InputIt Device accessible input iterator whose `value_type` is @@ -200,6 +351,7 @@ __global__ void insert(InputIt first, * @tparam viewT Type of `static_map` device view * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of values retrieved for each key @@ -273,6 +425,7 @@ __global__ void find(InputIt first, * @tparam viewT Type of `static_map` device view * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of values retrieved for each key @@ -345,6 +498,7 @@ __global__ void find(InputIt first, * @tparam viewT Type of `static_map` device view * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of booleans for the presence of each key @@ -411,6 +565,7 @@ __global__ void contains(InputIt first, * @tparam viewT Type of `static_map` device view * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of booleans for the presence of each key diff --git a/include/cuco/detail/equal_wrapper.cuh b/include/cuco/detail/equal_wrapper.cuh new file mode 100644 index 000000000..d2ded4a33 --- /dev/null +++ b/include/cuco/detail/equal_wrapper.cuh @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Enum of equality comparison results. + */ +enum class equal_result : int32_t { UNEQUAL = 0, EMPTY = 1, EQUAL = 2 }; + +/** + * @brief Key equality wrapper. + * + * User-provided equality binary callable cannot be used to compare against sentinel value. + * + * @tparam T Right-hand side Element type + * @tparam Equal Type of user-provided equality binary callable + */ +template +struct equal_wrapper { + T empty_sentinel_; ///< Sentinel value + Equal equal_; ///< Custom equality callable + + /** + * @brief Equality wrapper ctor. + * + * @param sentinel Sentinel value + * @param equal Equality binary callable + */ + __host__ __device__ constexpr equal_wrapper(T sentinel, Equal const& equal) noexcept + : empty_sentinel_{sentinel}, equal_{equal} + { + } + + /** + * @brief Equality check with the given equality callable. + * + * @tparam U Right-hand side Element type + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * + * @return `EQUAL` if `lhs` and `rhs` are equivalent. `UNEQUAL` otherwise. + */ + template + __device__ constexpr equal_result equal_to(T const& lhs, U const& rhs) const noexcept + { + return equal_(lhs, rhs) ? 
equal_result::EQUAL : equal_result::UNEQUAL; + } + + /** + * @brief Order-sensitive equality operator. + * + * @note This function always compares the left-hand side element against `empty_sentinel_` value + * first then perform a equality check with the given `equal_` callable, i.e., `equal_(lhs, rhs)`. + * @note Container (like set or map) keys MUST be always on the left-hand side. + * + * @tparam U Right-hand side Element type + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * + * @return Three way equality comparison result + */ + template + __device__ constexpr equal_result operator()(T const& lhs, U const& rhs) const noexcept + { + return cuco::detail::bitwise_compare(lhs, empty_sentinel_) ? equal_result::EMPTY + : this->equal_to(lhs, rhs); + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/error.hpp b/include/cuco/detail/error.hpp index 45f78a2e0..1d1ff6135 100644 --- a/include/cuco/detail/error.hpp +++ b/include/cuco/detail/error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,31 +16,9 @@ #pragma once -#include - -#include -#include +#include -namespace cuco { -/** - * @brief Exception thrown when a CUDA error is encountered. - * - */ -struct cuda_error : public std::runtime_error { - /** - * @brief Constructs a `cuda_error` object with the given `message`. - * - * @param message The error char array used to construct `cuda_error` - */ - cuda_error(const char* message) : std::runtime_error(message) {} - /** - * @brief Constructs a `cuda_error` object with the given `message` string. 
- * - * @param message The `std::string` used to construct `cuda_error` - */ - cuda_error(std::string const& message) : cuda_error{message.c_str()} {} -}; -} // namespace cuco +#include #define STRINGIFY_DETAIL(x) #x #define CUCO_STRINGIFY(x) STRINGIFY_DETAIL(x) @@ -58,7 +36,7 @@ struct cuda_error : public std::runtime_error { * Example: * ```c++ * - * // Throws `rmm::cuda_error` if `cudaMalloc` fails + * // Throws `cuco::cuda_error` if `cudaMalloc` fails * CUCO_CUDA_TRY(cudaMalloc(&p, 100)); * * // Throws `std::runtime_error` if `cudaMalloc` fails @@ -93,21 +71,72 @@ struct cuda_error : public std::runtime_error { } while (0) /** - * @brief Macro for checking runtime conditions that throws an exception when + * @brief Macro for checking (pre-)conditions that throws an exception when * a condition is violated. * + * Defaults to throwing `cuco::logic_error`, but a custom exception may also be + * specified. + * * Example usage: + * ``` + * // throws cuco::logic_error + * CUCO_EXPECTS(p != nullptr, "Unexpected null pointer"); * - * @code - * CUCO_RUNTIME_EXPECTS(key == value, "Key value mismatch"); - * @endcode + * // throws std::runtime_error + * CUCO_EXPECTS(p != nullptr, "Unexpected nullptr", std::runtime_error); + * ``` + * @param ... This macro accepts either two or three arguments: + * - The first argument must be an expression that evaluates to true or + * false, and is the condition being checked. + * - The second argument is a string literal used to construct the `what` of + * the exception. + * - When given, the third argument is the exception to be thrown. When not + * specified, defaults to `cuco::logic_error`. + * @throw `_exception_type` if the condition evaluates to 0 (false). + */ +#define CUCO_EXPECTS(...) \ + GET_CUCO_EXPECTS_MACRO(__VA_ARGS__, CUCO_EXPECTS_3, CUCO_EXPECTS_2) \ + (__VA_ARGS__) + +#define GET_CUCO_EXPECTS_MACRO(_1, _2, _3, NAME, ...) 
NAME + +#define CUCO_EXPECTS_3(_condition, _reason, _exception_type) \ + do { \ + static_assert(std::is_base_of_v); \ + (_condition) ? static_cast(0) \ + : throw _exception_type /*NOLINT(bugprone-macro-parentheses)*/ \ + {"CUCO failure at: " __FILE__ ":" CUCO_STRINGIFY(__LINE__) ": " _reason}; \ + } while (0) + +#define CUCO_EXPECTS_2(_condition, _reason) CUCO_EXPECTS_3(_condition, _reason, cuco::logic_error) + +/** + * @brief Indicates that an erroneous code path has been taken. + * + * Example usage: + * ```c++ + * // Throws `cuco::logic_error` + * CUCO_FAIL("Unsupported code path"); * - * @param[in] cond Expression that evaluates to true or false - * @param[in] reason String literal description of the reason that cond is - * expected to be true - * @throw std::runtime_error if the condition evaluates to false. + * // Throws `std::runtime_error` + * CUCO_FAIL("Unsupported code path", std::runtime_error); + * ``` + * + * @param ... This macro accepts either one or two arguments: + * - The first argument is a string literal used to construct the `what` of + * the exception. + * - When given, the second argument is the exception to be thrown. When not + * specified, defaults to `cuco::logic_error`. + * @throw `_exception_type` if the condition evaluates to 0 (false). */ -#define CUCO_RUNTIME_EXPECTS(cond, reason) \ - (!!(cond)) ? static_cast(0) \ - : throw std::runtime_error("cuco failure at: " __FILE__ \ - ":" CUCO_STRINGIFY(__LINE__) ": " reason) +#define CUCO_FAIL(...) \ + GET_CUCO_FAIL_MACRO(__VA_ARGS__, CUCO_FAIL_2, CUCO_FAIL_1) \ + (__VA_ARGS__) + +#define GET_CUCO_FAIL_MACRO(_1, _2, NAME, ...) 
NAME + +#define CUCO_FAIL_2(_what, _exception_type) \ + /*NOLINTNEXTLINE(bugprone-macro-parentheses)*/ \ + throw _exception_type { "CUCO failure at: " __FILE__ ":" CUCO_STRINGIFY(__LINE__) ": " _what } + +#define CUCO_FAIL_1(_what) CUCO_FAIL_2(_what, cuco::logic_error) diff --git a/include/cuco/detail/extent/extent.inl b/include/cuco/detail/extent/extent.inl new file mode 100644 index 000000000..a7cd83dcd --- /dev/null +++ b/include/cuco/detail/extent/extent.inl @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include // TODO move to detail/extent/ +#include +#include +#include + +#include + +namespace cuco { +namespace experimental { + +template +struct window_extent { + using value_type = SizeType; ///< Extent value type + + __host__ __device__ constexpr value_type value() const noexcept { return N; } + __host__ __device__ explicit constexpr operator value_type() const noexcept { return value(); } + + private: + __host__ __device__ explicit constexpr window_extent() noexcept {} + __host__ __device__ explicit constexpr window_extent(SizeType) noexcept {} + + template + friend auto constexpr make_window_extent(extent ext); +}; + +template +struct window_extent : cuco::utility::fast_int { + using value_type = + typename cuco::utility::fast_int::fast_int::value_type; ///< Extent value type + + private: + using cuco::utility::fast_int::fast_int; + + template + friend auto constexpr make_window_extent(extent ext); +}; + +template +[[nodiscard]] auto constexpr make_window_extent(extent ext) +{ + return make_window_extent(ext); +} + +template +[[nodiscard]] auto constexpr make_window_extent(SizeType size) +{ + return make_window_extent(extent{size}); +} + +template +[[nodiscard]] auto constexpr make_window_extent(extent ext) +{ + auto constexpr max_prime = cuco::detail::primes.back(); + auto constexpr max_value = + (static_cast(std::numeric_limits::max()) < max_prime) + ? 
std::numeric_limits::max() + : static_cast(max_prime); + auto const size = cuco::detail::int_div_ceil( + std::max(static_cast(ext), static_cast(1)), CGSize * WindowSize); + if (size > max_value) { CUCO_FAIL("Invalid input extent"); } + + if constexpr (N == dynamic_extent) { + return window_extent{static_cast( + *cuco::detail::lower_bound( + cuco::detail::primes.begin(), cuco::detail::primes.end(), static_cast(size)) * + CGSize)}; + } + if constexpr (N != dynamic_extent) { + return window_extent( + *cuco::detail::lower_bound(cuco::detail::primes.begin(), + cuco::detail::primes.end(), + static_cast(size)) * + CGSize)>{}; + } +} + +template +[[nodiscard]] auto constexpr make_window_extent(SizeType size) +{ + return make_window_extent(extent{size}); +} + +namespace detail { + +template +struct is_window_extent : std::false_type { +}; + +template +struct is_window_extent> : std::true_type { +}; + +template +inline constexpr bool is_window_extent_v = is_window_extent::value; + +} // namespace detail + +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/hash_functions.cuh b/include/cuco/detail/hash_functions.cuh deleted file mode 100644 index 7be6cab20..000000000 --- a/include/cuco/detail/hash_functions.cuh +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -namespace cuco { - -using hash_value_type = uint32_t; - -namespace detail { - -/** - * @brief A `MurmurHash3_32` hash function to hash the given argument on host and device. - * - * MurmurHash3_32 implementation from - * https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp - * ----------------------------------------------------------------------------- - * MurmurHash3 was written by Austin Appleby, and is placed in the public domain. The author - * hereby disclaims copyright to this source code. - * - * Note - The x86 and x64 versions do _not_ produce the same results, as the algorithms are - * optimized for their respective platforms. You can still compile and run any of them on any - * platform, but your performance with the non-native version will be less than optimal. - * - * @tparam Key The type of the values to hash - */ -template -struct MurmurHash3_32 { - using argument_type = Key; ///< The type of the values taken as argument - using result_type = uint32_t; ///< The type of the hash values produced - - /// Default constructor - __host__ __device__ constexpr MurmurHash3_32() : MurmurHash3_32{0} {} - - /** - * @brief Constructs a MurmurHash3_32 hash function with the given `seed`. - * - * @param seed A custom number to randomize the resulting hash value - */ - __host__ __device__ constexpr MurmurHash3_32(uint32_t seed) : m_seed(seed) {} - - /** - * @brief Returns a hash value for its argument, as a value of type `result_type`. 
- * - * @param key The input argument to hash - * @return A resulting hash value for `key` - */ - constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept - { - constexpr int len = sizeof(argument_type); - const uint8_t* const data = (const uint8_t*)&key; - constexpr int nblocks = len / 4; - - uint32_t h1 = m_seed; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; - //---------- - // body - const uint32_t* const blocks = (const uint32_t*)(data + nblocks * 4); - for (int i = -nblocks; i; i++) { - uint32_t k1 = blocks[i]; // getblock32(blocks,i); - k1 *= c1; - k1 = rotl32(k1, 15); - k1 *= c2; - h1 ^= k1; - h1 = rotl32(h1, 13); - h1 = h1 * 5 + 0xe6546b64; - } - //---------- - // tail - const uint8_t* tail = (const uint8_t*)(data + nblocks * 4); - uint32_t k1 = 0; - switch (len & 3) { - case 3: k1 ^= tail[2] << 16; - case 2: k1 ^= tail[1] << 8; - case 1: - k1 ^= tail[0]; - k1 *= c1; - k1 = rotl32(k1, 15); - k1 *= c2; - h1 ^= k1; - }; - //---------- - // finalization - h1 ^= len; - h1 = fmix32(h1); - return h1; - } - - private: - constexpr __host__ __device__ uint32_t rotl32(uint32_t x, int8_t r) const noexcept - { - return (x << r) | (x >> (32 - r)); - } - - constexpr __host__ __device__ uint32_t fmix32(uint32_t h) const noexcept - { - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - return h; - } - uint32_t m_seed; -}; - -} // namespace detail -} // namespace cuco diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh new file mode 100644 index 000000000..a12143523 --- /dev/null +++ b/include/cuco/detail/hash_functions/murmurhash3.cuh @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace cuco::detail { + +/** + * @brief The 32bit integer finalizer hash function of `MurmurHash3`. + * + * @throw Key type must be 4 bytes in size + * + * @tparam Key The type of the values to hash + */ +template +struct MurmurHash3_fmix32 { + static_assert(sizeof(Key) == 4, "Key type must be 4 bytes in size."); + + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint32_t; ///< The type of the hash values produced + + /** + * @brief Constructs a MurmurHash3_fmix32 hash function with the given `seed`. + * + * @param seed A custom number to randomize the resulting hash value + */ + __host__ __device__ constexpr MurmurHash3_fmix32(std::uint32_t seed = 0) : seed_{seed} {} + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @param key The input argument to hash + * @return A resulting hash value for `key` + */ + constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept + { + std::uint32_t h = static_cast(key) ^ seed_; + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + return h; + } + + private: + std::uint32_t seed_; +}; + +/** + * @brief The 64bit integer finalizer hash function of `MurmurHash3`. 
+ * + * @throw Key type must be 8 bytes in size + * + * @tparam Key The type of the values to hash + */ +template +struct MurmurHash3_fmix64 { + static_assert(sizeof(Key) == 8, "Key type must be 8 bytes in size."); + + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint64_t; ///< The type of the hash values produced + + /** + * @brief Constructs a MurmurHash3_fmix64 hash function with the given `seed`. + * + * @param seed A custom number to randomize the resulting hash value + */ + __host__ __device__ constexpr MurmurHash3_fmix64(std::uint64_t seed = 0) : seed_{seed} {} + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @param key The input argument to hash + * @return A resulting hash value for `key` + */ + constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept + { + std::uint64_t h = static_cast(key) ^ seed_; + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + h ^= h >> 33; + return h; + } + + private: + std::uint64_t seed_; +}; + +/** + * @brief A `MurmurHash3_32` hash function to hash the given argument on host and device. + * + * MurmurHash3_32 implementation from + * https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp + * ----------------------------------------------------------------------------- + * MurmurHash3 was written by Austin Appleby, and is placed in the public domain. The author + * hereby disclaims copyright to this source code. + * + * Note - The x86 and x64 versions do _not_ produce the same results, as the algorithms are + * optimized for their respective platforms. You can still compile and run any of them on any + * platform, but your performance with the non-native version will be less than optimal. 
+ * + * @tparam Key The type of the values to hash + */ +template +struct MurmurHash3_32 { + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint32_t; ///< The type of the hash values produced + + /** + * @brief Constructs a MurmurHash3_32 hash function with the given `seed`. + * + * @param seed A custom number to randomize the resulting hash value + */ + __host__ __device__ constexpr MurmurHash3_32(std::uint32_t seed = 0) : fmix32_{0}, seed_{seed} {} + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @param key The input argument to hash + * @return The resulting hash value for `key` + */ + constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept + { + return compute_hash(reinterpret_cast(&key), + cuco::experimental::extent{}); + } + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @tparam Extent The extent type + * + * @param bytes The input argument to hash + * @param size The extent of the data in bytes + * @return The resulting hash value + */ + template + constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + Extent size) const noexcept + { + auto const nblocks = size / 4; + + std::uint32_t h1 = seed_; + constexpr std::uint32_t c1 = 0xcc9e2d51; + constexpr std::uint32_t c2 = 0x1b873593; + //---------- + // body + for (std::remove_const_t i = 0; size >= 4 && i < nblocks; i++) { + std::uint32_t k1 = load_chunk(bytes, i); + k1 *= c1; + k1 = rotl32(k1, 15); + k1 *= c2; + h1 ^= k1; + h1 = rotl32(h1, 13); + h1 = h1 * 5 + 0xe6546b64; + } + //---------- + // tail + std::uint32_t k1 = 0; + switch (size & 3) { + case 3: k1 ^= std::to_integer(bytes[nblocks * 4 + 2]) << 16; [[fallthrough]]; + case 2: k1 ^= std::to_integer(bytes[nblocks * 4 + 1]) << 8; [[fallthrough]]; + case 1: + k1 ^= std::to_integer(bytes[nblocks * 4 + 0]); + k1 *= c1; + k1 = rotl32(k1, 15); + k1 *= c2; 
+ h1 ^= k1; + }; + //---------- + // finalization + h1 ^= size; + h1 = fmix32_(h1); + return h1; + } + + private: + constexpr __host__ __device__ std::uint32_t rotl32(std::uint32_t x, std::int8_t r) const noexcept + { + return (x << r) | (x >> (32 - r)); + } + + MurmurHash3_fmix32 fmix32_; + std::uint32_t seed_; +}; +} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/hash_functions/utils.cuh b/include/cuco/detail/hash_functions/utils.cuh new file mode 100644 index 000000000..37e279ba7 --- /dev/null +++ b/include/cuco/detail/hash_functions/utils.cuh @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace cuco::detail { + +template +constexpr __host__ __device__ T load_chunk(U const* const data, Extent index) noexcept +{ + auto const bytes = reinterpret_cast(data); + T chunk; + memcpy(&chunk, bytes + index * sizeof(T), sizeof(T)); + return chunk; +} + +}; // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh new file mode 100644 index 000000000..a36f74bca --- /dev/null +++ b/include/cuco/detail/hash_functions/xxhash.cuh @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace cuco::detail { + +/** + * @brief A `XXHash_32` hash function to hash the given argument on host and device. + * + * XXHash_32 implementation from + * https://github.com/Cyan4973/xxHash + * ----------------------------------------------------------------------------- + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (C) 2012-2021 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
/**
 * @brief A `XXHash_32` hash function to hash the given argument on host and device.
 *
 * XXHash_32 implementation from https://github.com/Cyan4973/xxHash
 * (BSD 2-Clause licensed; see the license notice earlier in this file).
 *
 * NOTE(review): the extracted text had every `<...>` span stripped; the template
 * parameter lists and template arguments below were reconstructed — confirm
 * against the upstream cuco sources before merging.
 *
 * @tparam Key The type of the values to hash
 */
template <typename Key>
struct XXHash_32 {
 private:
  // XXH32 mixing primes, as specified by the xxHash algorithm
  static constexpr std::uint32_t prime1 = 0x9e3779b1u;
  static constexpr std::uint32_t prime2 = 0x85ebca77u;
  static constexpr std::uint32_t prime3 = 0xc2b2ae3du;
  static constexpr std::uint32_t prime4 = 0x27d4eb2fu;
  static constexpr std::uint32_t prime5 = 0x165667b1u;

 public:
  using argument_type = Key;            ///< The type of the values taken as argument
  using result_type   = std::uint32_t;  ///< The type of the hash values produced

  /**
   * @brief Constructs a XXH32 hash function with the given `seed`.
   *
   * @param seed A custom number to randomize the resulting hash value
   */
  __host__ __device__ constexpr XXHash_32(std::uint32_t seed = 0) : seed_{seed} {}

  /**
   * @brief Returns a hash value for its argument, as a value of type `result_type`.
   *
   * @param key The input argument to hash
   * @return The resulting hash value for `key`
   */
  constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
  {
    return compute_hash(reinterpret_cast<std::byte const*>(&key),
                        cuco::experimental::extent<std::size_t, sizeof(Key)>{});
  }

  /**
   * @brief Returns a hash value for a byte sequence, as a value of type `result_type`.
   *
   * @tparam Extent The extent type
   *
   * @param bytes The input argument to hash
   * @param size The extent of the data in bytes
   * @return The resulting hash value
   */
  template <typename Extent>
  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
                                                         Extent size) const noexcept
  {
    std::size_t offset = 0;
    std::uint32_t h32;

    // data can be processed in 16-byte chunks
    if (size >= 16) {
      auto const limit = size - 16;
      std::uint32_t v1 = seed_ + prime1 + prime2;
      std::uint32_t v2 = seed_ + prime2;
      std::uint32_t v3 = seed_;
      std::uint32_t v4 = seed_ - prime1;

      do {
        // pipeline 4*4byte computations
        auto const pipeline_offset = offset / 4;
        v1 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 0) * prime2;
        v1 = rotl(v1, 13);
        v1 *= prime1;
        v2 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 1) * prime2;
        v2 = rotl(v2, 13);
        v2 *= prime1;
        v3 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 2) * prime2;
        v3 = rotl(v3, 13);
        v3 *= prime1;
        v4 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 3) * prime2;
        v4 = rotl(v4, 13);
        v4 *= prime1;
        offset += 16;
      } while (offset <= limit);

      h32 = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18);
    } else {
      h32 = seed_ + prime5;
    }

    h32 += size;

    // remaining data can be processed in 4-byte chunks
    if ((size % 16) >= 4) {
      for (; offset <= size - 4; offset += 4) {
        h32 += load_chunk<std::uint32_t>(bytes, offset / 4) * prime3;
        h32 = rotl(h32, 17) * prime4;
      }
    }

    // the following loop is only needed if the size of the key is not a multiple of the block size
    if (size % 4) {
      while (offset < size) {
        h32 += (std::to_integer<std::uint32_t>(bytes[offset]) & 255) * prime5;
        h32 = rotl(h32, 11) * prime1;
        ++offset;
      }
    }

    return finalize(h32);
  }

 private:
  // 32-bit left rotate
  constexpr __host__ __device__ std::uint32_t rotl(std::uint32_t h, std::int8_t r) const noexcept
  {
    return ((h << r) | (h >> (32 - r)));
  }

  // avalanche helper: final bit-mixing so every input bit affects every output bit
  constexpr __host__ __device__ std::uint32_t finalize(std::uint32_t h) const noexcept
  {
    h ^= h >> 15;
    h *= prime2;
    h ^= h >> 13;
    h *= prime3;
    h ^= h >> 16;
    return h;
  }

  std::uint32_t seed_;  ///< Seed mixed into the hash state
};
/**
 * @brief A `XXHash_64` hash function to hash the given argument on host and device.
 *
 * XXHash_64 implementation from https://github.com/Cyan4973/xxHash
 * (BSD 2-Clause licensed; see the license notice earlier in this file).
 *
 * You can contact the author at:
 * - xxHash homepage: https://www.xxhash.com
 * - xxHash source repository: https://github.com/Cyan4973/xxHash
 *
 * NOTE(review): the extracted text had every `<...>` span stripped; the template
 * parameter lists and template arguments below were reconstructed — confirm
 * against the upstream cuco sources before merging.
 *
 * @tparam Key The type of the values to hash
 */
template <typename Key>
struct XXHash_64 {
 private:
  // XXH64 mixing primes, as specified by the xxHash algorithm
  static constexpr std::uint64_t prime1 = 11400714785074694791ull;
  static constexpr std::uint64_t prime2 = 14029467366897019727ull;
  static constexpr std::uint64_t prime3 = 1609587929392839161ull;
  static constexpr std::uint64_t prime4 = 9650029242287828579ull;
  static constexpr std::uint64_t prime5 = 2870177450012600261ull;

 public:
  using argument_type = Key;            ///< The type of the values taken as argument
  using result_type   = std::uint64_t;  ///< The type of the hash values produced

  /**
   * @brief Constructs a XXH64 hash function with the given `seed`.
   *
   * @param seed A custom number to randomize the resulting hash value
   */
  __host__ __device__ constexpr XXHash_64(std::uint64_t seed = 0) : seed_{seed} {}

  /**
   * @brief Returns a hash value for its argument, as a value of type `result_type`.
   *
   * @param key The input argument to hash
   * @return The resulting hash value for `key`
   */
  constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
  {
    return compute_hash(reinterpret_cast<std::byte const*>(&key),
                        cuco::experimental::extent<std::size_t, sizeof(Key)>{});
  }

  /**
   * @brief Returns a hash value for a byte sequence, as a value of type `result_type`.
   *
   * @tparam Extent The extent type
   *
   * @param bytes The input argument to hash
   * @param size The extent of the data in bytes
   * @return The resulting hash value
   */
  template <typename Extent>
  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
                                                         Extent size) const noexcept
  {
    std::size_t offset = 0;
    std::uint64_t h64;

    // data can be processed in 32-byte chunks
    if (size >= 32) {
      auto const limit = size - 32;
      std::uint64_t v1 = seed_ + prime1 + prime2;
      std::uint64_t v2 = seed_ + prime2;
      std::uint64_t v3 = seed_;
      std::uint64_t v4 = seed_ - prime1;

      do {
        // pipeline 4*8byte computations
        auto const pipeline_offset = offset / 8;
        v1 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 0) * prime2;
        v1 = rotl(v1, 31);
        v1 *= prime1;
        v2 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 1) * prime2;
        v2 = rotl(v2, 31);
        v2 *= prime1;
        v3 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 2) * prime2;
        v3 = rotl(v3, 31);
        v3 *= prime1;
        v4 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 3) * prime2;
        v4 = rotl(v4, 31);
        v4 *= prime1;
        offset += 32;
      } while (offset <= limit);

      h64 = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18);

      // merge the four accumulators into the hash state
      v1 *= prime2;
      v1 = rotl(v1, 31);
      v1 *= prime1;
      h64 ^= v1;
      h64 = h64 * prime1 + prime4;

      v2 *= prime2;
      v2 = rotl(v2, 31);
      v2 *= prime1;
      h64 ^= v2;
      h64 = h64 * prime1 + prime4;

      v3 *= prime2;
      v3 = rotl(v3, 31);
      v3 *= prime1;
      h64 ^= v3;
      h64 = h64 * prime1 + prime4;

      v4 *= prime2;
      v4 = rotl(v4, 31);
      v4 *= prime1;
      h64 ^= v4;
      h64 = h64 * prime1 + prime4;
    } else {
      h64 = seed_ + prime5;
    }

    h64 += size;

    // remaining data can be processed in 8-byte chunks
    if ((size % 32) >= 8) {
      for (; offset <= size - 8; offset += 8) {
        std::uint64_t k1 = load_chunk<std::uint64_t>(bytes, offset / 8) * prime2;
        k1 = rotl(k1, 31) * prime1;
        h64 ^= k1;
        h64 = rotl(h64, 27) * prime1 + prime4;
      }
    }

    // remaining data can be processed in 4-byte chunks
    if ((size % 8) >= 4) {
      for (; offset <= size - 4; offset += 4) {
        h64 ^= (load_chunk<std::uint32_t>(bytes, offset / 4) & 0xffffffffull) * prime1;
        h64 = rotl(h64, 23) * prime2 + prime3;
      }
    }

    // the following loop is only needed if the size of the key is not a multiple of a previous
    // block size
    if (size % 4) {
      while (offset < size) {
        h64 ^= (std::to_integer<std::uint64_t>(bytes[offset]) & 0xff) * prime5;
        h64 = rotl(h64, 11) * prime1;
        ++offset;
      }
    }
    return finalize(h64);
  }

 private:
  // 64-bit left rotate
  constexpr __host__ __device__ std::uint64_t rotl(std::uint64_t h, std::int8_t r) const noexcept
  {
    return ((h << r) | (h >> (64 - r)));
  }

  // avalanche helper: final bit-mixing so every input bit affects every output bit
  constexpr __host__ __device__ std::uint64_t finalize(std::uint64_t h) const noexcept
  {
    h ^= h >> 33;
    h *= prime2;
    h ^= h >> 29;
    h *= prime3;
    h ^= h >> 32;
    return h;
  }

  std::uint64_t seed_;  ///< Seed mixed into the hash state
};

}  // namespace cuco::detail
/**
 * @brief An open addressing impl class.
 *
 * @note This class should NOT be used directly.
 *
 * NOTE(review): the extracted text had every `<...>` span stripped (template
 * parameter lists, template arguments, and kernel launch configurations
 * `<<<...>>>`). All such spans below were reconstructed and MUST be confirmed
 * against the upstream cuco sources; spots flagged TODO are the least certain.
 *
 * @throw If the size of the given key type is larger than 8 bytes
 * @throw If the size of the given slot type is larger than 16 bytes
 * @throw If the given key type doesn't have unique object representations, i.e.,
 * `cuco::bitwise_comparable_v<Key> == false`
 * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base`
 *
 * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v<Key>`
 * @tparam Value Type used for storage values.
 * @tparam Extent Data structure size type
 * @tparam Scope The scope in which operations will be performed by individual threads.
 * @tparam KeyEqual Binary callable type used to compare two keys for equality
 * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for choices)
 * @tparam Allocator Type of allocator used for device storage
 * @tparam Storage Slot window storage type
 */
template <class Key,
          class Value,
          class Extent,
          cuda::thread_scope Scope,
          class KeyEqual,
          class ProbingScheme,
          class Allocator,
          class Storage>
class open_addressing_impl {
  static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes.");

  static_assert(sizeof(Value) <= 16, "Container does not support slot types larger than 16 bytes.");

  static_assert(
    cuco::is_bitwise_comparable_v<Key>,
    "Key type must have unique object representations or have been explicitly declared as safe for "
    "bitwise comparison via specialization of cuco::is_bitwise_comparable_v<Key>.");

  static_assert(
    std::is_base_of_v<cuco::experimental::detail::probing_scheme_base<ProbingScheme::cg_size>,
                      ProbingScheme>,
    "ProbingScheme must inherit from cuco::detail::probing_scheme_base");

 public:
  static constexpr auto cg_size      = ProbingScheme::cg_size;  ///< CG size used for probing
  static constexpr auto window_size  = Storage::window_size;    ///< Window size used for probing
  static constexpr auto thread_scope = Scope;                   ///< CUDA thread scope

  using key_type   = Key;    ///< Key type
  using value_type = Value;  ///< The storage value type, NOT payload type
  /// Extent type -- TODO confirm the exact `make_window_extent` template arguments upstream
  using extent_type =
    decltype(make_window_extent<cg_size, window_size>(std::declval<Extent>()));
  using size_type = typename extent_type::value_type;  ///< Size type
  using key_equal = KeyEqual;                          ///< Key equality comparator type
  /// Storage type -- TODO confirm the `detail::storage` template argument order upstream
  using storage_type   = detail::storage<Storage, value_type, extent_type, Allocator>;
  using allocator_type = typename storage_type::allocator_type;  ///< Allocator type

  using storage_ref_type = typename storage_type::ref_type;  ///< Non-owning window storage ref type
  using probing_scheme_type = ProbingScheme;                 ///< Probe scheme type

  /**
   * @brief Constructs a statically-sized open addressing data structure with the specified initial
   * capacity, sentinel values and CUDA stream.
   *
   * @note The actual capacity depends on the given `capacity`, the probing scheme, CG size, and the
   * window size and it is computed via the `make_window_extent` factory. Insert operations will not
   * automatically grow the container. Attempting to insert more unique keys than the capacity of
   * the container results in undefined behavior.
   * @note Any `*_sentinel`s are reserved and behavior is undefined when attempting to insert
   * this sentinel value.
   * @note If a non-default CUDA stream is provided, the caller is responsible for synchronizing the
   * stream before the object is first used.
   *
   * @param capacity The requested lower-bound size
   * @param empty_key_sentinel The reserved key value for empty slots
   * @param empty_slot_sentinel The reserved slot value for empty slots
   * @param pred Key equality binary predicate
   * @param probing_scheme Probing scheme
   * @param alloc Allocator used for allocating device storage
   * @param stream CUDA stream used to initialize the data structure
   */
  constexpr open_addressing_impl(Extent capacity,
                                 key_type empty_key_sentinel,
                                 value_type empty_slot_sentinel,
                                 KeyEqual const& pred,
                                 ProbingScheme const& probing_scheme,
                                 Allocator const& alloc,
                                 cuda_stream_ref stream) noexcept
    : empty_key_sentinel_{empty_key_sentinel},
      empty_slot_sentinel_{empty_slot_sentinel},
      predicate_{pred},
      probing_scheme_{probing_scheme},
      storage_{make_window_extent<cg_size, window_size>(capacity), alloc}
  {
    this->clear_async(stream);
  }

  /**
   * @brief Erases all elements from the container. After this call, `size()` returns zero.
   * Invalidates any references, pointers, or iterators referring to contained elements.
   *
   * @param stream CUDA stream this operation is executed in
   */
  void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); }

  /**
   * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
   * zero. Invalidates any references, pointers, or iterators referring to contained elements.
   *
   * @param stream CUDA stream this operation is executed in
   */
  void clear_async(cuda_stream_ref stream) noexcept
  {
    storage_.initialize_async(empty_slot_sentinel_, stream);
  }

  /**
   * @brief Inserts all keys in the range `[first, last)` and returns the number of successful
   * insertions.
   *
   * @note This function synchronizes the given stream. For asynchronous execution use
   * `insert_async`.
   *
   * @tparam InputIt Device accessible random access input iterator whose `value_type` is
   * convertible to the container's `value_type`
   * @tparam Ref Type of non-owning device container ref allowing access to storage
   *
   * @param first Beginning of the sequence of keys
   * @param last End of the sequence of keys
   * @param container_ref Non-owning device container ref used to access the slot storage
   * @param stream CUDA stream used for insert
   *
   * @return Number of successfully inserted keys
   */
  template <typename InputIt, typename Ref>
  size_type insert(InputIt first, InputIt last, Ref container_ref, cuda_stream_ref stream)
  {
    auto const num_keys = cuco::detail::distance(first, last);
    if (num_keys == 0) { return 0; }

    // device-side counter accumulating the number of successful insertions
    auto counter =
      detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
    counter.reset(stream);

    auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

    // insert unconditionally by passing an always-true stencil
    auto const always_true = thrust::constant_iterator<bool>{true};
    detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
        first, num_keys, always_true, thrust::identity{}, counter.data(), container_ref);

    return counter.load_to_host(stream);
  }

  /**
   * @brief Asynchronously inserts all keys in the range `[first, last)`.
   *
   * @tparam InputIt Device accessible random access input iterator whose `value_type` is
   * convertible to the container's `value_type`
   * @tparam Ref Type of non-owning device container ref allowing access to storage
   *
   * @param first Beginning of the sequence of keys
   * @param last End of the sequence of keys
   * @param container_ref Non-owning device container ref used to access the slot storage
   * @param stream CUDA stream used for insert
   */
  template <typename InputIt, typename Ref>
  void insert_async(InputIt first, InputIt last, Ref container_ref, cuda_stream_ref stream) noexcept
  {
    auto const num_keys = cuco::detail::distance(first, last);
    if (num_keys == 0) { return; }

    auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

    auto const always_true = thrust::constant_iterator<bool>{true};
    detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
        first, num_keys, always_true, thrust::identity{}, container_ref);
  }

  /**
   * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns
   * true.
   *
   * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true.
   * @note This function synchronizes the given stream and returns the number of successful
   * insertions. For asynchronous execution use `insert_if_async`.
   *
   * @tparam InputIt Device accessible random access iterator whose `value_type` is
   * convertible to the container's `value_type`
   * @tparam StencilIt Device accessible random access iterator whose value_type is
   * convertible to Predicate's argument type
   * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool`
   * @tparam Ref Type of non-owning device container ref allowing access to storage
   *
   * @param first Beginning of the sequence of key/value pairs
   * @param last End of the sequence of key/value pairs
   * @param stencil Beginning of the stencil sequence
   * @param pred Predicate to test on every element in the range `[stencil, stencil +
   * std::distance(first, last))`
   * @param container_ref Non-owning device container ref used to access the slot storage
   * @param stream CUDA stream used for the operation
   *
   * @return Number of successfully inserted keys
   */
  template <typename InputIt, typename StencilIt, typename Predicate, typename Ref>
  size_type insert_if(InputIt first,
                      InputIt last,
                      StencilIt stencil,
                      Predicate pred,
                      Ref container_ref,
                      cuda_stream_ref stream)
  {
    auto const num_keys = cuco::detail::distance(first, last);
    if (num_keys == 0) { return 0; }

    auto counter =
      detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
    counter.reset(stream);

    auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

    detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
        first, num_keys, stencil, pred, counter.data(), container_ref);

    return counter.load_to_host(stream);
  }

  /**
   * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding
   * stencil returns true.
   *
   * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true.
   *
   * @tparam InputIt Device accessible random access iterator whose `value_type` is
   * convertible to the container's `value_type`
   * @tparam StencilIt Device accessible random access iterator whose value_type is
   * convertible to Predicate's argument type
   * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool`
   * @tparam Ref Type of non-owning device container ref allowing access to storage
   *
   * @param first Beginning of the sequence of key/value pairs
   * @param last End of the sequence of key/value pairs
   * @param stencil Beginning of the stencil sequence
   * @param pred Predicate to test on every element in the range `[stencil, stencil +
   * std::distance(first, last))`
   * @param container_ref Non-owning device container ref used to access the slot storage
   * @param stream CUDA stream used for the operation
   */
  template <typename InputIt, typename StencilIt, typename Predicate, typename Ref>
  void insert_if_async(InputIt first,
                       InputIt last,
                       StencilIt stencil,
                       Predicate pred,
                       Ref container_ref,
                       cuda_stream_ref stream) noexcept
  {
    auto const num_keys = cuco::detail::distance(first, last);
    if (num_keys == 0) { return; }

    auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

    detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
        first, num_keys, stencil, pred, container_ref);
  }

  /**
   * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in
   * the container.
   *
   * @tparam InputIt Device accessible input iterator
   * @tparam OutputIt Device accessible output iterator assignable from `bool`
   * @tparam Ref Type of non-owning device container ref allowing access to storage
   *
   * @param first Beginning of the sequence of keys
   * @param last End of the sequence of keys
   * @param output_begin Beginning of the sequence of booleans for the presence of each key
   * @param container_ref Non-owning device container ref used to access the slot storage
   * @param stream Stream used for executing the kernels
   */
  template <typename InputIt, typename OutputIt, typename Ref>
  void contains_async(InputIt first,
                      InputIt last,
                      OutputIt output_begin,
                      Ref container_ref,
                      cuda_stream_ref stream) const noexcept
  {
    auto const num_keys = cuco::detail::distance(first, last);
    if (num_keys == 0) { return; }

    auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

    auto const always_true = thrust::constant_iterator<bool>{true};
    detail::contains_if_n<cg_size, cuco::detail::default_block_size()>
      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
        first, num_keys, always_true, thrust::identity{}, output_begin, container_ref);
  }

  /**
   * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in
   * the container if `pred` of the corresponding stencil returns true.
   *
   * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)`
   * indicating if the key `*(first + i)` is present in the container. If `pred( *(stencil + i) )`
   * is false, stores false to `(output_begin + i)`.
   *
   * @tparam InputIt Device accessible input iterator
   * @tparam StencilIt Device accessible random access iterator whose value_type is
   * convertible to Predicate's argument type
   * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool`
   * @tparam OutputIt Device accessible output iterator assignable from `bool`
   * @tparam Ref Type of non-owning device container ref allowing access to storage
   *
   * @param first Beginning of the sequence of keys
   * @param last End of the sequence of keys
   * @param stencil Beginning of the stencil sequence
   * @param pred Predicate to test on every element in the range `[stencil, stencil +
   * std::distance(first, last))`
   * @param output_begin Beginning of the sequence of booleans for the presence of each key
   * @param container_ref Non-owning device container ref used to access the slot storage
   * @param stream Stream used for executing the kernels
   */
  template <typename InputIt,
            typename StencilIt,
            typename Predicate,
            typename OutputIt,
            typename Ref>
  void contains_if_async(InputIt first,
                         InputIt last,
                         StencilIt stencil,
                         Predicate pred,
                         OutputIt output_begin,
                         Ref container_ref,
                         cuda_stream_ref stream) const noexcept
  {
    auto const num_keys = cuco::detail::distance(first, last);
    if (num_keys == 0) { return; }

    auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

    detail::contains_if_n<cg_size, cuco::detail::default_block_size()>
      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
        first, num_keys, stencil, pred, output_begin, container_ref);
  }

  /**
   * @brief Retrieves all keys contained in the container.
   *
   * @note This API synchronizes the given stream.
   * @note The order in which keys are returned is implementation defined and not guaranteed to be
   * consistent between subsequent calls to `retrieve_all`.
   * @note Behavior is undefined if the range beginning at `output_begin` is smaller than the return
   * value of `size()`.
   *
   * @tparam InputIt Device accessible container slot iterator
   * @tparam OutputIt Device accessible random access output iterator whose `value_type` is
   * convertible from the container's `value_type`
   * @tparam Predicate Type of predicate indicating if the given slot is filled
   *
   * @param begin Beginning of the container slot iterator
   * @param output_begin Beginning output iterator for keys
   * @param is_filled Predicate indicating if the given slot is filled
   * @param stream CUDA stream used for this operation
   *
   * @return Iterator indicating the end of the output
   */
  template <typename InputIt, typename OutputIt, typename Predicate>
  [[nodiscard]] OutputIt retrieve_all(InputIt begin,
                                      OutputIt output_begin,
                                      Predicate const& is_filled,
                                      cuda_stream_ref stream) const
  {
    std::size_t temp_storage_bytes = 0;
    using temp_allocator_type =
      typename std::allocator_traits<allocator_type>::template rebind_alloc<char>;
    auto temp_allocator = temp_allocator_type{this->allocator()};
    auto d_num_out      = reinterpret_cast<size_type*>(
      std::allocator_traits<temp_allocator_type>::allocate(temp_allocator, sizeof(size_type)));
    // first call queries the required temporary storage size (null storage pointer)
    CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr,
                                        temp_storage_bytes,
                                        begin,
                                        output_begin,
                                        d_num_out,
                                        this->capacity(),
                                        is_filled,
                                        stream));

    // Allocate temporary storage
    auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes);

    CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
                                        temp_storage_bytes,
                                        begin,
                                        output_begin,
                                        d_num_out,
                                        this->capacity(),
                                        is_filled,
                                        stream));

    size_type h_num_out;
    CUCO_CUDA_TRY(
      cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
    // synchronize so h_num_out is valid before it is read on the host below
    stream.synchronize();
    std::allocator_traits<temp_allocator_type>::deallocate(
      temp_allocator, reinterpret_cast<char*>(d_num_out), sizeof(size_type));
    temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);

    return output_begin + h_num_out;
  }

  /**
   * @brief Gets the number of elements in the container.
   *
   * @note This function synchronizes the given stream.
   *
   * @tparam Predicate Type of predicate indicating if the given slot is filled
   *
   * @param is_filled Predicate indicating if the given slot is filled
   * @param stream CUDA stream used to get the number of inserted elements
   *
   * @return The number of elements in the container
   */
  template <typename Predicate>
  [[nodiscard]] size_type size(Predicate const& is_filled, cuda_stream_ref stream) const noexcept
  {
    auto counter =
      detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
    counter.reset(stream);

    auto const grid_size = cuco::detail::grid_size(storage_.num_windows());

    // TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to
    // v2.1.0
    detail::size<cuco::detail::default_block_size()>
      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
        storage_.ref(), is_filled, counter.data());

    return counter.load_to_host(stream);
  }

  /**
   * @brief Gets the maximum number of elements the container can hold.
   *
   * @return The maximum number of elements the container can hold
   */
  [[nodiscard]] constexpr auto capacity() const noexcept { return storage_.capacity(); }

  /**
   * @brief Gets the sentinel value used to represent an empty key slot.
   *
   * @return The sentinel value used to represent an empty key slot
   */
  [[nodiscard]] constexpr key_type empty_key_sentinel() const noexcept
  {
    return empty_key_sentinel_;
  }

  /**
   * @brief Gets the key comparator.
   *
   * @return The comparator used to compare keys
   */
  [[nodiscard]] constexpr key_equal key_eq() const noexcept { return predicate_; }

  /**
   * @brief Gets the probing scheme.
   *
   * @return The probing scheme used for the container
   */
  [[nodiscard]] constexpr probing_scheme_type const& probing_scheme() const noexcept
  {
    return probing_scheme_;
  }

  /**
   * @brief Gets the container allocator.
   *
   * @return The container allocator
   */
  [[nodiscard]] constexpr allocator_type allocator() const noexcept { return storage_.allocator(); }

  /**
   * @brief Gets the non-owning storage ref.
   *
   * @return The non-owning storage ref of the container
   */
  [[nodiscard]] constexpr storage_ref_type storage_ref() const noexcept { return storage_.ref(); }

 protected:
  key_type empty_key_sentinel_;         ///< Key value that represents an empty slot
  value_type empty_slot_sentinel_;      ///< Slot value that represents an empty slot
  key_equal predicate_;                 ///< Key equality binary predicate
  probing_scheme_type probing_scheme_;  ///< Probing scheme
  storage_type storage_;                ///< Slot window storage
};

}  // namespace detail
}  // namespace experimental
}  // namespace cuco

// --- include/cuco/detail/open_addressing_ref_impl.cuh (next file in the patch;
// license header, includes, and the truncated open_addressing_ref_impl class
// are outside the scope of this reconstruction) --------------------------------

/// Three-way insert result enum
enum class insert_result : int32_t { CONTINUE = 0, SUCCESS = 1, DUPLICATE = 2 };
+ */ +struct window_probing_results { + detail::equal_result state_; ///< Equal result + int32_t intra_window_index_; ///< Intra-window index + + /** + * @brief Constructs window_probing_results. + * + * @param state The three way equality result + * @param index Intra-window index + */ + __device__ explicit constexpr window_probing_results(detail::equal_result state, + int32_t index) noexcept + : state_{state}, intra_window_index_{index} + { + } +}; + +/** + * @brief Common device non-owning "ref" implementation class. + * + * @note This class should NOT be used directly. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for options) + * @tparam StorageRef Storage ref type + */ +template +class open_addressing_ref_impl { + static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + + static_assert( + cuco::is_bitwise_comparable_v, + "Key type must have unique object representations or have been explicitly declared as safe for " + "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); + + static_assert( + std::is_base_of_v, + ProbingScheme>, + "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); + + // TODO: how to re-enable this check? 
+ // static_assert(is_window_extent_v, + // "Extent is not a valid cuco::window_extent"); + + public: + using key_type = Key; ///< Key type + using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using storage_ref_type = StorageRef; ///< Type of storage ref + using window_type = typename storage_ref_type::window_type; ///< Window type + using value_type = typename storage_ref_type::value_type; ///< Storage element type + using extent_type = typename storage_ref_type::extent_type; ///< Extent type + using size_type = typename storage_ref_type::size_type; ///< Probing scheme size type + using iterator = typename storage_ref_type::iterator; ///< Slot iterator type + using const_iterator = typename storage_ref_type::const_iterator; ///< Const slot iterator type + + static constexpr auto cg_size = probing_scheme_type::cg_size; ///< Cooperative group size + static constexpr auto window_size = + storage_ref_type::window_size; ///< Number of elements handled per window + + /** + * @brief Constructs open_addressing_ref_impl. + * + * @param empty_slot_sentinel Sentinel indicating an empty slot + * @param probing_scheme Probing scheme + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr open_addressing_ref_impl( + value_type empty_slot_sentinel, + probing_scheme_type const& probing_scheme, + storage_ref_type storage_ref) noexcept + : empty_slot_sentinel_{empty_slot_sentinel}, + probing_scheme_{probing_scheme}, + storage_ref_{storage_ref} + { + } + + /** + * @brief Gets the maximum number of elements the container can hold. + * + * @return The maximum number of elements the container can hold + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept + { + return storage_ref_.capacity(); + } + + /** + * @brief Returns a const_iterator to one past the last slot. 
+ * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr const_iterator end() const noexcept + { + return storage_ref_.end(); + } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept { return storage_ref_.end(); } + + /** + * @brief Inserts an element. + * + * @tparam HasPayload Boolean indicating it's a set or map implementation + * @tparam Predicate Predicate type + * + * @param value The element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return True if the given element is successfully inserted + */ + template + __device__ bool insert(value_type const& value, Predicate const& predicate) noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + + auto const key = [&]() { + if constexpr (HasPayload) { + return value.first; + } else { + return value; + } + }(); + auto probing_iter = probing_scheme_(key, storage_ref_.window_extent()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + for (auto& slot_content : window_slots) { + auto const eq_res = predicate(slot_content, key); + + // If the key is already in the container, return false + if (eq_res == detail::equal_result::EQUAL) { return false; } + if (eq_res == detail::equal_result::EMPTY) { + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + switch (attempt_insert( + (storage_ref_.data() + *probing_iter)->data() + intra_window_index, value, predicate)) { + case insert_result::CONTINUE: continue; + case insert_result::SUCCESS: return true; + case insert_result::DUPLICATE: return false; + } + } + } + ++probing_iter; + } + } + + /** + * @brief Inserts an element. 
+ * + * @tparam HasPayload Boolean indicating it's a set or map implementation + * @tparam Predicate Predicate type + * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return True if the given element is successfully inserted + */ + template + __device__ bool insert(cooperative_groups::thread_block_tile const& group, + value_type const& value, + Predicate const& predicate) noexcept + { + auto const key = [&]() { + if constexpr (HasPayload) { + return value.first; + } else { + return value; + } + }(); + auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (predicate(window_slots[i], key)) { + case detail::equal_result::EMPTY: + return window_probing_results{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: + return window_probing_results{detail::equal_result::EQUAL, i}; + default: continue; + } + } + // returns dummy index `-1` for UNEQUAL + return window_probing_results{detail::equal_result::UNEQUAL, -1}; + }(); + + // If the key is already in the container, return false + if (group.any(state == detail::equal_result::EQUAL)) { return false; } + + auto const group_contains_empty = group.ballot(state == detail::equal_result::EMPTY); + + if (group_contains_empty) { + auto const src_lane = __ffs(group_contains_empty) - 1; + auto const status = + (group.thread_rank() == src_lane) + ? 
attempt_insert( + (storage_ref_.data() + *probing_iter)->data() + intra_window_index, + value, + predicate) + : insert_result::CONTINUE; + + switch (group.shfl(status, src_lane)) { + case insert_result::SUCCESS: return true; + case insert_result::DUPLICATE: return false; + default: continue; + } + } else { + ++probing_iter; + } + } + } + + /** + * @brief Inserts the given element into the container. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. + * + * @tparam HasPayload Boolean indicating it's a set or map implementation + * @tparam Predicate Predicate type + * + * @param value The element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. + */ + template + __device__ thrust::pair insert_and_find(value_type const& value, + Predicate const& predicate) noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + + auto const key = [&]() { + if constexpr (HasPayload) { + return value.first; + } else { + return value; + } + }(); + auto probing_iter = probing_scheme_(key, storage_ref_.window_extent()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + for (auto i = 0; i < window_size; ++i) { + auto const eq_res = predicate(window_slots[i], key); + auto* window_ptr = (storage_ref_.data() + *probing_iter)->data(); + + // If the key is already in the container, return false + if (eq_res == detail::equal_result::EQUAL) { return {iterator{&window_ptr[i]}, false}; } + if (eq_res == detail::equal_result::EMPTY) { + switch ([&]() { + if constexpr (sizeof(value_type) <= 8) { + return packed_cas(window_ptr + i, value, predicate); + } else { + return 
cas_dependent_write(window_ptr + i, value, predicate); + } + }()) { + case insert_result::SUCCESS: { + return {iterator{&window_ptr[i]}, true}; + } + case insert_result::DUPLICATE: { + return {iterator{&window_ptr[i]}, false}; + } + default: continue; + } + } + } + ++probing_iter; + }; + } + + /** + * @brief Inserts the given element into the container. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. + * + * @tparam HasPayload Boolean indicating it's a set or map implementation + * @tparam Predicate Predicate type + * + * @param group The Cooperative Group used to perform group insert_and_find + * @param value The element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. 
+ */ + template + __device__ thrust::pair insert_and_find( + cooperative_groups::thread_block_tile const& group, + value_type const& value, + Predicate const& predicate) noexcept + { + auto const key = [&]() { + if constexpr (HasPayload) { + return value.first; + } else { + return value; + } + }(); + auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (predicate(window_slots[i], key)) { + case detail::equal_result::EMPTY: + return window_probing_results{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: + return window_probing_results{detail::equal_result::EQUAL, i}; + default: continue; + } + } + // returns dummy index `-1` for UNEQUAL + return window_probing_results{detail::equal_result::UNEQUAL, -1}; + }(); + + auto* slot_ptr = (storage_ref_.data() + *probing_iter)->data() + intra_window_index; + + // If the key is already in the container, return false + auto const group_finds_equal = group.ballot(state == detail::equal_result::EQUAL); + if (group_finds_equal) { + auto const src_lane = __ffs(group_finds_equal) - 1; + auto const res = group.shfl(reinterpret_cast(slot_ptr), src_lane); + return {iterator{reinterpret_cast(res)}, false}; + } + + auto const group_contains_empty = group.ballot(state == detail::equal_result::EMPTY); + if (group_contains_empty) { + auto const src_lane = __ffs(group_contains_empty) - 1; + auto const res = group.shfl(reinterpret_cast(slot_ptr), src_lane); + auto const status = [&]() { + if (group.thread_rank() != src_lane) { return insert_result::CONTINUE; } + if constexpr (sizeof(value_type) <= 8) { + return packed_cas(slot_ptr, value, predicate); + } else { + return cas_dependent_write(slot_ptr, value, predicate); + } + }(); + + switch (group.shfl(status, src_lane)) { + case insert_result::SUCCESS: { + return 
{iterator{reinterpret_cast(res)}, true}; + } + case insert_result::DUPLICATE: { + return {iterator{reinterpret_cast(res)}, false}; + } + default: continue; + } + } else { + ++probing_iter; + } + } + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns + * false. + * + * @tparam ProbeKey Probe key type + * @tparam Predicate Predicate type + * + * @param key The key to search for + * @param predicate Predicate used to compare slot content against `key` + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains(ProbeKey const& key, + Predicate const& predicate) const noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + auto probing_iter = probing_scheme_(key, storage_ref_.window_extent()); + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const window_slots = storage_ref_[*probing_iter]; + + for (auto& slot_content : window_slots) { + switch (predicate(slot_content, key)) { + case detail::equal_result::UNEQUAL: continue; + case detail::equal_result::EMPTY: return false; + case detail::equal_result::EQUAL: return true; + } + } + ++probing_iter; + } + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns + * false. 
+ * + * @tparam ProbeKey Probe key type + * @tparam Predicate Predicate type + * + * @param group The Cooperative Group used to perform group contains + * @param key The key to search for + * @param predicate Predicate used to compare slot content against `key` + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains( + cooperative_groups::thread_block_tile const& group, + ProbeKey const& key, + Predicate const& predicate) const noexcept + { + auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + auto const state = [&]() { + for (auto& slot : window_slots) { + switch (predicate(slot, key)) { + case detail::equal_result::EMPTY: return detail::equal_result::EMPTY; + case detail::equal_result::EQUAL: return detail::equal_result::EQUAL; + default: continue; + } + } + return detail::equal_result::UNEQUAL; + }(); + + if (group.any(state == detail::equal_result::EQUAL)) { return true; } + if (group.any(state == detail::equal_result::EMPTY)) { return false; } + + ++probing_iter; + } + } + + /** + * @brief Finds an element in the container with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. 
+ * + * @tparam ProbeKey Probe key type + * @tparam Predicate Predicate type + * + * @param key The key to search for + * @param predicate Predicate used to compare slot content against `key` + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator find(ProbeKey const& key, + Predicate const& predicate) const noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + auto probing_iter = probing_scheme_(key, storage_ref_.window_extent()); + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const window_slots = storage_ref_[*probing_iter]; + + for (auto i = 0; i < window_size; ++i) { + switch (predicate(window_slots[i], key)) { + case detail::equal_result::EMPTY: { + return this->end(); + } + case detail::equal_result::EQUAL: { + return const_iterator{&(*(storage_ref_.data() + *probing_iter))[i]}; + } + default: continue; + } + } + ++probing_iter; + } + } + + /** + * @brief Finds an element in the container with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. 
+ * + * @tparam ProbeKey Probe key type + * @tparam Predicate Predicate type + * + * @param group The Cooperative Group used to perform this operation + * @param key The key to search for + * @param predicate Predicate used to compare slot content against `key` + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator + find(cooperative_groups::thread_block_tile const& group, + ProbeKey const& key, + Predicate const& predicate) const noexcept + { + auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (predicate(window_slots[i], key)) { + case detail::equal_result::EMPTY: + return window_probing_results{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: + return window_probing_results{detail::equal_result::EQUAL, i}; + default: continue; + } + } + // returns dummy index `-1` for UNEQUAL + return window_probing_results{detail::equal_result::UNEQUAL, -1}; + }(); + + // Find a match for the probe key, thus return an iterator to the entry + auto const group_finds_match = group.ballot(state == detail::equal_result::EQUAL); + if (group_finds_match) { + auto const src_lane = __ffs(group_finds_match) - 1; + auto const res = group.shfl( + reinterpret_cast(&(*(storage_ref_.data() + *probing_iter))[intra_window_index]), + src_lane); + return const_iterator{reinterpret_cast(res)}; + } + + // Find an empty slot, meaning that the probe key isn't present in the container + if (group.any(state == detail::equal_result::EMPTY)) { return this->end(); } + + ++probing_iter; + } + } + + /** + * @brief Compares the content of the address `address` (old value) with the `expected` value and, + * only if they are the same, sets the content of `address` to `desired`. 
+ * + * @tparam T Address content type + * + * @param address The target address + * @param expected The value expected to be found at the target address + * @param desired The value to store at the target address if it is as expected + * + * @return The old value located at address `address` + */ + template + __device__ constexpr auto compare_and_swap(T* address, T expected, T desired) + { + // temporary workaround due to performance regression + // https://github.com/NVIDIA/libcudacxx/issues/366 + if constexpr (sizeof(T) == sizeof(unsigned int)) { + auto* const slot_ptr = reinterpret_cast(address); + auto const* const expected_ptr = reinterpret_cast(&expected); + auto const* const desired_ptr = reinterpret_cast(&desired); + if constexpr (Scope == cuda::thread_scope_system) { + return atomicCAS_system(slot_ptr, *expected_ptr, *desired_ptr); + } else if constexpr (Scope == cuda::thread_scope_device) { + return atomicCAS(slot_ptr, *expected_ptr, *desired_ptr); + } else if constexpr (Scope == cuda::thread_scope_block) { + return atomicCAS_block(slot_ptr, *expected_ptr, *desired_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } else if constexpr (sizeof(T) == sizeof(unsigned long long int)) { + auto* const slot_ptr = reinterpret_cast(address); + auto const* const expected_ptr = reinterpret_cast(&expected); + auto const* const desired_ptr = reinterpret_cast(&desired); + if constexpr (Scope == cuda::thread_scope_system) { + return atomicCAS_system(slot_ptr, *expected_ptr, *desired_ptr); + } else if constexpr (Scope == cuda::thread_scope_device) { + return atomicCAS(slot_ptr, *expected_ptr, *desired_ptr); + } else if constexpr (Scope == cuda::thread_scope_block) { + return atomicCAS_block(slot_ptr, *expected_ptr, *desired_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + } + + /** + * @brief Atomically stores `value` at the given `address`. 
+ * + * @tparam T Address content type + * + * @param address The target address + * @param value The value to store + */ + template + __device__ constexpr void atomic_store(T* address, T value) + { + if constexpr (sizeof(T) == sizeof(unsigned int)) { + auto* const slot_ptr = reinterpret_cast(address); + auto const* const value_ptr = reinterpret_cast(&value); + if constexpr (Scope == cuda::thread_scope_system) { + atomicExch_system(slot_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_device) { + atomicExch(slot_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_block) { + atomicExch_block(slot_ptr, *value_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } else if constexpr (sizeof(T) == sizeof(unsigned long long int)) { + auto* const slot_ptr = reinterpret_cast(address); + auto const* const value_ptr = reinterpret_cast(&value); + if constexpr (Scope == cuda::thread_scope_system) { + atomicExch_system(slot_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_device) { + atomicExch(slot_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_block) { + atomicExch_block(slot_ptr, *value_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + } + + /** + * @brief Gets the sentinel used to represent an empty slot. + * + * @return The sentinel value used to represent an empty slot + */ + [[nodiscard]] __device__ constexpr value_type empty_slot_sentinel() const noexcept + { + return empty_slot_sentinel_; + } + + /** + * @brief Gets the probing scheme. + * + * @return The probing scheme used for the container + */ + [[nodiscard]] __device__ constexpr probing_scheme_type const& probing_scheme() const noexcept + { + return probing_scheme_; + } + + /** + * @brief Gets the non-owning storage ref. 
+ * + * @return The non-owning storage ref of the container + */ + [[nodiscard]] __device__ constexpr storage_ref_type storage_ref() const noexcept + { + return storage_ref_; + } + + private: + /** + * @brief Inserts the specified element with one single CAS operation. + * + * @tparam HasPayload Boolean indicating it's a set or map implementation + * @tparam Predicate Predicate type + * + * @param slot Pointer to the slot in memory + * @param value Element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + template + [[nodiscard]] __device__ constexpr insert_result packed_cas(value_type* slot, + value_type const& value, + Predicate const& predicate) noexcept + { + auto old = compare_and_swap(slot, this->empty_slot_sentinel_, value); + auto* old_ptr = reinterpret_cast(&old); + auto const inserted = [&]() { + if constexpr (HasPayload) { + // If it's a map implementation, compare keys only + return cuco::detail::bitwise_compare(old_ptr->first, this->empty_slot_sentinel_.first); + } else { + // If it's a set implementation, compare the whole slot content + return cuco::detail::bitwise_compare(*old_ptr, this->empty_slot_sentinel_); + } + }(); + if (inserted) { + return insert_result::SUCCESS; + } else { + // Shouldn't use `predicate` operator directly since it includes a redundant bitwise compare + auto const res = [&]() { + if constexpr (HasPayload) { + // If it's a map implementation, compare keys only + return predicate.equal_to(old_ptr->first, value.first); + } else { + // If it's a set implementation, compare the whole slot content + return predicate.equal_to(*old_ptr, value); + } + }(); + return res == detail::equal_result::EQUAL ? insert_result::DUPLICATE + : insert_result::CONTINUE; + } + } + + /** + * @brief Inserts the specified element with two back-to-back CAS operations. 
+ * + * @tparam Predicate Predicate type + * + * @param slot Pointer to the slot in memory + * @param value Element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + template + [[nodiscard]] __device__ constexpr insert_result back_to_back_cas( + value_type* slot, value_type const& value, Predicate const& predicate) noexcept + { + auto const expected_key = this->empty_slot_sentinel_.first; + auto const expected_payload = this->empty_slot_sentinel_.second; + + auto old_key = compare_and_swap(&slot->first, expected_key, value.first); + auto old_payload = compare_and_swap(&slot->second, expected_payload, value.second); + + using mapped_type = decltype(expected_payload); + + auto* old_key_ptr = reinterpret_cast(&old_key); + auto* old_payload_ptr = reinterpret_cast(&old_payload); + + // if key success + if (cuco::detail::bitwise_compare(*old_key_ptr, expected_key)) { + while (not cuco::detail::bitwise_compare(*old_payload_ptr, expected_payload)) { + old_payload = compare_and_swap(&slot->second, expected_payload, value.second); + } + return insert_result::SUCCESS; + } else if (cuco::detail::bitwise_compare(*old_payload_ptr, expected_payload)) { + atomic_store(&slot->second, expected_payload); + } + + // Our key was already present in the slot, so our key is a duplicate + // Shouldn't use `predicate` operator directly since it includes a redundant bitwise compare + if (predicate.equal_to(*old_key_ptr, value.first) == detail::equal_result::EQUAL) { + return insert_result::DUPLICATE; + } + + return insert_result::CONTINUE; + } + + /** + * @brief Inserts the specified element with CAS-dependent write operations. 
+ * + * @tparam Predicate Predicate type + * + * @param slot Pointer to the slot in memory + * @param value Element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + template + [[nodiscard]] __device__ constexpr insert_result cas_dependent_write( + value_type* slot, value_type const& value, Predicate const& predicate) noexcept + { + auto const expected_key = this->empty_slot_sentinel_.first; + + auto old_key = compare_and_swap(&slot->first, expected_key, value.first); + + auto* old_key_ptr = reinterpret_cast(&old_key); + + // if key success + if (cuco::detail::bitwise_compare(*old_key_ptr, expected_key)) { + atomic_store(&slot->second, value.second); + return insert_result::SUCCESS; + } + + // Our key was already present in the slot, so our key is a duplicate + // Shouldn't use `predicate` operator directly since it includes a redundant bitwise compare + if (predicate.equal_to(*old_key_ptr, value.first) == detail::equal_result::EQUAL) { + return insert_result::DUPLICATE; + } + + return insert_result::CONTINUE; + } + + /** + * @brief Attempts to insert an element into a slot. + * + * @note Dispatches the correct implementation depending on the container + * type and presence of other operator mixins. 
+ *
+ * @tparam HasPayload Boolean indicating it's a set or map implementation
+ * @tparam Predicate Predicate type
+ *
+ * @param slot Pointer to the slot in memory
+ * @param value Element to insert
+ * @param predicate Predicate used to compare slot content against `key`
+ *
+ * @return Result of this operation, i.e., success/continue/duplicate
+ */
+ template
+ [[nodiscard]] __device__ insert_result attempt_insert(value_type* slot,
+ value_type const& value,
+ Predicate const& predicate) noexcept
+ {
+ if constexpr (sizeof(value_type) <= 8) {
+ return packed_cas(slot, value, predicate);
+ } else {
+#if (__CUDA_ARCH__ < 700)
+ return cas_dependent_write(slot, value, predicate);
+#else
+ return back_to_back_cas(slot, value, predicate);
+#endif
+ }
+ }
+
+ value_type empty_slot_sentinel_; ///< Sentinel value indicating an empty slot
+ probing_scheme_type probing_scheme_; ///< Probing scheme
+ storage_ref_type storage_ref_; ///< Slot storage ref
+};
+
+} // namespace detail
+} // namespace experimental
+} // namespace cuco
diff --git a/include/cuco/detail/operator.inl b/include/cuco/detail/operator.inl
new file mode 100644
index 000000000..fdd5884e8
--- /dev/null
+++ b/include/cuco/detail/operator.inl
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief CRTP mixin which augments a given `Reference` with an `Operator`. + * + * @throw If the operator is not defined in `include/cuco/operator.hpp` + * + * @tparam Operator Operator type, i.e., `cuco::op::*_tag` + * @tparam Reference The reference type. + * + * @note This primary template should never be instantiated. + */ +template +class operator_impl { + static_assert(cuco::dependent_false, + "Operator type is not supported by reference type."); +}; + +/** + * @brief Checks if the given `Operator` is contained in a list of `Operators`. + * + * @tparam Operator Operator type, i.e., `cuco::op::*_tag` + * @tparam Operators List of operators to search in + * + * @return `true` if `Operator` is contained in `Operators`, `false` otherwise. + */ +template +static constexpr bool has_operator() +{ + return ((std::is_same_v) || ...); +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/pair.cuh b/include/cuco/detail/pair.cuh deleted file mode 100644 index 7ea39889c..000000000 --- a/include/cuco/detail/pair.cuh +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include - -#include -#include -#include - -namespace cuco { -namespace detail { - -/** - * @brief Rounds `v` to the nearest power of 2 greater than or equal to `v`. - * - * @param v - * @return The nearest power of 2 greater than or equal to `v`. - */ -constexpr std::size_t next_pow2(std::size_t v) noexcept -{ - --v; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return ++v; -} - -/** - * @brief Gives value to use as alignment for a pair type that is at least the - * size of the sum of the size of the first type and second type, or 16, - * whichever is smaller. - */ -template -constexpr std::size_t pair_alignment() -{ - return std::min(std::size_t{16}, next_pow2(sizeof(First) + sizeof(Second))); -} - -template -struct is_std_pair_like : std::false_type { -}; - -template -struct is_std_pair_like< - T, - std::void_t(std::declval())), decltype(std::get<1>(std::declval()))>> - : std::conditional_t::value == 2, std::true_type, std::false_type> { -}; - -template -struct is_thrust_pair_like_impl : std::false_type { -}; - -template -struct is_thrust_pair_like_impl(std::declval())), - decltype(thrust::get<1>(std::declval()))>> - : std::conditional_t::value == 2, std::true_type, std::false_type> { -}; - -template -struct is_thrust_pair_like - : is_thrust_pair_like_impl< - std::remove_reference_t()))>> { -}; - -/** - * @brief Denotes the equivalent packed type based on the size of the object. - * - * @tparam N The size of the object - */ -template -struct packed { - using type = void; ///< `void` type by default -}; -/** - * @brief Denotes the packed type when the size of the object is 8. - */ -template <> -struct packed { - using type = uint64_t; ///< Packed type as `uint64_t` if the size of the object is 8 -}; -/** - * @brief Denotes the packed type when the size of the object is 4. 
- */ -template <> -struct packed { - using type = uint32_t; ///< Packed type as `uint32_t` if the size of the object is 4 -}; -template -using packed_t = typename packed::type; - -/** - * @brief Indicates if a pair type can be packed. - * - * When the size of the key,value pair being inserted into the hash table is - * equal in size to a type where atomicCAS is natively supported, it is more - * efficient to "pack" the pair and insert it with a single atomicCAS. - * - * Pair types whose key and value have the same object representation may be - * packed. Also, the `pair_type` must not contain any padding bits otherwise - * accessing the packed value would be undefined. - * - * @tparam pair_type The pair type that will be packed - * - * @return true If the pair type can be packed - * @return false If the pair type cannot be packed - */ -template -constexpr bool is_packable() -{ - return not std::is_void>::value and - std::has_unique_object_representations_v; -} - -/** - * @brief Allows viewing a pair in a packed representation. - * - * Used as an optimization for inserting when a pair can be inserted with a - * single atomicCAS - */ -template -union pair_converter { - using packed_type = packed_t; ///< The packed pair type - packed_type packed; ///< The pair in the packed representation - pair_type pair; ///< The pair in the pair representation - - /** - * @brief Constructs a pair converter by copying from `p` - * - * @tparam T Type that is convertible to `pair_type` - * - * @param p The pair to copy from - */ - template - __device__ pair_converter(T&& p) : pair{p} - { - } - - /** - * @brief Constructs a pair converter by copying from `p` - * - * @param p The packed data to copy from - */ - __device__ pair_converter(packed_type p) : packed{p} {} -}; - -} // namespace detail - -/** - * @brief Custom pair type - * - * This is necessary because `thrust::pair` is under aligned. 
- * - * @tparam First Type of the first value in the pair - * @tparam Second Type of the second value in the pair - */ -template -struct alignas(detail::pair_alignment()) pair { - using first_type = First; ///< Type of the first value in the pair - using second_type = Second; ///< Type of the second value in the pair - - pair() = default; - ~pair() = default; - pair(pair const&) = default; ///< Copy constructor - pair(pair&&) = default; ///< Move constructor - - /** - * @brief Replaces the contents of the pair with another pair. - * - * @return Reference of the current pair object - */ - pair& operator=(pair const&) = default; - - /** - * @brief Replaces the contents of the pair with another pair. - * - * @return Reference of the current pair object - */ - pair& operator=(pair&&) = default; - - /** - * @brief Constructs a pair from objects `f` and `s`. - * - * @param f The object to copy into `first` - * @param s The object to copy into `second` - */ - __host__ __device__ constexpr pair(First const& f, Second const& s) : first{f}, second{s} {} - - /** - * @brief Constructs a pair by copying from the given pair `p`. - * - * @tparam F Type of the first value of `p` - * @tparam S Type of the second value of `p` - * - * @param p The pair to copy from - */ - template - __host__ __device__ constexpr pair(pair const& p) : first{p.first}, second{p.second} - { - } - - /** - * @brief Constructs a pair from the given std::pair-like `p`. - * - * @tparam T Type of the pair to copy from - * - * @param p The input pair to copy from - */ - template ::value>* = nullptr> - __host__ __device__ constexpr pair(T const& p) - : pair{std::get<0>(thrust::raw_reference_cast(p)), std::get<1>(thrust::raw_reference_cast(p))} - { - } - - /** - * @brief Constructs a pair from the given thrust::pair-like `p`. 
- * - * @tparam T Type of the pair to copy from - * - * @param p The input pair to copy from - */ - template ::value>* = nullptr> - __host__ __device__ constexpr pair(T const& p) - : pair{thrust::get<0>(thrust::raw_reference_cast(p)), - thrust::get<1>(thrust::raw_reference_cast(p))} - { - } - - First first; ///< The first value in the pair - Second second; ///< The second value in the pair -}; - -template -using pair_type = cuco::pair; - -/** - * @brief Creates a pair of type `pair_type` - * - * @tparam F - * @tparam S - * - * @param f - * @param s - * @return pair_type with first element `f` and second element `s`. - */ -template -__host__ __device__ pair_type make_pair(F&& f, S&& s) noexcept -{ - return pair_type{std::forward(f), std::forward(s)}; -} - -} // namespace cuco diff --git a/include/cuco/detail/pair.inl b/include/cuco/detail/pair.inl new file mode 100644 index 000000000..56d16e4fb --- /dev/null +++ b/include/cuco/detail/pair.inl @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace cuco { + +template +__host__ __device__ constexpr pair::pair(First const& f, Second const& s) + : first{f}, second{s} +{ +} + +template +template +__host__ __device__ constexpr pair::pair(pair const& p) + : first{p.first}, second{p.second} +{ +} + +template +__host__ __device__ constexpr pair, std::decay_t> make_pair(F&& f, + S&& s) noexcept +{ + return pair, std::decay_t>(std::forward(f), std::forward(s)); +} + +template +__host__ __device__ constexpr bool operator==(cuco::pair const& lhs, + cuco::pair const& rhs) noexcept +{ + return lhs.first == rhs.first and lhs.second == rhs.second; +} + +} // namespace cuco diff --git a/include/cuco/detail/prime.hpp b/include/cuco/detail/prime.hpp index 93ddde1a0..c788fa245 100644 --- a/include/cuco/detail/prime.hpp +++ b/include/cuco/detail/prime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,12 +16,18 @@ #pragma once -#include +#include + +#include +#include +#include +#include namespace cuco { namespace detail { -constexpr std::array primes = { +// TODO use CTAD instead of explicitly specifying the array size once we drop support for nvcc <11.5 +inline constexpr std::array primes = { 2, 3, 5, 7, 13, 19, 29, 37, 43, 53, 59, 67, 73, 79, 89, 97, 103, 109, 127, 137, 149, @@ -20129,43 +20135,6 @@ constexpr std::array primes = { 17176447243, 17176578343, 17176709449, 17176840529, 17176971601, 17177102693, 17177233783, 17177364857, 17177495953, 17177627053, 17177758133}; -/** - * @brief Indicates whether the input `num` is a prime number. 
- * - * @param num - * @return A boolean indicating whether the input `num` is a prime number - */ -constexpr bool is_prime(std::size_t num) noexcept -{ - bool flag = true; - // 0 and 1 are not prime numbers - if (num == 0lu || num == 1lu) { - flag = false; - } else { - for (auto i = 2lu; i <= num / 2lu; ++i) { - if (num % i == 0) { - flag = false; - break; - } - } - } - return flag; -} - -/** - * @brief Computes the smallest prime number greater than or equal to `num`. - * - * @param num - * @return The smallest prime number greater than or equal to `num` - */ -constexpr std::size_t compute_prime(std::size_t num) noexcept -{ - while (not is_prime(num)) { - num++; - } - return num; -} - /** * @brief Calculates the valid capacity based on `cg_size` , `vector_width` * and the initial `capacity`. @@ -20177,15 +20146,15 @@ constexpr std::size_t compute_prime(std::size_t num) noexcept * @param capacity The initially requested capacity * @return A valid capacity no smaller than the requested `capacity` */ -template -constexpr std::size_t get_valid_capacity(std::size_t capacity) noexcept +template +constexpr T get_valid_capacity(T capacity) noexcept { auto const stride = [&]() { if constexpr (uses_vector_load) { return cg_size * vector_width; } if constexpr (not uses_vector_load) { return cg_size; } }(); - auto const c = SDIV(capacity, stride); + auto const c = int_div_ceil(capacity, stride); auto const min_prime = std::lower_bound(primes.begin(), primes.end(), c); return *min_prime * stride; } diff --git a/include/cuco/detail/probe_sequence_impl.cuh b/include/cuco/detail/probe_sequence_impl.cuh index 688b2f28f..c108840b2 100644 --- a/include/cuco/detail/probe_sequence_impl.cuh +++ b/include/cuco/detail/probe_sequence_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,8 @@ #pragma once -#include -#include +#include +#include #include @@ -72,13 +72,13 @@ template class probe_sequence_impl_base { protected: - using value_type = cuco::pair_type; ///< Type of key/value pairs + using value_type = cuco::pair; ///< Type of key/value pairs using key_type = Key; ///< Key type using mapped_type = Value; ///< Type of mapped values using atomic_key_type = cuda::atomic; ///< Type of atomic keys using atomic_mapped_type = cuda::atomic; ///< Type of atomic mapped values /// Pair type of atomic key and atomic mapped value - using pair_atomic_type = cuco::pair_type; + using pair_atomic_type = cuco::pair; /// Type of the forward iterator to `pair_atomic_type` using iterator = pair_atomic_type*; /// Type of the forward iterator to `const pair_atomic_type` diff --git a/include/cuco/detail/probing_scheme_base.cuh b/include/cuco/detail/probing_scheme_base.cuh new file mode 100644 index 000000000..03f712155 --- /dev/null +++ b/include/cuco/detail/probing_scheme_base.cuh @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Base class of public probing scheme. + * + * This class should not be used directly. 
+ * + * @tparam CGSize Size of CUDA Cooperative Groups + */ +template +class probing_scheme_base { + public: + /** + * @brief The size of the CUDA cooperative thread group. + */ + static constexpr int32_t cg_size = CGSize; +}; +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme_impl.inl new file mode 100644 index 000000000..3090d026e --- /dev/null +++ b/include/cuco/detail/probing_scheme_impl.inl @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Probing iterator class. 
+ * + * @tparam Extent Type of Extent + */ +template +class probing_iterator { + public: + using extent_type = Extent; ///< Extent type + using size_type = typename extent_type::value_type; ///< Size type + + /** + * @brief Constructs an probing iterator + * + * @param start Iteration starting point + * @param step_size Double hashing step size + * @param upper_bound Upper bound of the iteration + */ + __host__ __device__ constexpr probing_iterator(size_type start, + size_type step_size, + extent_type upper_bound) noexcept + : curr_index_{start}, step_size_{step_size}, upper_bound_{upper_bound} + { + // TODO: revise this API when introducing quadratic probing into cuco + } + + /** + * @brief Dereference operator + * + * @return Current slot index + */ + __host__ __device__ constexpr auto operator*() const noexcept { return curr_index_; } + + /** + * @brief Prefix increment operator + * + * @return Current iterator + */ + __host__ __device__ constexpr auto operator++() noexcept + { + // TODO: step_size_ can be a build time constant (e.g. linear probing) + // Worth passing another extent type? 
+ curr_index_ = (curr_index_ + step_size_) % upper_bound_; + return *this; + } + + /** + * @brief Postfix increment operator + * + * @return Old iterator before increment + */ + __host__ __device__ constexpr auto operator++(int32_t) noexcept + { + auto temp = *this; + ++(*this); + return temp; + } + + private: + size_type curr_index_; + size_type step_size_; + extent_type upper_bound_; +}; +} // namespace detail + +template +__host__ __device__ constexpr linear_probing::linear_probing(Hash const& hash) + : hash_{hash} +{ +} + +template +template +__host__ __device__ constexpr auto linear_probing::operator()( + ProbeKey const& probe_key, Extent upper_bound) const noexcept +{ + using size_type = typename Extent::value_type; + return detail::probing_iterator{ + cuco::detail::sanitize_hash(hash_(probe_key)) % upper_bound, + 1, // step size is 1 + upper_bound}; +} + +template +template +__host__ __device__ constexpr auto linear_probing::operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept +{ + using size_type = typename Extent::value_type; + return detail::probing_iterator{ + cuco::detail::sanitize_hash(hash_(probe_key) + g.thread_rank()) % upper_bound, + cg_size, + upper_bound}; +} + +template +__host__ __device__ constexpr double_hashing::double_hashing( + Hash1 const& hash1, Hash2 const& hash2) + : hash1_{hash1}, hash2_{hash2} +{ +} + +template +template +__host__ __device__ constexpr auto double_hashing::operator()( + ProbeKey const& probe_key, Extent upper_bound) const noexcept +{ + using size_type = typename Extent::value_type; + return detail::probing_iterator{ + cuco::detail::sanitize_hash(hash1_(probe_key)) % upper_bound, + max(size_type{1}, + cuco::detail::sanitize_hash(hash2_(probe_key)) % + upper_bound), // step size in range [1, prime - 1] + upper_bound}; +} + +template +template +__host__ __device__ constexpr auto double_hashing::operator()( + cooperative_groups::thread_block_tile 
const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept +{ + using size_type = typename Extent::value_type; + return detail::probing_iterator{ + cuco::detail::sanitize_hash(hash1_(probe_key) + g.thread_rank()) % upper_bound, + static_cast((cuco::detail::sanitize_hash(hash2_(probe_key)) % + (upper_bound.value() / cg_size - 1) + + 1) * + cg_size), + upper_bound}; // TODO use fast_int operator +} +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 93059729f..f6f8a9464 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -27,12 +28,11 @@ namespace cuco { template -static_map::static_map( - std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - Allocator const& alloc, - cudaStream_t stream) +static_map::static_map(std::size_t capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + Allocator const& alloc, + cudaStream_t stream) : capacity_{std::max(capacity, std::size_t{1})}, // to avoid dereferencing a nullptr (Issue #72) empty_key_sentinel_{empty_key_sentinel.value}, empty_value_sentinel_{empty_value_sentinel.value}, @@ -52,13 +52,12 @@ static_map::static_map( } template -static_map::static_map( - std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel, - Allocator const& alloc, - cudaStream_t stream) +static_map::static_map(std::size_t capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel, + Allocator const& alloc, + cudaStream_t stream) : capacity_{std::max(capacity, std::size_t{1})}, // to avoid dereferencing a nullptr (Issue #72) empty_key_sentinel_{empty_key_sentinel.value}, 
empty_value_sentinel_{empty_value_sentinel.value}, @@ -66,8 +65,9 @@ static_map::static_map( slot_allocator_{alloc}, counter_allocator_{alloc} { - CUCO_RUNTIME_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_, - "The empty key sentinel and erased key sentinel cannot be the same value."); + CUCO_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_, + "The empty key sentinel and erased key sentinel cannot be the same value.", + std::runtime_error); slots_ = std::allocator_traits::allocate(slot_allocator_, capacity_); num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1); @@ -102,7 +102,7 @@ template void static_map::insert( InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { - auto num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto const block_size = 128; @@ -116,8 +116,8 @@ void static_map::insert( CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); std::size_t h_num_successes; - detail::insert<<>>( - first, first + num_keys, num_successes_, view, hash, key_equal); + detail::insert + <<>>(first, num_keys, num_successes_, view, hash, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); @@ -140,7 +140,7 @@ void static_map::insert_if(InputIt first, KeyEqual key_equal, cudaStream_t stream) { - auto num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto constexpr block_size = 128; @@ -167,10 +167,11 @@ template void static_map::erase( InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { - CUCO_RUNTIME_EXPECTS(get_empty_key_sentinel() != get_erased_key_sentinel(), - "You must provide a unique erased key sentinel value at map construction."); + CUCO_EXPECTS(get_empty_key_sentinel() != get_erased_key_sentinel(), + "You 
must provide a unique erased key sentinel value at map construction.", + std::runtime_error); - auto num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto constexpr block_size = 128; @@ -184,8 +185,8 @@ void static_map::erase( CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); std::size_t h_num_successes; - detail::erase<<>>( - first, first + num_keys, num_successes_, view, hash, key_equal); + detail::erase + <<>>(first, num_keys, num_successes_, view, hash, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); @@ -203,7 +204,7 @@ void static_map::find(InputIt first, KeyEqual key_equal, cudaStream_t stream) { - auto num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto const block_size = 128; @@ -213,13 +214,13 @@ void static_map::find(InputIt first, auto view = get_device_view(); detail::find - <<>>(first, last, output_begin, view, hash, key_equal); + <<>>(first, num_keys, output_begin, view, hash, key_equal); } template template std::pair static_map::retrieve_all( - KeyOut keys_out, ValueOut values_out, cudaStream_t stream) + KeyOut keys_out, ValueOut values_out, cudaStream_t stream) const { static_assert(sizeof(pair_atomic_type) == sizeof(value_type)); auto slots_begin = reinterpret_cast(slots_); @@ -259,6 +260,10 @@ std::pair static_map::retrieve_a CUCO_CUDA_TRY( cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); + std::allocator_traits::deallocate( + temp_allocator, reinterpret_cast(d_num_out), sizeof(std::size_t)); + std::allocator_traits::deallocate( + temp_allocator, d_temp_storage, temp_storage_bytes); return std::make_pair(keys_out + h_num_out, values_out + h_num_out); } @@ -272,7 +277,7 @@ 
void static_map::contains(InputIt first, KeyEqual key_equal, cudaStream_t stream) const { - auto num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto const block_size = 128; @@ -282,7 +287,7 @@ void static_map::contains(InputIt first, auto view = get_device_view(); detail::contains - <<>>(first, last, output_begin, view, hash, key_equal); + <<>>(first, num_keys, output_begin, view, hash, key_equal); } template diff --git a/include/cuco/detail/static_map/functors.cuh b/include/cuco/detail/static_map/functors.cuh new file mode 100644 index 000000000..f508206f0 --- /dev/null +++ b/include/cuco/detail/static_map/functors.cuh @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace static_map_ns { +namespace detail { + +/** + * @brief Device functor returning the content of the slot indexed by `idx`. + * + * @tparam StorageRef Storage ref type + */ +template +struct get_slot { + StorageRef storage_; ///< Storage ref + + /** + * @brief Constructs `get_slot` functor with the given storage ref. + * + * @param s Input storage ref + */ + explicit constexpr get_slot(StorageRef s) noexcept : storage_{s} {} + + /** + * @brief Accesses the slot content with the given index. 
+ * + * @param idx The slot index + * @return The slot content + */ + __device__ constexpr auto operator()(typename StorageRef::size_type idx) const noexcept + { + auto const window_idx = idx / StorageRef::window_size; + auto const intra_idx = idx % StorageRef::window_size; + auto const [first, second] = storage_[window_idx][intra_idx]; + return thrust::make_tuple(first, second); + } +}; + +/** + * @brief Device functor returning whether the input slot indexed by `idx` is filled. + * + * @tparam T The slot key type + * @tparam U The slot value type + */ +template +struct slot_is_filled { + T empty_sentinel_; ///< The value of the empty key sentinel + + /** + * @brief Constructs `slot_is_filled` functor with the given empty sentinel. + * + * @param s Sentinel indicating empty slot + */ + explicit constexpr slot_is_filled(T const& s) noexcept : empty_sentinel_{s} {} + + /** + * @brief Indicates if the target slot `slot` is filled. + * + * @tparam U Slot content type + * + * @param slot The slot + * + * @return `true` if slot is filled + */ + template + __device__ constexpr bool operator()(Slot const& slot) const noexcept + { + return not cuco::detail::bitwise_compare(empty_sentinel_, thrust::get<0>(slot)); + } + + /** + * @brief Indicates if the target slot `slot` is filled. + * + * @param slot The slot + * + * @return `true` if slot is filled + */ + __device__ constexpr bool operator()(cuco::pair const& slot) const noexcept + { + return not cuco::detail::bitwise_compare(empty_sentinel_, slot.first); + } +}; + +} // namespace detail +} // namespace static_map_ns +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh new file mode 100644 index 000000000..a36095462 --- /dev/null +++ b/include/cuco/detail/static_map/kernels.cuh @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace static_map_ns { +namespace detail { + +/** + * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to + * `k` already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`. + * If the key does not exist, inserts the pair as if by insert. + * + * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element + * is inserted. 
+ * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param ref Non-owning container device ref used to access the slot storage + */ +template +__global__ void insert_or_assign(InputIterator first, cuco::detail::index_type n, Ref ref) +{ + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + while (idx < n) { + typename Ref::value_type const insert_pair{*(first + idx)}; + if constexpr (CGSize == 1) { + ref.insert_or_assign(insert_pair); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + ref.insert_or_assign(tile, insert_pair); + } + idx += loop_stride; + } +} + +/** + * @brief Finds the equivalent map elements of all keys in the range `[first, first + n)`. + * + * @note If the key `*(first + i)` has a match in the container, copies the payload of its matched + * element to `(output_begin + i)`. Else, copies the empty value sentinel. Uses the CUDA Cooperative + * Groups API to leverage groups of multiple threads to find each key. This provides a significant + * boost in throughput compared to the non Cooperative Group `find` at moderate to high load + * factors. 
+ * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize The size of the thread block + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param n Number of keys to query + * @param output_begin Beginning of the sequence of matched payloads retrieved for each key + * @param ref Non-owning map device ref used to access the slot storage + */ +template +__global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) +{ + namespace cg = cooperative_groups; + + auto const block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + __shared__ typename Ref::mapped_type output_buffer[BlockSize / CGSize]; + + while (idx - thread_idx < n) { // the whole thread block falls into the same iteration + if (idx < n) { + auto const key = *(first + idx); + if constexpr (CGSize == 1) { + auto const found = ref.find(key); + /* + * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased + * sector stores from L2 to global memory. By writing results to shared memory and then + * synchronizing before writing back to global, we no longer rely on L1, preventing the + * increase in sector stores from L2 to global and improving performance. + */ + output_buffer[thread_idx] = + found == ref.end() ? ref.empty_value_sentinel() : (*found).second; + block.sync(); + *(output_begin + idx) = output_buffer[thread_idx]; + } else { + auto const tile = cg::tiled_partition(block); + auto const found = ref.find(tile, key); + + if (tile.thread_rank() == 0) { + *(output_begin + idx) = found == ref.end() ? 
ref.empty_value_sentinel() : (*found).second; + } + } + } + idx += loop_stride; + } +} + +} // namespace detail +} // namespace static_map_ns +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl new file mode 100644 index 000000000..d7274245e --- /dev/null +++ b/include/cuco/detail/static_map/static_map.inl @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuco { +namespace experimental { + +template +constexpr static_map:: + static_map(Extent capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + KeyEqual const& pred, + ProbingScheme const& probing_scheme, + Allocator const& alloc, + cuda_stream_ref stream) + : impl_{std::make_unique(capacity, + empty_key_sentinel, + cuco::pair{empty_key_sentinel, empty_value_sentinel}, + pred, + probing_scheme, + alloc, + stream)}, + empty_value_sentinel_{empty_value_sentinel} +{ +} + +template +void static_map::clear( + cuda_stream_ref stream) noexcept +{ + impl_->clear(stream); +} + +template +void static_map::clear_async( + cuda_stream_ref stream) noexcept +{ + impl_->clear_async(stream); +} + +template +template +static_map::size_type +static_map::insert( + InputIt first, InputIt last, cuda_stream_ref stream) +{ + return impl_->insert(first, last, ref(op::insert), stream); +} + +template +template +void static_map::insert_async( + InputIt first, InputIt last, cuda_stream_ref stream) noexcept +{ + impl_->insert_async(first, last, ref(op::insert), stream); +} + +template +template +static_map::size_type +static_map::insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) +{ + return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream); +} + +template +template +void static_map:: + insert_if_async( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) noexcept +{ + impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream); +} + +template +template +void static_map:: + insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) noexcept +{ + return this->insert_or_assign_async(first, last, stream); + stream.synchronize(); +} + +template +template +void static_map:: + insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref 
stream) noexcept +{ + auto const num = cuco::detail::distance(first, last); + if (num == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num, cg_size); + + static_map_ns::detail::insert_or_assign + <<>>( + first, num, ref(op::insert_or_assign)); +} + +template +template +void static_map::contains( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + contains_async(first, last, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_map::contains_async( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const noexcept +{ + impl_->contains_async(first, last, output_begin, ref(op::contains), stream); +} + +template +template +void static_map::contains_if( + InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream) const +{ + contains_if_async(first, last, stencil, pred, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_map:: + contains_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream) const noexcept +{ + impl_->contains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream); +} + +template +template +void static_map::find( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + find_async(first, last, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_map::find_async( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); + + static_map_ns::detail::find + <<>>( + first, num_keys, output_begin, ref(op::find)); +} + +template +template +std::pair +static_map::retrieve_all( + KeyOut keys_out, ValueOut 
values_out, cuda_stream_ref stream) const +{ + auto const begin = thrust::make_transform_iterator( + thrust::counting_iterator{0}, + static_map_ns::detail::get_slot(impl_->storage_ref())); + auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel()); + auto zipped_out_begin = thrust::make_zip_iterator(thrust::make_tuple(keys_out, values_out)); + auto const zipped_out_end = impl_->retrieve_all(begin, zipped_out_begin, is_filled, stream); + auto const num = std::distance(zipped_out_begin, zipped_out_end); + + return std::make_pair(keys_out + num, values_out + num); +} + +template +static_map::size_type +static_map::size( + cuda_stream_ref stream) const noexcept +{ + auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel()); + return impl_->size(is_filled, stream); +} + +template +constexpr auto +static_map::capacity() + const noexcept +{ + return impl_->capacity(); +} + +template +constexpr static_map::key_type +static_map::empty_key_sentinel() + const noexcept +{ + return impl_->empty_key_sentinel(); +} + +template +constexpr static_map:: + mapped_type + static_map:: + empty_value_sentinel() const noexcept +{ + return this->empty_value_sentinel_; +} + +template +template +auto static_map::ref( + Operators...) const noexcept +{ + static_assert(sizeof...(Operators), "No operators specified"); + return ref_type{cuco::empty_key(this->empty_key_sentinel()), + cuco::empty_value(this->empty_value_sentinel()), + impl_->key_eq(), + impl_->probing_scheme(), + impl_->storage_ref()}; +} +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl new file mode 100644 index 000000000..28b3ffaf2 --- /dev/null +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -0,0 +1,674 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include + +namespace cuco { +namespace experimental { + +template +__host__ __device__ constexpr static_map_ref< + Key, + T, + Scope, + KeyEqual, + ProbingScheme, + StorageRef, + Operators...>::static_map_ref(cuco::empty_key empty_key_sentinel, + cuco::empty_value empty_value_sentinel, + KeyEqual const& predicate, + ProbingScheme const& probing_scheme, + StorageRef storage_ref) noexcept + : impl_{cuco::pair{empty_key_sentinel, empty_value_sentinel}, probing_scheme, storage_ref}, + empty_value_sentinel_{empty_value_sentinel}, + predicate_{empty_key_sentinel, predicate} +{ +} + +template +template +__host__ __device__ constexpr static_map_ref:: + static_map_ref( + static_map_ref&& + other) noexcept + : impl_{std::move(other.impl_)}, + predicate_{std::move(other.predicate_)}, + empty_value_sentinel_{std::move(other.empty_value_sentinel_)} +{ +} + +template +__host__ __device__ constexpr auto +static_map_ref::capacity() + const noexcept +{ + return impl_.capacity(); +} + +template +__host__ __device__ constexpr Key +static_map_ref:: + empty_key_sentinel() const noexcept +{ + return predicate_.empty_sentinel_; +} + +template +__host__ __device__ constexpr T +static_map_ref:: + empty_value_sentinel() const noexcept +{ + return empty_value_sentinel_; +} + +template +template +auto static_map_ref::with( + NewOperators...) 
&& noexcept +{ + return static_map_ref( + std::move(*this)); +} + +template +struct static_map_ref:: + predicate_wrapper { + detail::equal_wrapper predicate_; + + /** + * @brief Map predicate wrapper ctor. + * + * @param sentinel Sentinel value + * @param equal Equality binary callable + */ + __host__ __device__ constexpr predicate_wrapper(key_type empty_key_sentinel, + key_equal const& equal) noexcept + : predicate_{empty_key_sentinel, equal} + { + } + + /** + * @brief Equality check with the given equality callable. + * + * @tparam U Right-hand side Element type + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * + * @return `EQUAL` if `lhs` and `rhs` are equivalent. `UNEQUAL` otherwise. + */ + template + __device__ constexpr detail::equal_result equal_to(value_type const& lhs, + U const& rhs) const noexcept + { + return predicate_.equal_to(lhs.first, rhs); + } + + /** + * @brief Equality check with the given equality callable. + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * + * @return `EQUAL` if `lhs` and `rhs` are equivalent. `UNEQUAL` otherwise. + */ + __device__ constexpr detail::equal_result equal_to(value_type const& lhs, + value_type const& rhs) const noexcept + { + return predicate_.equal_to(lhs.first, rhs.first); + } + + /** + * @brief Equality check with the given equality callable. + * + * @param lhs Left-hand side key to check equality + * @param rhs Right-hand side key to check equality + * + * @return `EQUAL` if `lhs` and `rhs` are equivalent. `UNEQUAL` otherwise. + */ + __device__ constexpr detail::equal_result equal_to(key_type const& lhs, + key_type const& rhs) const noexcept + { + return predicate_.equal_to(lhs, rhs); + } + + /** + * @brief Order-sensitive equality operator. + * + * @note Container keys MUST be always on the left-hand side. 
+ * + * @tparam U Right-hand side Element type + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * + * @return Three way equality comparison result + */ + template + __device__ constexpr detail::equal_result operator()(value_type const& lhs, + U const& rhs) const noexcept + { + return predicate_(lhs.first, rhs); + } +}; + +namespace detail { + +template +class operator_impl< + op::insert_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Inserts an element. + * + * @param value The element to insert + * @return True if the given element is successfully inserted + */ + __device__ bool insert(value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = true; + return ref_.impl_.insert(value, ref_.predicate_); + } + + /** + * @brief Inserts an element. 
+ * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * @return True if the given element is successfully inserted + */ + __device__ bool insert(cooperative_groups::thread_block_tile const& group, + value_type const& value) noexcept + { + auto& ref_ = static_cast(*this); + auto constexpr has_payload = true; + return ref_.impl_.insert(group, value, ref_.predicate_); + } +}; + +template +class operator_impl< + op::insert_or_assign_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + static_assert(sizeof(T) == 4 or sizeof(T) == 8, + "sizeof(mapped_type) must be either 4 bytes or 8 bytes."); + + public: + /** + * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v` + * to the mapped_type corresponding to the key `k`. 
+ * + * @param value The element to insert + */ + __device__ void insert_or_assign(value_type const& value) noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + + ref_type& ref_ = static_cast(*this); + auto const key = value.first; + auto& probing_scheme = ref_.impl_.probing_scheme(); + auto storage_ref = ref_.impl_.storage_ref(); + auto probing_iter = probing_scheme(key, storage_ref.window_extent()); + + while (true) { + auto const window_slots = storage_ref[*probing_iter]; + + for (auto& slot_content : window_slots) { + auto const eq_res = ref_.predicate_(slot_content, key); + + // If the key is already in the container, update the payload and return + if (eq_res == detail::equal_result::EQUAL) { + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + ref_.impl_.atomic_store( + &((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second, + value.second); + return; + } + if (eq_res == detail::equal_result::EMPTY) { + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + if (attempt_insert_or_assign( + (storage_ref.data() + *probing_iter)->data() + intra_window_index, value)) { + return; + } + } + } + ++probing_iter; + } + } + + /** + * @brief Inserts an element. + * + * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v` + * to the mapped_type corresponding to the key `k`. 
+ * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + */ + __device__ void insert_or_assign(cooperative_groups::thread_block_tile const& group, + value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + + auto const key = value.first; + auto& probing_scheme = ref_.impl_.probing_scheme(); + auto storage_ref = ref_.impl_.storage_ref(); + auto probing_iter = probing_scheme(group, key, storage_ref.window_extent()); + + while (true) { + auto const window_slots = storage_ref[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (ref_.predicate_(window_slots[i], key)) { + case detail::equal_result::EMPTY: + return detail::window_probing_results{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: + return detail::window_probing_results{detail::equal_result::EQUAL, i}; + default: continue; + } + } + // returns dummy index `-1` for UNEQUAL + return detail::window_probing_results{detail::equal_result::UNEQUAL, -1}; + }(); + + auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); + if (group_contains_equal) { + auto const src_lane = __ffs(group_contains_equal) - 1; + if (group.thread_rank() == src_lane) { + ref_.impl_.atomic_store( + &((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second, + value.second); + } + group.sync(); + return; + } + + auto const group_contains_empty = group.ballot(state == detail::equal_result::EMPTY); + if (group_contains_empty) { + auto const src_lane = __ffs(group_contains_empty) - 1; + auto const status = + (group.thread_rank() == src_lane) + ? 
attempt_insert_or_assign( + (storage_ref.data() + *probing_iter)->data() + intra_window_index, value) + : false; + + // Exit if inserted or assigned + if (group.shfl(status, src_lane)) { return; } + } else { + ++probing_iter; + } + } + } + + private: + /** + * @brief Attempts to insert an element into a slot or update the matching payload with the given + * element + * + * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v` + * to the mapped_type corresponding to the key `k`. + * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * + * @return Returns `true` if the given `value` is inserted or `value` has a match in the map. + */ + __device__ constexpr bool attempt_insert_or_assign(value_type* slot, + value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto const expected_key = ref_.impl_.empty_slot_sentinel().first; + + auto old_key = ref_.impl_.compare_and_swap(&slot->first, expected_key, value.first); + auto* old_key_ptr = reinterpret_cast(&old_key); + + // if key success or key was already present in the map + if (cuco::detail::bitwise_compare(*old_key_ptr, expected_key) or + (ref_.predicate_.equal_to(*old_key_ptr, value.first) == detail::equal_result::EQUAL)) { + // Update payload + ref_.impl_.atomic_store(&slot->second, value.second); + return true; + } + return false; + } +}; + +template +class operator_impl< + op::insert_and_find_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Returns a const_iterator to one past the last slot. 
+ * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr const_iterator end() const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Inserts the given element into the map. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. + * + * @param value The element to insert + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. + */ + __device__ thrust::pair insert_and_find(value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = true; + return ref_.impl_.insert_and_find(value, ref_.predicate_); + } + + /** + * @brief Inserts the given element into the map. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. + * + * @param group The Cooperative Group used to perform group insert_and_find + * @param value The element to insert + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. 
+ */ + __device__ thrust::pair insert_and_find( + cooperative_groups::thread_block_tile const& group, value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = true; + return ref_.impl_.insert_and_find(group, value, ref_.predicate_); + } +}; + +template +class operator_impl< + op::contains_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns + * true. Otherwise, returns false. + * + * @tparam ProbeKey Probe key type + * + * @param key The key to search for + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains(ProbeKey const& key) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + return ref_.impl_.contains(key, ref_.predicate_); + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns + * true. Otherwise, returns false. 
+ * + * @tparam ProbeKey Probe key type + * + * @param group The Cooperative Group used to perform group contains + * @param key The key to search for + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains( + cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.contains(group, key, ref_.predicate_); + } +}; + +template +class operator_impl< + op::find_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Returns a const_iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr const_iterator end() const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Finds an element in the map with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. 
+ * + * @tparam ProbeKey Probe key type + * + * @param key The key to search for + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator find(ProbeKey const& key) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + return ref_.impl_.find(key, ref_.predicate_); + } + + /** + * @brief Finds an element in the map with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. + * + * @tparam ProbeKey Probe key type + * + * @param group The Cooperative Group used to perform this operation + * @param key The key to search for + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator find( + cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.find(group, key, ref_.predicate_); + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index 7a3ca0dfa..73c22997a 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -36,6 +36,7 @@ namespace cg = cooperative_groups; * @tparam Key key type * @tparam Value value type * @tparam pair_atomic_type key/value pair type + * * @param slots Pointer to flat storage for the map's key/value pairs * @param k Key to which all keys in `slots` are initialized * @param v Value to which all values in `slots` are initialized @@ -47,13 +48,14 @@ template -__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::size_t size) +__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t 
size) { - auto tid = block_size * blockIdx.x + threadIdx.x; - while (tid < size) { - new (&slots[tid].first) atomic_key_type{k}; - new (&slots[tid].second) atomic_mapped_type{v}; - tid += gridDim.x * block_size; + int64_t const loop_stride = gridDim.x * block_size; + int64_t idx = block_size * blockIdx.x + threadIdx.x; + while (idx < size) { + new (&slots[idx].first) atomic_key_type{k}; + new (&slots[idx].second) atomic_mapped_type{v}; + idx += loop_stride; } } @@ -70,8 +72,9 @@ __global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::s * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs + * @param n Number of the key/value pairs to insert * @param num_successes The number of successfully inserted key/value pairs * @param view Mutable device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -84,19 +87,19 @@ template __global__ void insert( - InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid; + int64_t const loop_stride = gridDim.x * block_size; + int64_t idx = block_size * blockIdx.x + threadIdx.x; - while (it < last) { - typename viewT::value_type const insert_pair{*it}; + while (idx < n) { + typename viewT::value_type const insert_pair{*(first + idx)}; if (view.insert(insert_pair, hash, key_equal)) { thread_num_successes++; } - it += gridDim.x * block_size; + idx += loop_stride; } // compute number of successfully inserted 
elements for each block @@ -123,8 +126,9 @@ __global__ void insert( * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs + * @param n Number of the key/value pairs to insert * @param num_successes The number of successfully inserted key/value pairs * @param view Mutable device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -138,23 +142,23 @@ template __global__ void insert( - InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - while (it < last) { + while (idx < n) { // force conversion to value_type - typename viewT::value_type const insert_pair{*it}; + typename viewT::value_type const insert_pair{*(first + idx)}; if (view.insert(tile, insert_pair, hash, key_equal) && tile.thread_rank() == 0) { thread_num_successes++; } - it += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -163,6 +167,28 @@ __global__ void insert( if (threadIdx.x == 0) { *num_successes += block_num_successes; } } +/** + * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. 
+ * + * If the key `*(first + i)` exists in the map, its slot is erased and made available for future + * insertions. + * Else, no effect. + * + * @tparam block_size The size of the thread block + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam atomicT Type of atomic storage + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param num_successes The number of successfully erased key/value pairs + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ template __global__ void erase( - InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) { using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid; + const int64_t loop_stride = gridDim.x * block_size; + int64_t idx = block_size * blockIdx.x + threadIdx.x; - while (it < last) { - if (view.erase(*it, hash, key_equal)) { thread_num_successes++; } - it += gridDim.x * block_size; + while (idx < n) { + if (view.erase(*(first + idx), hash, key_equal)) { thread_num_successes++; } + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -192,6 +218,29 @@ __global__ void erase( } } +/** + * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. 
+ * + * If the key `*(first + i)` exists in the map, its slot is erased and made available for future + * insertions. + * Else, no effect. + * + * @tparam block_size The size of the thread block + * @tparam tile_size The number of threads in the Cooperative Groups used to perform erase + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam atomicT Type of atomic storage + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param num_successes The number of successfully erased key/value pairs + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ template __global__ void erase( - InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - while (it < last) { - if (view.erase(tile, *it, hash, key_equal) and tile.thread_rank() == 0) { + while (idx < n) { + if (view.erase(tile, *(first + idx), hash, key_equal) and tile.thread_rank() == 0) { thread_num_successes++; } - it += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } // 
compute number of successfully inserted elements for each block @@ -244,6 +293,7 @@ __global__ void erase( * and argument type is convertible from `std::iterator_traits::value_type` * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs * @param n Number of elements to insert * @param num_successes The number of successfully inserted key/value pairs @@ -263,7 +313,7 @@ template __global__ void insert_if_n(InputIt first, - std::size_t n, + int64_t n, atomicT* num_successes, viewT view, StencilIt stencil, @@ -275,18 +325,18 @@ __global__ void insert_if_n(InputIt first, __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto i = tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - while (i < n) { - if (pred(*(stencil + i))) { - typename viewT::value_type const insert_pair{*(first + i)}; + while (idx < n) { + if (pred(*(stencil + idx))) { + typename viewT::value_type const insert_pair{*(first + idx)}; if (view.insert(tile, insert_pair, hash, key_equal) and tile.thread_rank() == 0) { thread_num_successes++; } } - i += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -311,8 +361,9 @@ __global__ void insert_if_n(InputIt first, * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of keys to query * @param output_begin Beginning of the sequence of values retrieved for each key * @param 
view Device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -326,14 +377,14 @@ template __global__ void find( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) { - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid; + int64_t const loop_stride = gridDim.x * block_size; + int64_t idx = block_size * blockIdx.x + threadIdx.x; __shared__ Value writeBuffer[block_size]; - while (first + key_idx < last) { - auto key = *(first + key_idx); + while (idx < n) { + auto key = *(first + idx); auto found = view.find(key, hash, key_equal); /* @@ -347,8 +398,8 @@ __global__ void find( ? view.get_empty_value_sentinel() : found->second.load(cuda::std::memory_order_relaxed); __syncthreads(); - *(output_begin + key_idx) = writeBuffer[threadIdx.x]; - key_idx += gridDim.x * block_size; + *(output_begin + idx) = writeBuffer[threadIdx.x]; + idx += loop_stride; } } @@ -371,8 +422,9 @@ __global__ void find( * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of keys to query * @param output_begin Beginning of the sequence of values retrieved for each key * @param view Device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -387,15 +439,15 @@ template __global__ void find( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; 
- __shared__ Value writeBuffer[block_size]; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + __shared__ Value writeBuffer[block_size / tile_size]; - while (first + key_idx < last) { - auto key = *(first + key_idx); + while (idx < n) { + auto key = *(first + idx); auto found = view.find(tile, key, hash, key_equal); /* @@ -411,10 +463,8 @@ __global__ void find( : found->second.load(cuda::std::memory_order_relaxed); } __syncthreads(); - if (tile.thread_rank() == 0) { - *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; - } - key_idx += (gridDim.x * block_size) / tile_size; + if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; } + idx += loop_stride; } } @@ -431,8 +481,9 @@ __global__ void find( * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of keys to query * @param output_begin Beginning of the sequence of booleans for the presence of each key * @param view Device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -445,14 +496,14 @@ template __global__ void contains( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) { - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid; + int64_t const loop_stride = gridDim.x * block_size; + int64_t idx = block_size * blockIdx.x + threadIdx.x; __shared__ bool writeBuffer[block_size]; - while (first + key_idx < last) { - auto key = *(first + key_idx); + while (idx < n) { + auto key = *(first + idx); /* * 
The ld.relaxed.gpu instruction used in view.find causes L1 to @@ -463,8 +514,8 @@ __global__ void contains( */ writeBuffer[threadIdx.x] = view.contains(key, hash, key_equal); __syncthreads(); - *(output_begin + key_idx) = writeBuffer[threadIdx.x]; - key_idx += gridDim.x * block_size; + *(output_begin + idx) = writeBuffer[threadIdx.x]; + idx += loop_stride; } } @@ -486,8 +537,9 @@ __global__ void contains( * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of keys to query * @param output_begin Beginning of the sequence of booleans for the presence of each key * @param view Device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -501,15 +553,15 @@ template __global__ void contains( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; - __shared__ bool writeBuffer[block_size]; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + __shared__ bool writeBuffer[block_size / tile_size]; - while (first + key_idx < last) { - auto key = *(first + key_idx); + while (idx < n) { + auto key = *(first + idx); auto found = view.contains(tile, key, hash, key_equal); /* @@ -521,10 +573,8 @@ __global__ void contains( */ if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } __syncthreads(); - if (tile.thread_rank() == 0) { - *(output_begin + key_idx) = 
writeBuffer[threadIdx.x / tile_size]; - } - key_idx += (gridDim.x * block_size) / tile_size; + if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; } + idx += loop_stride; } } diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index c6612a7c8..98c08e720 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -22,6 +22,7 @@ #include #include +#include namespace cuco { template ::device_view_ } offset = g.shfl(offset, 0); - if constexpr (thrust::is_contiguous_iterator_v) { #if defined(CUCO_HAS_CG_MEMCPY_ASYNC) + constexpr bool uses_memcpy_async = thrust::is_contiguous_iterator_v; +#else + constexpr bool uses_memcpy_async = false; +#endif // end CUCO_HAS_CG_MEMCPY_ASYNC + + if constexpr (uses_memcpy_async) { #if defined(CUCO_HAS_CUDA_BARRIER) cooperative_groups::memcpy_async( g, - output_begin + offset, + &thrust::raw_reference_cast(*(output_begin + offset)), output_buffer, cuda::aligned_size_t(sizeof(value_type) * num_outputs)); #else - cooperative_groups::memcpy_async( - g, output_begin + offset, output_buffer, sizeof(value_type) * num_outputs); + cooperative_groups::memcpy_async(g, + &thrust::raw_reference_cast(*(output_begin + offset)), + output_buffer, + sizeof(value_type) * num_outputs); #endif // end CUCO_HAS_CUDA_BARRIER - return; -#endif // end CUCO_HAS_CG_MEMCPY_ASYNC } - for (auto index = lane_id; index < num_outputs; index += g.size()) { - *(output_begin + offset + index) = output_buffer[index]; + + if constexpr (not uses_memcpy_async) { + for (auto index = lane_id; index < num_outputs; index += g.size()) { + *(output_begin + offset + index) = output_buffer[index]; + } } } @@ -991,8 +1000,12 @@ class static_multimap::device_view_ if (*flushing_cg_counter + flushing_cg.size() * vector_width() > buffer_size) { flush_output_buffer( flushing_cg, 
*flushing_cg_counter, output_buffer, num_matches, output_begin); + // Everyone in the group reads the counter when flushing, so + // sync before writing. + flushing_cg.sync(); // First lane reset warp-level counter if (flushing_cg.thread_rank() == 0) { *flushing_cg_counter = 0; } + flushing_cg.sync(); } current_slot = next_slot(current_slot); @@ -1083,8 +1096,12 @@ class static_multimap::device_view_ // Flush if the next iteration won't fit into buffer if ((*cg_counter + g.size()) > buffer_size) { flush_output_buffer(g, *cg_counter, output_buffer, num_matches, output_begin); + // Everyone in the group reads the counter when flushing, so + // sync before writing. + g.sync(); // First lane reset CG-level counter if (lane_id == 0) { *cg_counter = 0; } + g.sync(); } current_slot = next_slot(current_slot); } // while running @@ -1419,8 +1436,12 @@ class static_multimap::device_view_ num_matches, probe_output_begin, contained_output_begin); + // Everyone in the group reads the counter when flushing, so + // sync before writing. + flushing_cg.sync(); // First lane reset warp-level counter if (flushing_cg.thread_rank() == 0) { *flushing_cg_counter = 0; } + flushing_cg.sync(); } current_slot = next_slot(current_slot); @@ -1530,8 +1551,12 @@ class static_multimap::device_view_ num_matches, probe_output_begin, contained_output_begin); + // Everyone in the group reads the counter when flushing, so + // sync before writing. + g.sync(); // First lane reset CG-level counter if (lane_id == 0) { *cg_counter = 0; } + g.sync(); } current_slot = next_slot(current_slot); } // while running diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh index f3820bf64..67fb36045 100644 --- a/include/cuco/detail/static_multimap/kernels.cuh +++ b/include/cuco/detail/static_multimap/kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include @@ -23,8 +23,6 @@ #include -#include - #include namespace cuco { @@ -42,6 +40,7 @@ namespace cg = cooperative_groups; * @tparam Key key type * @tparam Value value type * @tparam pair_atomic_type key/value pair type + * * @param slots Pointer to flat storage for the map's key/value pairs * @param k Key to which all keys in `slots` are initialized * @param v Value to which all values in `slots` are initialized @@ -52,13 +51,14 @@ template -__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::size_t size) +__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size) { - auto tid = threadIdx.x + blockIdx.x * blockDim.x; - while (tid < size) { - new (&slots[tid].first) atomic_key_type{k}; - new (&slots[tid].second) atomic_mapped_type{v}; - tid += gridDim.x * blockDim.x; + int64_t const loop_stride = gridDim.x * blockDim.x; + int64_t idx = threadIdx.x + blockIdx.x * blockDim.x; + while (idx < size) { + new (&slots[idx].first) atomic_key_type{k}; + new (&slots[idx].second) atomic_mapped_type{v}; + idx += loop_stride; } } @@ -78,21 +78,21 @@ __global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::s * @tparam viewT Type of device view allowing access of hash map storage * * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs + * @param n Number of key/value pairs to insert * @param view Mutable device view used to access the hash map's slot storage */ template -__global__ void insert(InputIt first, InputIt last, viewT view) +__global__ void insert(InputIt first, int64_t n, viewT view) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid / tile_size; + auto tile 
= cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - while (it < last) { + while (idx < n) { // force conversion to value_type - typename viewT::value_type const insert_pair{*it}; + typename viewT::value_type const insert_pair{*(first + idx)}; view.insert(tile, insert_pair); - it += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } } @@ -117,6 +117,7 @@ __global__ void insert(InputIt first, InputIt last, viewT view) * @tparam viewT Type of device view allowing access of hash map storage * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and * argument type is convertible from `std::iterator_traits::value_type`. + * * @param first Beginning of the sequence of key/value pairs * @param s Beginning of the stencil sequence * @param n Number of elements to insert @@ -129,19 +130,19 @@ template -__global__ void insert_if_n(InputIt first, StencilIt s, std::size_t n, viewT view, Predicate pred) +__global__ void insert_if_n(InputIt first, StencilIt s, int64_t n, viewT view, Predicate pred) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto const tid = block_size * blockIdx.x + threadIdx.x; - auto i = tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - while (i < n) { - if (pred(*(s + i))) { - typename viewT::value_type const insert_pair{*(first + i)}; + while (idx < n) { + if (pred(*(s + idx))) { + typename viewT::value_type const insert_pair{*(first + idx)}; // force conversion to value_type view.insert(tile, insert_pair); } - i += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } } @@ -164,7 +165,7 @@ __global__ void insert_if_n(InputIt first, StencilIt s, std::size_t n, viewT vie * @tparam Equal 
Binary callable type * * @param first Beginning of the sequence of elements - * @param last End of the sequence of elements + * @param n Number of elements to query * @param output_begin Beginning of the sequence of booleans for the presence of each element * @param view Device view used to access the hash map's slot storage * @param equal The binary function to compare input element and slot content for equality @@ -176,15 +177,14 @@ template -__global__ void contains( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Equal equal) +__global__ void contains(InputIt first, int64_t n, OutputIt output_begin, viewT view, Equal equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto idx = tid / tile_size; - __shared__ bool writeBuffer[block_size]; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + __shared__ bool writeBuffer[block_size / tile_size]; - while (first + idx < last) { + while (idx < n) { typename std::iterator_traits::value_type element = *(first + idx); auto found = [&]() { if constexpr (is_pair_contains) { return view.pair_contains(tile, element, equal); } @@ -201,7 +201,7 @@ __global__ void contains( if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } __syncthreads(); if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; } - idx += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } } @@ -221,8 +221,9 @@ __global__ void contains( * @tparam atomicT Type of atomic storage * @tparam viewT Type of device view allowing access of hash map storage * @tparam KeyEqual Binary callable + * * @param first Beginning of the sequence of keys to count - * @param last End of the sequence of keys to count + * @param n Number of the keys to query * @param num_matches 
The number of all the matches for a sequence of keys * @param view Device view used to access the hash map's slot storage * @param key_equal Binary function to compare two keys for equality @@ -235,24 +236,24 @@ template __global__ void count( - InputIt first, InputIt last, atomicT* num_matches, viewT view, KeyEqual key_equal) + InputIt first, int64_t n, atomicT* num_matches, viewT view, KeyEqual key_equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_matches = 0; - while (first + key_idx < last) { - auto key = *(first + key_idx); + while (idx < n) { + auto key = *(first + idx); if constexpr (is_outer) { thread_num_matches += view.count_outer(tile, key, key_equal); } else { thread_num_matches += view.count(tile, key, key_equal); } - key_idx += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -279,8 +280,9 @@ __global__ void count( * @tparam atomicT Type of atomic storage * @tparam viewT Type of device view allowing access of hash map storage * @tparam PairEqual Binary callable + * * @param first Beginning of the sequence of pairs to count - * @param last End of the sequence of pairs to count + * @param n Number of the pairs to query * @param num_matches The number of all the matches for a sequence of pairs * @param view Device view used to access the hash map's slot storage * @param pair_equal Binary function to compare two pairs for equality @@ -293,24 +295,24 @@ template __global__ void pair_count( - InputIt first, InputIt last, atomicT* num_matches, viewT view, PairEqual 
pair_equal) + InputIt first, int64_t n, atomicT* num_matches, viewT view, PairEqual pair_equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto pair_idx = tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_matches = 0; - while (first + pair_idx < last) { - typename viewT::value_type const pair = *(first + pair_idx); + while (idx < n) { + typename viewT::value_type const pair = *(first + idx); if constexpr (is_outer) { thread_num_matches += view.pair_count_outer(tile, pair, pair_equal); } else { thread_num_matches += view.pair_count(tile, pair, pair_equal); } - pair_idx += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -343,8 +345,9 @@ __global__ void pair_count( * @tparam atomicT Type of atomic storage * @tparam viewT Type of device view allowing access of hash map storage * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of the keys to query * @param output_begin Beginning of the sequence of values retrieved for each key * @param num_matches Size of the output sequence * @param view Device view used to access the hash map's slot storage @@ -361,7 +364,7 @@ template __global__ void retrieve(InputIt first, - InputIt last, + int64_t n, OutputIt output_begin, atomicT* num_matches, viewT view, @@ -372,10 +375,10 @@ __global__ void retrieve(InputIt first, constexpr uint32_t num_flushing_cgs = block_size / flushing_cg_size; const uint32_t flushing_cg_id = threadIdx.x / flushing_cg_size; - auto flushing_cg = 
cg::tiled_partition(cg::this_thread_block()); - auto probing_cg = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid / probing_cg_size; + auto flushing_cg = cg::tiled_partition(cg::this_thread_block()); + auto probing_cg = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / probing_cg_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / probing_cg_size; __shared__ pair_type output_buffer[num_flushing_cgs][buffer_size]; // TODO: replace this with shared memory cuda::atomic variables once the dynamiic initialization @@ -384,12 +387,14 @@ __global__ void retrieve(InputIt first, if (flushing_cg.thread_rank() == 0) { flushing_cg_counter[flushing_cg_id] = 0; } - while (flushing_cg.any(first + key_idx < last)) { - bool active_flag = first + key_idx < last; + flushing_cg.sync(); + + while (flushing_cg.any(idx < n)) { + bool active_flag = idx < n; auto active_flushing_cg = cg::binary_partition(flushing_cg, active_flag); if (active_flag) { - auto key = *(first + key_idx); + auto key = *(first + idx); if constexpr (is_outer) { view.retrieve_outer(active_flushing_cg, probing_cg, @@ -410,9 +415,10 @@ __global__ void retrieve(InputIt first, key_equal); } } - key_idx += (gridDim.x * block_size) / probing_cg_size; + idx += loop_stride; } + flushing_cg.sync(); // Final flush of output buffer if (flushing_cg_counter[flushing_cg_id] > 0) { view.flush_output_buffer(flushing_cg, @@ -450,8 +456,9 @@ __global__ void retrieve(InputIt first, * @tparam atomicT Type of atomic storage * @tparam viewT Type of device view allowing access of hash map storage * @tparam PairEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of keys to query * @param probe_output_begin Beginning of the sequence of the matched probe pairs * @param contained_output_begin Beginning of the sequence of 
the matched contained pairs * @param num_matches Size of the output sequence @@ -470,7 +477,7 @@ template __global__ void pair_retrieve(InputIt first, - InputIt last, + int64_t n, OutputIt1 probe_output_begin, OutputIt2 contained_output_begin, atomicT* num_matches, @@ -482,10 +489,10 @@ __global__ void pair_retrieve(InputIt first, constexpr uint32_t num_flushing_cgs = block_size / flushing_cg_size; const uint32_t flushing_cg_id = threadIdx.x / flushing_cg_size; - auto flushing_cg = cg::tiled_partition(cg::this_thread_block()); - auto probing_cg = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto pair_idx = tid / probing_cg_size; + auto flushing_cg = cg::tiled_partition(cg::this_thread_block()); + auto probing_cg = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / probing_cg_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / probing_cg_size; __shared__ pair_type probe_output_buffer[num_flushing_cgs][buffer_size]; __shared__ pair_type contained_output_buffer[num_flushing_cgs][buffer_size]; @@ -495,12 +502,14 @@ __global__ void pair_retrieve(InputIt first, if (flushing_cg.thread_rank() == 0) { flushing_cg_counter[flushing_cg_id] = 0; } - while (flushing_cg.any(first + pair_idx < last)) { - bool active_flag = first + pair_idx < last; + flushing_cg.sync(); + + while (flushing_cg.any(idx < n)) { + bool active_flag = idx < n; auto active_flushing_cg = cg::binary_partition(flushing_cg, active_flag); if (active_flag) { - pair_type pair = *(first + pair_idx); + pair_type pair = *(first + idx); if constexpr (is_outer) { view.pair_retrieve_outer(active_flushing_cg, probing_cg, @@ -525,9 +534,10 @@ __global__ void pair_retrieve(InputIt first, pair_equal); } } - pair_idx += (gridDim.x * block_size) / probing_cg_size; + idx += loop_stride; } + flushing_cg.sync(); // Final flush of output buffer if (flushing_cg_counter[flushing_cg_id] > 0) { 
view.flush_output_buffer(flushing_cg, diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index ddec2e4a2..4e9570bce 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,8 @@ * limitations under the License. */ +#include #include -#include #include #include @@ -33,8 +33,8 @@ template static_multimap::static_multimap( std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, cudaStream_t stream, Allocator const& alloc) : capacity_{cuco::detail::get_valid_capacity( @@ -66,7 +66,7 @@ void static_multimap::insert(InputI InputIt last, cudaStream_t stream) { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto constexpr block_size = 128; @@ -75,7 +75,7 @@ void static_multimap::insert(InputI auto view = get_device_mutable_view(); detail::insert - <<>>(first, first + num_keys, view); + <<>>(first, num_keys, view); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); } @@ -88,7 +88,7 @@ template void static_multimap::insert_if( InputIt first, InputIt last, StencilIt stencil, Predicate pred, cudaStream_t stream) { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto constexpr block_size = 128; @@ -110,7 +110,7 @@ template void static_multimap::contains( InputIt first, InputIt last, OutputIt output_begin, KeyEqual key_equal, cudaStream_t stream) const { - auto const 
num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto constexpr is_pair_contains = false; @@ -120,7 +120,7 @@ void static_multimap::contains( auto view = get_device_view(); detail::contains - <<>>(first, last, output_begin, view, key_equal); + <<>>(first, num_keys, output_begin, view, key_equal); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); } @@ -134,7 +134,7 @@ void static_multimap::pair_contains InputIt first, InputIt last, OutputIt output_begin, PairEqual pair_equal, cudaStream_t stream) const { - auto const num_pairs = std::distance(first, last); + auto const num_pairs = cuco::detail::distance(first, last); if (num_pairs == 0) { return; } auto constexpr is_pair_contains = true; @@ -144,7 +144,7 @@ void static_multimap::pair_contains auto view = get_device_view(); detail::contains - <<>>(first, last, output_begin, view, pair_equal); + <<>>(first, num_pairs, output_begin, view, pair_equal); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); } @@ -157,7 +157,7 @@ template std::size_t static_multimap::count( InputIt first, InputIt last, cudaStream_t stream, KeyEqual key_equal) const { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return 0; } auto constexpr is_outer = false; @@ -167,11 +167,11 @@ std::size_t static_multimap::count( auto view = get_device_view(); auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::count - <<>>(first, last, d_counter_.get(), view, key_equal); + <<>>(first, num_keys, d_counter_.get(), view, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); 
CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); @@ -188,7 +188,7 @@ template std::size_t static_multimap::count_outer( InputIt first, InputIt last, cudaStream_t stream, KeyEqual key_equal) const { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return 0; } auto constexpr is_outer = true; @@ -198,11 +198,11 @@ std::size_t static_multimap::count_ auto view = get_device_view(); auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::count - <<>>(first, last, d_counter_.get(), view, key_equal); + <<>>(first, num_keys, d_counter_.get(), view, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); @@ -219,21 +219,21 @@ template std::size_t static_multimap::pair_count( InputIt first, InputIt last, PairEqual pair_equal, cudaStream_t stream) const { - auto const num_keys = std::distance(first, last); - if (num_keys == 0) { return 0; } + auto const num_pairs = cuco::detail::distance(first, last); + if (num_pairs == 0) { return 0; } auto constexpr is_outer = false; auto constexpr block_size = 128; auto constexpr stride = 1; auto view = get_device_view(); - auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); + auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::pair_count - <<>>(first, last, d_counter_.get(), view, pair_equal); + <<>>(first, num_pairs, 
d_counter_.get(), view, pair_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); @@ -250,21 +250,21 @@ template std::size_t static_multimap::pair_count_outer( InputIt first, InputIt last, PairEqual pair_equal, cudaStream_t stream) const { - auto const num_keys = std::distance(first, last); - if (num_keys == 0) { return 0; } + auto const num_pairs = cuco::detail::distance(first, last); + if (num_pairs == 0) { return 0; } auto constexpr is_outer = true; auto constexpr block_size = 128; auto constexpr stride = 1; auto view = get_device_view(); - auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); + auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::pair_count - <<>>(first, last, d_counter_.get(), view, pair_equal); + <<>>(first, num_pairs, d_counter_.get(), view, pair_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); @@ -281,12 +281,11 @@ template OutputIt static_multimap::retrieve( InputIt first, InputIt last, OutputIt output_begin, cudaStream_t stream, KeyEqual key_equal) const { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return output_begin; } // Using per-warp buffer for vector loads and per-CG buffer for scalar loads constexpr auto buffer_size = uses_vector_load() ? 
(warp_size() * 3u) : (cg_size() * 3u); - constexpr auto block_size = 128; constexpr auto is_outer = false; auto view = get_device_view(); @@ -295,24 +294,14 @@ OutputIt static_multimap::retrieve( return cg_size(); }(); - auto const grid_size = detail::get_grid_size(detail::retrieve, - block_size); - - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + auto const grid_size = detail::grid_size(num_keys, cg_size()); + + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; - detail::retrieve - <<>>( - first, last, output_begin, d_counter_.get(), view, key_equal); + detail::retrieve + <<>>( + first, num_keys, output_begin, d_counter_.get(), view, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); @@ -331,12 +320,11 @@ template OutputIt static_multimap::retrieve_outer( InputIt first, InputIt last, OutputIt output_begin, cudaStream_t stream, KeyEqual key_equal) const { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return output_begin; } // Using per-warp buffer for vector loads and per-CG buffer for scalar loads constexpr auto buffer_size = uses_vector_load() ? 
(warp_size() * 3u) : (cg_size() * 3u); - constexpr auto block_size = 128; constexpr auto is_outer = true; auto view = get_device_view(); @@ -345,24 +333,14 @@ OutputIt static_multimap::retrieve_ return cg_size(); }(); - auto const grid_size = detail::get_grid_size(detail::retrieve, - block_size); - - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + auto const grid_size = detail::grid_size(num_keys, cg_size()); + + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; - detail::retrieve - <<>>( - first, last, output_begin, d_counter_.get(), view, key_equal); + detail::retrieve + <<>>( + first, num_keys, output_begin, d_counter_.get(), view, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); @@ -387,7 +365,7 @@ static_multimap::pair_retrieve( PairEqual pair_equal, cudaStream_t stream) const { - auto const num_pairs = std::distance(first, last); + auto const num_pairs = cuco::detail::distance(first, last); if (num_pairs == 0) { return std::make_pair(probe_output_begin, contained_output_begin); } // Using per-warp buffer for vector loads and per-CG buffer for scalar loads @@ -403,12 +381,17 @@ static_multimap::pair_retrieve( }(); auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::pair_retrieve - <<>>( - first, last, probe_output_begin, contained_output_begin, d_counter_.get(), view, pair_equal); + <<>>(first, + num_pairs, + probe_output_begin, + contained_output_begin, + d_counter_.get(), + view, + pair_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); @@ -432,7 +415,7 @@ 
static_multimap::pair_retrieve_oute PairEqual pair_equal, cudaStream_t stream) const { - auto const num_pairs = std::distance(first, last); + auto const num_pairs = cuco::detail::distance(first, last); if (num_pairs == 0) { return std::make_pair(probe_output_begin, contained_output_begin); } // Using per-warp buffer for vector loads and per-CG buffer for scalar loads @@ -448,12 +431,17 @@ static_multimap::pair_retrieve_oute }(); auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::pair_retrieve - <<>>( - first, last, probe_output_begin, contained_output_begin, d_counter_.get(), view, pair_equal); + <<>>(first, + num_pairs, + probe_output_begin, + contained_output_begin, + d_counter_.get(), + view, + pair_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); diff --git a/include/cuco/detail/static_set/functors.cuh b/include/cuco/detail/static_set/functors.cuh new file mode 100644 index 000000000..3ee7be4be --- /dev/null +++ b/include/cuco/detail/static_set/functors.cuh @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace static_set_ns { +namespace detail { + +/** + * @brief Device functor returning whether the input slot indexed by `idx` is filled. + * + * @tparam T The slot content type + */ +template +struct slot_is_filled { + T empty_sentinel_; ///< The value of the empty key sentinel + + /** + * @brief Constructs `slot_is_filled` functor with the given empty sentinel. + * + * @param s Sentinel indicating empty slot + */ + explicit constexpr slot_is_filled(T const& s) noexcept : empty_sentinel_{s} {} + + /** + * @brief Indicates if the target slot `slot` is filled. + * + * @tparam T Slot content type + * + * @param slot The slot + * + * @return `true` if slot is filled + */ + __device__ constexpr bool operator()(T const& slot) const noexcept + { + return not cuco::detail::bitwise_compare(empty_sentinel_, slot); + } +}; + +} // namespace detail +} // namespace static_set_ns +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/kernels.cuh b/include/cuco/detail/static_set/kernels.cuh new file mode 100644 index 000000000..72744f2b4 --- /dev/null +++ b/include/cuco/detail/static_set/kernels.cuh @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include + +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace static_set_ns { +namespace detail { + +/** + * @brief Finds the equivalent set elements of all keys in the range `[first, last)`. + * + * If the key `*(first + i)` has a match in the set, copies its matched element to `(output_begin + + * i)`. Else, copies the empty key sentinel. Uses the CUDA Cooperative Groups API to leverage groups + * of multiple threads to find each key. This provides a significant boost in throughput compared to + * the non Cooperative Group `find` at moderate to high load factors. + * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize The size of the thread block + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the set's `key_type` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param n Number of keys to query + * @param output_begin Beginning of the sequence of matched elements retrieved for each key + * @param ref Non-owning set device ref used to access the slot storage + */ +template +__global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) +{ + namespace cg = cooperative_groups; + + auto const block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + __shared__ typename Ref::key_type output_buffer[BlockSize / CGSize]; + + while (idx - thread_idx < n) { // the whole thread block falls into the same iteration + if (idx < n) { + auto const key = *(first + idx); + if constexpr (CGSize == 1) { + auto const found = ref.find(key); + /* + * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased + * sector stores from L2 to 
global memory. By writing results to shared memory and then + * synchronizing before writing back to global, we no longer rely on L1, preventing the + * increase in sector stores from L2 to global and improving performance. + */ + output_buffer[thread_idx] = found == ref.end() ? ref.empty_key_sentinel() : *found; + block.sync(); + *(output_begin + idx) = output_buffer[thread_idx]; + } else { + auto const tile = cg::tiled_partition(block); + auto const found = ref.find(tile, key); + + if (tile.thread_rank() == 0) { + *(output_begin + idx) = found == ref.end() ? ref.empty_key_sentinel() : *found; + } + } + } + idx += loop_stride; + } +} + +} // namespace detail +} // namespace static_set_ns +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl new file mode 100644 index 000000000..4898f3055 --- /dev/null +++ b/include/cuco/detail/static_set/static_set.inl @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuco { +namespace experimental { + +template +constexpr static_set::static_set( + Extent capacity, + empty_key empty_key_sentinel, + KeyEqual const& pred, + ProbingScheme const& probing_scheme, + Allocator const& alloc, + cuda_stream_ref stream) + : impl_{std::make_unique( + capacity, empty_key_sentinel, empty_key_sentinel, pred, probing_scheme, alloc, stream)} +{ +} + +template +void static_set::clear( + cuda_stream_ref stream) noexcept +{ + impl_->clear(stream); +} + +template +void static_set::clear_async( + cuda_stream_ref stream) noexcept +{ + impl_->clear_async(stream); +} + +template +template +static_set::size_type +static_set::insert( + InputIt first, InputIt last, cuda_stream_ref stream) +{ + return impl_->insert(first, last, ref(op::insert), stream); +} + +template +template +void static_set::insert_async( + InputIt first, InputIt last, cuda_stream_ref stream) noexcept +{ + impl_->insert_async(first, last, ref(op::insert), stream); +} + +template +template +static_set::size_type +static_set::insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) +{ + return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream); +} + +template +template +void static_set::insert_if_async( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) noexcept +{ + impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream); +} + +template +template +void static_set::contains( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + contains_async(first, last, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_set::contains_async( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const noexcept +{ + impl_->contains_async(first, last, output_begin, ref(op::contains), stream); +} 
+ +template +template +void static_set::contains_if( + InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream) const +{ + contains_if_async(first, last, stencil, pred, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_set::contains_if_async( + InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream) const noexcept +{ + impl_->contains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream); +} + +template +template +void static_set::find( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + find_async(first, last, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_set::find_async( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); + + static_set_ns::detail::find + <<>>( + first, num_keys, output_begin, ref(op::find)); +} + +template +template +OutputIt static_set::retrieve_all( + OutputIt output_begin, cuda_stream_ref stream) const +{ + auto const begin = + thrust::make_transform_iterator(thrust::counting_iterator{0}, + detail::get_slot(impl_->storage_ref())); + auto const is_filled = static_set_ns::detail::slot_is_filled(this->empty_key_sentinel()); + + return impl_->retrieve_all(begin, output_begin, is_filled, stream); +} + +template +static_set::size_type +static_set::size( + cuda_stream_ref stream) const noexcept +{ + auto const is_filled = static_set_ns::detail::slot_is_filled(this->empty_key_sentinel()); + return impl_->size(is_filled, stream); +} + +template +constexpr auto +static_set::capacity() + const noexcept +{ + return impl_->capacity(); +} + +template +constexpr static_set::key_type 
+static_set::empty_key_sentinel() + const noexcept +{ + return impl_->empty_key_sentinel(); +} + +template +template +auto static_set::ref( + Operators...) const noexcept +{ + static_assert(sizeof...(Operators), "No operators specified"); + return ref_type{cuco::empty_key(this->empty_key_sentinel()), + impl_->key_eq(), + impl_->probing_scheme(), + impl_->storage_ref()}; +} +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/static_set_ref.inl b/include/cuco/detail/static_set/static_set_ref.inl new file mode 100644 index 000000000..4c3853971 --- /dev/null +++ b/include/cuco/detail/static_set/static_set_ref.inl @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +#include + +namespace cuco { +namespace experimental { + +template +__host__ __device__ constexpr static_set_ref< + Key, + Scope, + KeyEqual, + ProbingScheme, + StorageRef, + Operators...>::static_set_ref(cuco::empty_key empty_key_sentinel, + KeyEqual const& predicate, + ProbingScheme const& probing_scheme, + StorageRef storage_ref) noexcept + : impl_{empty_key_sentinel, probing_scheme, storage_ref}, + predicate_{empty_key_sentinel, predicate} +{ +} + +template +template +__host__ __device__ constexpr static_set_ref:: + static_set_ref( + static_set_ref&& + other) noexcept + : impl_{std::move(other.impl_)}, predicate_{std::move(other.predicate_)} +{ +} + +template +__host__ __device__ constexpr auto +static_set_ref::capacity() + const noexcept +{ + return impl_.capacity(); +} + +template +__host__ __device__ constexpr Key +static_set_ref::empty_key_sentinel() + const noexcept +{ + return predicate_.empty_sentinel_; +} + +template +template +auto static_set_ref::with( + NewOperators...) && noexcept +{ + return static_set_ref( + std::move(*this)); +} + +namespace detail { + +template +class operator_impl> { + using base_type = static_set_ref; + using ref_type = static_set_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Inserts an element. + * + * @param value The element to insert + * + * @return True if the given element is successfully inserted + */ + __device__ bool insert(value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = false; + return ref_.impl_.insert(value, ref_.predicate_); + } + + /** + * @brief Inserts an element. 
+ * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * + * @return True if the given element is successfully inserted + */ + __device__ bool insert(cooperative_groups::thread_block_tile const& group, + value_type const& value) noexcept + { + auto& ref_ = static_cast(*this); + auto constexpr has_payload = false; + return ref_.impl_.insert(group, value, ref_.predicate_); + } +}; + +template +class operator_impl> { + using base_type = static_set_ref; + using ref_type = static_set_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Returns a const_iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr const_iterator end() const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Inserts the given element into the set. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. 
+ * + * @param value The element to insert + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. + */ + __device__ thrust::pair insert_and_find(value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = false; + return ref_.impl_.insert_and_find(value, ref_.predicate_); + } + + /** + * @brief Inserts the given element into the set. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. + * + * @param group The Cooperative Group used to perform group insert_and_find + * @param value The element to insert + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. + */ + __device__ thrust::pair insert_and_find( + cooperative_groups::thread_block_tile const& group, value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = false; + return ref_.impl_.insert_and_find(group, value, ref_.predicate_); + } +}; + +template +class operator_impl> { + using base_type = static_set_ref; + using ref_type = static_set_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns + * false. 
+ * + * @tparam ProbeKey Probe key type + * + * @param key The key to search for + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains(ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.contains(key, ref_.predicate_); + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns + * false. + * + * @tparam ProbeKey Probe key type + * + * @param group The Cooperative Group used to perform group contains + * @param key The key to search for + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains( + cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.contains(group, key, ref_.predicate_); + } +}; + +template +class operator_impl> { + using base_type = static_set_ref; + using ref_type = static_set_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Returns a const_iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr const_iterator end() const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Returns an iterator to one past the last slot. 
+ * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Finds an element in the set with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. + * + * @tparam ProbeKey Probe key type + * + * @param key The key to search for + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator find(ProbeKey const& key) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + return ref_.impl_.find(key, ref_.predicate_); + } + + /** + * @brief Finds an element in the set with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. 
+ * + * @tparam ProbeKey Probe key type + * + * @param group The Cooperative Group used to perform this operation + * @param key The key to search for + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator find( + cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.find(group, key, ref_.predicate_); + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/aow_storage.inl b/include/cuco/detail/storage/aow_storage.inl new file mode 100644 index 000000000..c4b5fa8b6 --- /dev/null +++ b/include/cuco/detail/storage/aow_storage.inl @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace cuco { +namespace experimental { + +template +constexpr aow_storage::aow_storage( + Extent size, Allocator const& allocator) noexcept + : detail::aow_storage_base{size}, + allocator_{allocator}, + window_deleter_{capacity(), allocator_}, + windows_{allocator_.allocate(capacity()), window_deleter_} +{ +} + +template +constexpr aow_storage::window_type* +aow_storage::data() const noexcept +{ + return windows_.get(); +} + +template +constexpr aow_storage::allocator_type +aow_storage::allocator() const noexcept +{ + return allocator_; +} + +template +constexpr aow_storage::ref_type +aow_storage::ref() const noexcept +{ + return ref_type{this->window_extent(), this->data()}; +} + +template +void aow_storage::initialize(value_type key, + cuda_stream_ref stream) noexcept +{ + this->initialize_async(key, stream); + stream.synchronize(); +} + +template +void aow_storage::initialize_async( + value_type key, cuda_stream_ref stream) noexcept +{ + auto constexpr cg_size = 1; + auto constexpr stride = 4; + auto const grid_size = cuco::detail::grid_size(this->num_windows(), cg_size, stride); + + detail::initialize<<>>( + this->data(), this->num_windows(), key); +} + +template +__host__ __device__ constexpr aow_storage_ref::aow_storage_ref( + Extent size, window_type* windows) noexcept + : detail::aow_storage_base{size}, windows_{windows} +{ +} + +template +struct aow_storage_ref::iterator { + public: + using iterator_category = std::input_iterator_tag; ///< iterator category + using reference = value_type&; ///< iterator reference type + + /** + * @brief Constructs a device side input iterator of the given slot. 
+ * + * @param current The slot pointer + */ + __device__ constexpr explicit iterator(value_type* current) noexcept : current_{current} {} + + /** + * @brief Prefix increment operator + * + * @throw This code path should never be chosen. + * + * @return Current iterator + */ + __device__ constexpr iterator& operator++() noexcept + { + static_assert("Un-incrementable input iterator"); + } + + /** + * @brief Postfix increment operator + * + * @throw This code path should never be chosen. + * + * @return Current iterator + */ + __device__ constexpr iterator operator++(int32_t) noexcept + { + static_assert("Un-incrementable input iterator"); + } + + /** + * @brief Dereference operator + * + * @return Reference to the current slot + */ + __device__ constexpr reference operator*() const { return *current_; } + + /** + * @brief Access operator + * + * @return Pointer to the current slot + */ + __device__ constexpr value_type* operator->() const { return current_; } + + /** + * Equality operator + * + * @return True if two iterators are identical + */ + friend __device__ constexpr bool operator==(iterator const& lhs, iterator const& rhs) noexcept + { + return lhs.current_ == rhs.current_; + } + + /** + * Inequality operator + * + * @return True if two iterators are not identical + */ + friend __device__ constexpr bool operator!=(iterator const& lhs, iterator const& rhs) noexcept + { + return not(lhs == rhs); + } + + private: + value_type* current_{}; ///< Pointer to the current slot +}; + +template +__device__ constexpr aow_storage_ref::iterator +aow_storage_ref::end() noexcept +{ + return iterator{reinterpret_cast(this->data() + this->capacity())}; +} + +template +__device__ constexpr aow_storage_ref::const_iterator +aow_storage_ref::end() const noexcept +{ + return const_iterator{reinterpret_cast(this->data() + this->capacity())}; +} + +template +__device__ constexpr aow_storage_ref::window_type* +aow_storage_ref::data() noexcept +{ + return windows_; +} + +template 
+__device__ constexpr aow_storage_ref::window_type* +aow_storage_ref::data() const noexcept +{ + return windows_; +} + +template +__device__ constexpr aow_storage_ref::window_type +aow_storage_ref::operator[](size_type index) const noexcept +{ + return *reinterpret_cast( + __builtin_assume_aligned(this->data() + index, sizeof(value_type) * window_size)); +} + +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/aow_storage_base.cuh b/include/cuco/detail/storage/aow_storage_base.cuh new file mode 100644 index 000000000..5f3d84df4 --- /dev/null +++ b/include/cuco/detail/storage/aow_storage_base.cuh @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Window data structure type + * + * @tparam T Window slot type + * @tparam WindowSize Number of elements per window + */ +template +struct window : public cuda::std::array { + public: + static int32_t constexpr window_size = WindowSize; ///< Number of slots per window +}; + +/** + * @brief Base class of array of slot windows open addressing storage. + * + * @note This should NOT be used directly. 
+ * + * @tparam T Slot type + * @tparam WindowSize Number of slots in each window + * @tparam Extent Type of extent denoting the number of windows + */ +template +class aow_storage_base : public storage_base { + public: + /** + * @brief The number of elements (slots) processed per window. + */ + static constexpr int32_t window_size = WindowSize; + + using extent_type = typename storage_base::extent_type; ///< Storage extent type + using size_type = typename storage_base::size_type; ///< Storage size type + + using value_type = T; ///< Slot type + using window_type = window; ///< Slot window type + + /** + * @brief Constructor of AoW base storage. + * + * @param size Number of windows to store + */ + __host__ __device__ explicit constexpr aow_storage_base(Extent size) : storage_base{size} + { + } + + /** + * @brief Gets the total number of slot windows in the current storage. + * + * @return The total number of slot windows + */ + [[nodiscard]] __host__ __device__ constexpr size_type num_windows() const noexcept + { + return storage_base::capacity(); + } + + /** + * @brief Gets the total number of slots in the current storage. + * + * @return The total number of slots + */ + [[nodiscard]] __host__ __device__ constexpr size_type capacity() const noexcept + { + return storage_base::capacity() * window_size; + } + + /** + * @brief Gets the window extent of the current storage. + * + * @return The window extent. + */ + [[nodiscard]] __host__ __device__ constexpr extent_type window_extent() const noexcept + { + return storage_base::extent(); + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/counter_storage.cuh b/include/cuco/detail/storage/counter_storage.cuh new file mode 100644 index 000000000..bb36b15e2 --- /dev/null +++ b/include/cuco/detail/storage/counter_storage.cuh @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Device atomic counter storage class. + * + * @tparam SizeType Type of storage size + * @tparam Scope The scope in which the counter will be used by individual threads + * @tparam Allocator Type of allocator used for device storage + */ +template +class counter_storage : public storage_base> { + public: + using storage_base>::capacity; ///< Storage size + + using size_type = SizeType; ///< Size type + using value_type = cuda::atomic; ///< Type of the counter + using allocator_type = typename std::allocator_traits::rebind_alloc< + value_type>; ///< Type of the allocator to (de)allocate counter + using counter_deleter_type = + custom_deleter; ///< Type of counter deleter + + /** + * @brief Constructor of counter storage. + * + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr counter_storage(Allocator const& allocator) + : storage_base>{cuco::experimental::extent{}}, + allocator_{allocator}, + counter_deleter_{this->capacity(), allocator_}, + counter_{allocator_.allocate(this->capacity()), counter_deleter_} + { + } + + /** + * @brief Asynchronously resets counter to zero. 
+ * + * @param stream CUDA stream used to reset + */ + void reset(cuda_stream_ref stream) + { + static_assert(sizeof(size_type) == sizeof(value_type)); + CUCO_CUDA_TRY(cudaMemsetAsync(this->data(), 0, sizeof(value_type), stream)); + } + + /** + * @brief Gets device atomic counter pointer. + * + * @return Pointer to the device atomic counter + */ + [[nodiscard]] constexpr value_type* data() noexcept { return counter_.get(); } + + /** + * @brief Gets device atomic counter pointer. + * + * @return Pointer to the device atomic counter + */ + [[nodiscard]] constexpr value_type* data() const noexcept { return counter_.get(); } + + /** + * @brief Atomically obtains the value of the device atomic counter and copies it to the host. + * + * @note This API synchronizes the given `stream`. + * + * @param stream CUDA stream used to copy device value to the host + * @return Value of the atomic counter + */ + [[nodiscard]] constexpr size_type load_to_host(cuda_stream_ref stream) const + { + size_type h_count; + CUCO_CUDA_TRY( + cudaMemcpyAsync(&h_count, this->data(), sizeof(size_type), cudaMemcpyDeviceToHost, stream)); + stream.synchronize(); + return h_count; + } + + private: + allocator_type allocator_; ///< Allocator used to (de)allocate counter + counter_deleter_type counter_deleter_; ///< Custom counter deleter + std::unique_ptr counter_; ///< Pointer to counter storage +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/kernels.cuh b/include/cuco/detail/storage/kernels.cuh new file mode 100644 index 000000000..2a5868f61 --- /dev/null +++ b/include/cuco/detail/storage/kernels.cuh @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Initializes each slot in the window storage to contain `value`. + * + * @tparam WindowT Window type + * + * @param windows Pointer to flat storage for windows + * @param n Number of input windows + * @param value Value to which all values in `slots` are initialized + */ +template +__global__ void initialize(WindowT* windows, + cuco::detail::index_type n, + typename WindowT::value_type value) +{ + auto const loop_stride = cuco::detail::grid_stride(); + auto idx = cuco::detail::global_thread_id(); + + while (idx < n) { + auto& window_slots = *(windows + idx); +#pragma unroll + for (auto& slot : window_slots) { + slot = value; + } + idx += loop_stride; + } +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/storage.cuh b/include/cuco/detail/storage/storage.cuh new file mode 100644 index 000000000..4dda179c9 --- /dev/null +++ b/include/cuco/detail/storage/storage.cuh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+namespace cuco {
+namespace experimental {
+namespace detail {
+/**
+ * @brief Intermediate class internally used by data structures
+ *
+ * @tparam StorageImpl Storage implementation class
+ * @tparam T Storage element type
+ * @tparam Extent Type of extent denoting number of windows
+ * @tparam Allocator Type of allocator used for device storage
+ */
+template
+class storage : StorageImpl::template impl {
+ public:
+ /// Storage implementation type
+ using impl_type = typename StorageImpl::template impl;
+ using ref_type = typename impl_type::ref_type; ///< Storage ref type
+ using value_type = typename impl_type::value_type; ///< Storage value type
+ using allocator_type = typename impl_type::allocator_type; ///< Storage allocator type
+
+ /// Number of elements per window
+ static constexpr int window_size = impl_type::window_size;
+
+ using impl_type::allocator;
+ using impl_type::capacity;
+ using impl_type::data;
+ using impl_type::initialize;
+ using impl_type::initialize_async;
+ using impl_type::num_windows;
+ using impl_type::ref;
+
+ /**
+ * @brief Constructs storage.
+ * + * @param size Number of slots to (de)allocate + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr storage(Extent size, Allocator const& allocator) : impl_type{size, allocator} + { + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/storage_base.cuh b/include/cuco/detail/storage/storage_base.cuh new file mode 100644 index 000000000..98eed6c13 --- /dev/null +++ b/include/cuco/detail/storage/storage_base.cuh @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Custom deleter for unique pointer. + * + * @tparam SizeType Type of device storage size + * @tparam Allocator Type of allocator used for device storage + */ +template +struct custom_deleter { + using pointer = typename Allocator::value_type*; ///< Value pointer type + + /** + * @brief Constructor of custom deleter. 
+ * + * @param size Number of values to deallocate + * @param allocator Allocator used for deallocating device storage + */ + explicit constexpr custom_deleter(SizeType size, Allocator& allocator) + : size_{size}, allocator_{allocator} + { + } + + /** + * @brief Operator for deallocation + * + * @param ptr Pointer to the first value for deallocation + */ + void operator()(pointer ptr) { allocator_.deallocate(ptr, size_); } + + SizeType size_; ///< Number of values to delete + Allocator& allocator_; ///< Allocator used deallocating values +}; + +/** + * @brief Base class of open addressing storage. + * + * This class should not be used directly. + * + * @tparam Extent Type of extent denoting storage capacity + */ +template +class storage_base { + public: + using extent_type = Extent; ///< Storage extent type + using size_type = typename extent_type::value_type; ///< Storage size type + + /** + * @brief Constructor of base storage. + * + * @param size Number of elements to (de)allocate + */ + __host__ __device__ explicit constexpr storage_base(Extent size) : extent_{size} {} + + /** + * @brief Gets the total number of elements in the current storage. + * + * @return The total number of elements + */ + [[nodiscard]] __host__ __device__ constexpr size_type capacity() const noexcept + { + return static_cast(extent_); + } + + /** + * @brief Gets the extent of the current storage. + * + * @return The extent. + */ + [[nodiscard]] __host__ __device__ constexpr extent_type extent() const noexcept + { + return extent_; + } + + protected: + extent_type extent_; ///< Total number of elements +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/traits.hpp b/include/cuco/detail/traits.hpp new file mode 100644 index 000000000..313f95430 --- /dev/null +++ b/include/cuco/detail/traits.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +#include +#include + +#include + +#include + +namespace cuco::detail { + +template +struct is_std_pair_like : cuda::std::false_type { +}; + +template +struct is_std_pair_like(cuda::std::declval())), + decltype(std::get<1>(cuda::std::declval()))>> + : cuda::std:: + conditional_t::value == 2, cuda::std::true_type, cuda::std::false_type> { +}; + +template +struct is_thrust_pair_like_impl : cuda::std::false_type { +}; + +template +struct is_thrust_pair_like_impl< + T, + cuda::std::void_t(cuda::std::declval())), + decltype(thrust::get<1>(cuda::std::declval()))>> + : cuda::std::conditional_t::value == 2, + cuda::std::true_type, + cuda::std::false_type> { +}; + +template +struct is_thrust_pair_like + : is_thrust_pair_like_impl()))>> { +}; + +} // namespace cuco::detail diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh new file mode 100644 index 000000000..8383669fc --- /dev/null +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -0,0 +1,375 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +#include + +#include +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Struct to store ranks of bits at 256-bit intervals (or blocks) + * + * This struct encodes a list of four rank values using base + offset format + * e.g. [1000, 1005, 1006, 1009] is stored as base = 1000, offsets = [5, 6, 9] + * base uses 40 bits, split between one uint32_t and one uint8_t + * each offset uses 8 bits + */ +struct rank { + uint32_t base_hi_; ///< Upper 32 bits of base + uint8_t base_lo_; ///< Lower 8 bits of base + cuda::std::array offsets_; ///< Offsets for 64-bit sub-intervals, relative to base + + /** + * @brief Gets base rank of current 256-bit interval + * + * @return The base rank + */ + __host__ __device__ constexpr uint64_t base() const noexcept + { + return (static_cast(base_hi_) << CHAR_BIT) | base_lo_; + } + + /** + * @brief Sets base rank of current 256-bit interval + * + * @param base Base rank + */ + __host__ __device__ constexpr void set_base(uint64_t base) noexcept + { + base_hi_ = static_cast(base >> CHAR_BIT); + base_lo_ = static_cast(base); + } +}; + +/** + * @brief Bitset class with rank and select index structures + * + * In addition to standard bitset set/test operations, this class provides + * rank and select operation API. It maintains index structures to make both these + * new operations close to constant time. 
+ *
+ * Current limitations:
+ * - Stream controls are partially supported due to the use of `thrust::device_vector` as storage
+ * - Device ref doesn't support modifiers like `set`, `reset`, etc.
+ *
+ * @tparam Allocator Type of allocator used for device storage
+ */
+// TODO: have to use device_malloc_allocator for now otherwise the container cannot grow
+template >
+class dynamic_bitset {
+ public:
+ using size_type = std::size_t; ///< size type to specify bit index
+ using word_type = uint64_t; ///< word type
+ /// Type of the allocator to (de)allocate words
+ using allocator_type = typename std::allocator_traits::rebind_alloc;
+
+ /// Number of words per block. Note this is a tradeoff between space efficiency and perf.
+ static constexpr size_type words_per_block = 4;
+ /// Number of bits in a word
+ static constexpr size_type bits_per_word = sizeof(word_type) * CHAR_BIT;
+ /// Number of bits in a block
+ static constexpr size_type bits_per_block = words_per_block * bits_per_word;
+
+ /**
+ * @brief Constructs an empty bitset
+ *
+ * @param allocator Allocator used for allocating device storage
+ */
+ constexpr dynamic_bitset(Allocator const& allocator = Allocator{});
+
+ /**
+ * @brief Appends the given element `value` to the end of the bitset
+ *
+ * This API may involve data reallocation if the current storage is exhausted.
+ *
+ * @param value Boolean value of the new bit to be added
+ */
+ constexpr void push_back(bool value) noexcept;
+
+ /**
+ * @brief Sets the target bit indexed by `index` to a specified `value`.
+ *
+ * @param index Position of bit to be modified
+ * @param value New value of the target bit
+ */
+ constexpr void set(size_type index, bool value) noexcept;
+
+ /**
+ * @brief Sets the last bit to a specified value
+ *
+ * @param value New value of the last bit
+ */
+ constexpr void set_last(bool value) noexcept;
+
+ /**
+ * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the
+ * boolean value at position `keys_begin[i]` to `output_begin[i]`.
+ *
+ * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's
+ * `size_type`
+ * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean
+ * type
+ *
+ * @param keys_begin Begin iterator to keys list whose values are queried
+ * @param keys_end End iterator to keys list
+ * @param outputs_begin Begin iterator to outputs of test operation
+ * @param stream Stream to execute test kernel
+ */
+ template
+ constexpr void test(KeyIt keys_begin,
+ KeyIt keys_end,
+ OutputIt outputs_begin,
+ cuda_stream_ref stream = {}) noexcept;
+
+ /**
+ * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores total
+ * count of `1` bits preceding (but not including) position `keys_begin[i]` to `output_begin[i]`.
+ * + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param keys_begin Begin iterator to keys list whose ranks are queried + * @param keys_end End iterator to keys list + * @param outputs_begin Begin iterator to outputs ranks list + * @param stream Stream to execute ranks kernel + */ + template + constexpr void rank(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) noexcept; + + /** + * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the + * position of `keys_begin[i]`th `1` bit to `output_begin[i]`. + * + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param keys_begin Begin iterator to keys list whose select values are queried + * @param keys_end End iterator to keys list + * @param outputs_begin Begin iterator to outputs selects list + * @param stream Stream to execute selects kernel + */ + template + constexpr void select(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) noexcept; + + using rank_type = cuco::experimental::detail::rank; ///< Rank type + + /** + *@brief Struct to hold all storage refs needed by reference + */ + // TODO: this is not a real ref type, to be changed + struct storage_ref_type { + const word_type* words_ref_; ///< Words ref + + const rank_type* ranks_true_ref_; ///< Ranks ref for 1 bits + const size_type* selects_true_ref_; ///< Selects ref for 1 bits + + const rank_type* ranks_false_ref_; ///< Ranks ref for 0 bits + const size_type* selects_false_ref_; ///< Selects ref 0 bits + }; + + /** + * @brief Device non-owning reference type of dynamic_bitset + 
*/ + class reference { + public: + /** + * @brief Constructs a reference + * + * @param storage Struct with non-owning refs to bitset storage arrays + */ + __host__ __device__ explicit constexpr reference(storage_ref_type storage) noexcept; + + /** + * @brief Access value of a single bit + * + * @param key Position of bit + * + * @return Value of bit at position specified by key + */ + [[nodiscard]] __device__ constexpr bool test(size_type key) const noexcept; + + /** + * @brief Access a single word of internal storage + * + * @param word_id Index of word + * + * @return Word at position specified by index + */ + [[nodiscard]] __device__ constexpr word_type word(size_type word_id) const noexcept; + + /** + * @brief Find position of first set bit starting from a given position (inclusive) + * + * @param key Position of starting bit + * + * @return Index of next set bit + */ + [[nodiscard]] __device__ size_type find_next(size_type key) const noexcept; + + /** + * @brief Find number of set bits (rank) in all positions before the input position (exclusive) + * + * @param key Input bit position + * + * @return Rank of input position + */ + [[nodiscard]] __device__ constexpr size_type rank(size_type key) const noexcept; + + /** + * @brief Find position of Nth set (1) bit counting from start + * + * @param count Input N + * + * @return Position of Nth set bit + */ + [[nodiscard]] __device__ constexpr size_type select(size_type count) const noexcept; + + /** + * @brief Find position of Nth not-set (0) bit counting from start + * + * @param count Input N + * + * @return Position of Nth not-set bit + */ + [[nodiscard]] __device__ constexpr size_type select_false(size_type count) const noexcept; + + private: + /** + * @brief Helper function for select operation that computes an initial rank estimate + * + * @param count Input count for which select operation is being performed + * @param selects Selects array + * @param ranks Ranks array + * + * @return index in ranks which 
corresponds to highest rank less than count (least upper bound) + */ + template + [[nodiscard]] __device__ constexpr size_type initial_rank_estimate( + size_type count, const SelectsRef& selects, const RanksRef& ranks) const noexcept; + + /** + * @brief Subtract rank estimate from input count and return an increment to word_id + * + * @tparam Rank type + * + * @param count Input count that will be updated + * @param rank Initial rank estimate for count + * + * @return Increment to word_id based on rank values + */ + template + [[nodiscard]] __device__ constexpr size_type subtract_rank_from_count(size_type& count, + Rank rank) const noexcept; + + /** + * @brief Find position of Nth set bit in a 64-bit word + * + * @param N Input count + * + * @return Position of Nth set bit + */ + [[nodiscard]] __device__ size_type select_bit_in_word(size_type N, + word_type word) const noexcept; + + storage_ref_type storage_; ///< Non-owning storage + }; + + using ref_type = reference; ///< Non-owning container ref type + + /** + * @brief Gets non-owning device ref of the current object + * + * @return Device ref of the current `dynamic_bitset` object + */ + [[nodiscard]] constexpr ref_type ref() const noexcept; + + /** + * @brief Gets the number of bits dynamic_bitset holds + * + * @return Number of bits dynamic_bitset holds + */ + [[nodiscard]] constexpr size_type size() const noexcept; + + private: + /// Type of the allocator to (de)allocate ranks + using rank_allocator_type = typename std::allocator_traits::rebind_alloc; + /// Type of the allocator to (de)allocate indices + using size_allocator_type = typename std::allocator_traits::rebind_alloc; + + allocator_type allocator_; ///< Words allocator + size_type n_bits_; ///< Number of bits dynamic_bitset currently holds + bool is_built_; ///< Flag indicating whether the rank and select indices are built or not + + /// Words vector that represents all bits + thrust::device_vector words_; + /// Rank values for every 256-th bit 
(4-th word) + thrust::device_vector ranks_true_; + /// Same as ranks_ but for `0` bits + thrust::device_vector ranks_false_; + /// Block indices of (0, 256, 512...)th `1` bit + thrust::device_vector selects_true_; + /// Same as selects_, but for `0` bits + thrust::device_vector selects_false_; + + /** + * @brief Builds indexes for rank and select + * + * @param stream Stream to execute kernels + */ + constexpr void build(cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Populates rank and select indexes for true or false bits + * + * @param ranks Output array of ranks + * @param selects Output array of selects + * @param flip_bits If true, negate bits to construct indexes for false bits + * @param stream Stream to execute kernels + */ + constexpr void build_ranks_and_selects( + thrust::device_vector& ranks, + thrust::device_vector& selects, + bool flip_bits, + cuda_stream_ref stream = {}); +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl new file mode 100644 index 000000000..d56ef9d7c --- /dev/null +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -0,0 +1,404 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +template +constexpr dynamic_bitset::dynamic_bitset(Allocator const& allocator) + : allocator_{allocator}, + n_bits_{0}, + is_built_{false}, + words_{allocator}, + ranks_true_{allocator}, + ranks_false_{allocator}, + selects_true_{allocator}, + selects_false_{allocator} +{ +} + +template +constexpr void dynamic_bitset::push_back(bool bit) noexcept +{ + if (n_bits_ % bits_per_block == 0) { + words_.resize(words_.size() + words_per_block); // Extend storage by one block + } + + set(n_bits_++, bit); +} + +template +constexpr void dynamic_bitset::set(size_type index, bool bit) noexcept +{ + is_built_ = false; + size_type word_id = index / bits_per_word; + size_type bit_id = index % bits_per_word; + if (bit) { + words_[word_id] |= 1UL << bit_id; + } else { + words_[word_id] &= ~(1UL << bit_id); + } +} + +template +constexpr void dynamic_bitset::set_last(bool bit) noexcept +{ + set(n_bits_ - 1, bit); +} + +template +template +constexpr void dynamic_bitset::test(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) noexcept + +{ + build(); + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys); + + bitset_test_kernel<<>>( + ref(), keys_begin, outputs_begin, num_keys); +} + +template +template +constexpr void dynamic_bitset::rank(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) noexcept +{ + build(); + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys); + + bitset_rank_kernel<<>>( + ref(), keys_begin, outputs_begin, num_keys); +} + +template +template +constexpr void dynamic_bitset::select(KeyIt keys_begin, + KeyIt keys_end, + 
OutputIt outputs_begin, + cuda_stream_ref stream) noexcept + +{ + build(); + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys); + + bitset_select_kernel<<>>( + ref(), keys_begin, outputs_begin, num_keys); +} + +template +constexpr void dynamic_bitset::build_ranks_and_selects( + thrust::device_vector& ranks, + thrust::device_vector& selects, + bool flip_bits, + cuda_stream_ref stream) +{ + if (n_bits_ == 0) { return; } + + // Step 1. Compute prefix sum of per-word bit counts + // Population counts for each word + size_type const num_words = words_.size(); + // Sized to have one extra entry for subsequent prefix sum + auto const bit_counts_size = num_words + 1; + + thrust::device_vector bit_counts(num_words + 1, this->allocator_); + auto const bit_counts_begin = thrust::raw_pointer_cast(bit_counts.data()); + + auto grid_size = cuco::detail::grid_size(num_words); + bit_counts_kernel<<>>( + thrust::raw_pointer_cast(words_.data()), bit_counts_begin, num_words, flip_bits); + + std::size_t temp_storage_bytes = 0; + using temp_allocator_type = typename std::allocator_traits::rebind_alloc; + auto temp_allocator = temp_allocator_type{this->allocator_}; + + CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, bit_counts_begin, bit_counts_begin, bit_counts_size, stream)); + + // Allocate temporary storage + auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum(thrust::raw_pointer_cast(d_temp_storage), + temp_storage_bytes, + bit_counts_begin, + bit_counts_begin, + bit_counts_size, + stream)); + + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); + + // Step 2. 
Compute ranks + auto const num_blocks = (num_words - 1) / words_per_block + 2; + ranks.resize(num_blocks); + + grid_size = cuco::detail::grid_size(num_blocks); + encode_ranks_from_prefix_bit_counts<<>>( + bit_counts_begin, + thrust::raw_pointer_cast(ranks.data()), + num_words, + num_blocks, + words_per_block); + + // Step 3. Compute selects + thrust::device_vector select_markers(num_blocks, + this->allocator_); + auto const select_markers_begin = thrust::raw_pointer_cast(select_markers.data()); + + mark_blocks_with_select_entries<<>>( + bit_counts_begin, select_markers_begin, num_blocks, words_per_block, bits_per_block); + + auto d_sum = reinterpret_cast(thrust::raw_pointer_cast( + std::allocator_traits::allocate(temp_allocator, sizeof(size_type)))); + CUCO_CUDA_TRY(cub::DeviceReduce::Sum( + nullptr, temp_storage_bytes, select_markers_begin, d_sum, num_blocks, stream)); + + d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceReduce::Sum(thrust::raw_pointer_cast(d_temp_storage), + temp_storage_bytes, + select_markers_begin, + d_sum, + num_blocks, + stream)); + + size_type num_selects{}; + CUCO_CUDA_TRY( + cudaMemcpyAsync(&num_selects, d_sum, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); + stream.synchronize(); + std::allocator_traits::deallocate( + temp_allocator, thrust::device_ptr{reinterpret_cast(d_sum)}, sizeof(size_type)); + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); + + selects.resize(num_selects); + + auto const select_begin = thrust::raw_pointer_cast(selects.data()); + + CUCO_CUDA_TRY(cub::DeviceSelect::Flagged(nullptr, + temp_storage_bytes, + thrust::make_counting_iterator(0UL), + select_markers_begin, + select_begin, + thrust::make_discard_iterator(), + num_blocks, + stream)); + + d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceSelect::Flagged(thrust::raw_pointer_cast(d_temp_storage), + temp_storage_bytes, + thrust::make_counting_iterator(0UL), + 
select_markers_begin, + select_begin, + thrust::make_discard_iterator(), + num_blocks, + stream)); + + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); +} + +template +constexpr void dynamic_bitset::build(cuda_stream_ref stream) noexcept +{ + if (not is_built_) { + build_ranks_and_selects(ranks_true_, selects_true_, false, stream); // 1 bits + build_ranks_and_selects(ranks_false_, selects_false_, true, stream); // 0 bits + is_built_ = true; + } +} + +template +constexpr dynamic_bitset::ref_type dynamic_bitset::ref() const noexcept +{ + return ref_type{storage_ref_type{thrust::raw_pointer_cast(words_.data()), + thrust::raw_pointer_cast(ranks_true_.data()), + thrust::raw_pointer_cast(selects_true_.data()), + thrust::raw_pointer_cast(ranks_false_.data()), + thrust::raw_pointer_cast(selects_false_.data())}}; +} + +template +constexpr dynamic_bitset::size_type dynamic_bitset::size() const noexcept +{ + return n_bits_; +} + +// Device reference implementations + +template +__host__ __device__ constexpr dynamic_bitset::reference::reference( + storage_ref_type storage) noexcept + : storage_{storage} +{ +} + +template +__device__ constexpr bool dynamic_bitset::reference::test(size_type key) const noexcept +{ + return (storage_.words_ref_[key / bits_per_word] >> (key % bits_per_word)) & 1UL; +} + +template +__device__ constexpr typename dynamic_bitset::word_type +dynamic_bitset::reference::word(size_type word_id) const noexcept +{ + return storage_.words_ref_[word_id]; +} + +template +__device__ typename dynamic_bitset::size_type +dynamic_bitset::reference::find_next(size_type key) const noexcept +{ + size_type word_id = key / bits_per_word; + size_type bit_id = key % bits_per_word; + word_type word = storage_.words_ref_[word_id]; + word &= ~(0UL) << bit_id; + while (word == 0) { + word = storage_.words_ref_[++word_id]; + } + return word_id * bits_per_word + __ffsll(word) - 1; // cuda intrinsic +} + +template +__device__ constexpr typename 
dynamic_bitset::size_type +dynamic_bitset::reference::rank(size_type key) const noexcept +{ + size_type word_id = key / bits_per_word; + size_type bit_id = key % bits_per_word; + size_type rank_id = word_id / words_per_block; + size_type offset_id = word_id % words_per_block; + + auto rank = storage_.ranks_true_ref_[rank_id]; + size_type n = rank.base(); + + if (offset_id != 0) { n += rank.offsets_[offset_id - 1]; } + + n += cuda::std::popcount(storage_.words_ref_[word_id] & ((1UL << bit_id) - 1)); + + return n; +} + +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::select(size_type count) const noexcept +{ + auto rank_id = initial_rank_estimate(count, storage_.selects_true_ref_, storage_.ranks_true_ref_); + auto rank = storage_.ranks_true_ref_[rank_id]; + + size_type word_id = rank_id * words_per_block; + word_id += subtract_rank_from_count(count, rank); + + return word_id * bits_per_word + select_bit_in_word(count, storage_.words_ref_[word_id]); +} + +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::select_false(size_type count) const noexcept +{ + auto rank_id = + initial_rank_estimate(count, storage_.selects_false_ref_, storage_.ranks_false_ref_); + auto rank = storage_.ranks_false_ref_[rank_id]; + + size_type word_id = rank_id * words_per_block; + word_id += subtract_rank_from_count(count, rank); + + return word_id * bits_per_word + select_bit_in_word(count, ~(storage_.words_ref_[word_id])); +} + +template +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::initial_rank_estimate(size_type count, + SelectsRef const& selects, + RanksRef const& ranks) const noexcept +{ + size_type block_id = count / (bits_per_word * words_per_block); + size_type begin = selects[block_id]; + size_type end = selects[block_id + 1] + 1UL; + + if (begin + 10 >= end) { // Linear search + while (count >= ranks[begin + 1].base()) { + ++begin; + } + } else { 
// Binary search + while (begin + 1 < end) { + size_type middle = (begin + end) / 2; + if (count < ranks[middle].base()) { + end = middle; + } else { + begin = middle; + } + } + } + return begin; +} + +template +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::subtract_rank_from_count(size_type& count, + Rank rank) const noexcept +{ + count -= rank.base(); + + bool a0 = count >= rank.offsets_[0]; + bool a1 = count >= rank.offsets_[1]; + bool a2 = count >= rank.offsets_[2]; + size_type inc = a0 + a1 + a2; + + count -= (inc > 0) * rank.offsets_[inc - (inc > 0)]; + + return inc; +} + +template +__device__ typename dynamic_bitset::size_type +dynamic_bitset::reference::select_bit_in_word(size_type N, word_type word) const noexcept +{ + for (size_type pos = 0; pos < N; pos++) { + word &= word - 1; + } + return __ffsll(word & -word) - 1; // cuda intrinsic +} +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh new file mode 100644 index 000000000..c92ab60b2 --- /dev/null +++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh @@ -0,0 +1,240 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/* + * @brief Test bits for a range of keys + * + * @tparam BitsetRef Bitset reference type + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean + * type + * + * @param ref Bitset ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitset_test_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + outputs[key_id] = ref.test(keys[key_id]); + key_id += stride; + } +} + +/* + * @brief Gather rank values for a range of keys + * + * @tparam BitsetRef Bitset reference type + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param ref Bitset ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitset_rank_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + outputs[key_id] = ref.rank(keys[key_id]); + key_id += stride; + } +} + +/* + * @brief Gather select values for a range of keys + * + * @tparam BitsetRef Bitset reference type + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt 
Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param ref Bitset ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitset_select_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + outputs[key_id] = ref.select(keys[key_id]); + key_id += stride; + } +} + +/* + * @brief Computes number of set or not-set bits in each word + * + * @tparam WordType Word type + * @tparam SizeType Size type + * + * @param words Input array of words + * @param bit_counts Output array of per-word bit counts + * @param num_words Number of words + * @param flip_bits Boolean to request negation of words before counting bits + */ +template +__global__ void bit_counts_kernel(WordType const* words, + SizeType* bit_counts, + cuco::detail::index_type num_words, + bool flip_bits) +{ + auto word_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (word_id < num_words) { + auto word = words[word_id]; + bit_counts[word_id] = cuda::std::popcount(flip_bits ? ~word : word); + word_id += stride; + } +} + +/* + * @brief Compute rank values at block size intervals. + * + * ranks[i] = Number of set bits in [0, i) range + * This kernel transforms prefix sum array of per-word bit counts + * into base-delta encoding style of `rank` struct. + * Since prefix sum is available, there are no dependencies across blocks. 
+
+ * @tparam SizeType Size type
+ *
+ * @param prefix_bit_counts Prefix sum array of per-word bit counts
+ * @param ranks Output array of ranks
+ * @param num_words Length of input array
+ * @param num_blocks Length of output array
+ * @param words_per_block Number of words in each block
+ */
+template 
+__global__ void encode_ranks_from_prefix_bit_counts(const SizeType* prefix_bit_counts,
+                                                    rank* ranks,
+                                                    SizeType num_words,
+                                                    SizeType num_blocks,
+                                                    SizeType words_per_block)
+{
+  auto rank_id      = cuco::detail::global_thread_id();
+  auto const stride = cuco::detail::grid_stride();
+
+  while (rank_id < num_blocks) {
+    SizeType word_id = rank_id * words_per_block;
+
+    // Set base value of rank
+    auto& rank = ranks[rank_id];
+    rank.set_base(prefix_bit_counts[word_id]);
+
+    if (rank_id < num_blocks - 1) {
+      // For each subsequent word in this block, compute deltas from base
+      for (SizeType block_offset = 0; block_offset < words_per_block - 1; block_offset++) {
+        auto delta = prefix_bit_counts[word_id + block_offset + 1] - prefix_bit_counts[word_id];
+        rank.offsets_[block_offset] = delta;
+      }
+    }
+    rank_id += stride;
+  }
+}
+
+/*
+ * @brief Compute select values at block size intervals.
+ *
+ * selects[i] = Position of the (i + 1)-th set bit
+ * This kernel checks for blocks where the prefix sum crosses a multiple of `bits_per_block`.
+ * Such blocks are marked in the output boolean array
+ *
+ * @tparam SizeType Size type
+ *
+ * @param prefix_bit_counts Prefix sum array of per-word bit counts
+ * @param select_markers Output array indicating whether a block has a selects entry or not
+ * @param num_blocks Length of output array
+ * @param words_per_block Number of words in each block
+ * @param bits_per_block Number of bits in each block
+ */
+template 
+__global__ void mark_blocks_with_select_entries(SizeType const* prefix_bit_counts,
+                                                SizeType* select_markers,
+                                                SizeType num_blocks,
+                                                SizeType words_per_block,
+                                                SizeType bits_per_block)
+{
+  auto block_id     = cuco::detail::global_thread_id();
+  auto const stride = cuco::detail::grid_stride();
+
+  while (block_id < num_blocks) {
+    if (block_id == 0) {  // Block 0 always has a selects entry
+      select_markers[block_id] = 1;
+      block_id += stride;
+      continue;
+    }
+
+    select_markers[block_id] = 0;  // Always clear marker first
+    SizeType word_id    = block_id * words_per_block;
+    SizeType prev_count = prefix_bit_counts[word_id];
+
+    for (size_t block_offset = 1; block_offset <= words_per_block; block_offset++) {
+      SizeType count = prefix_bit_counts[word_id + block_offset];
+
+      // Selects entry is added when cumulative bitcount crosses a multiple of bits_per_block
+      if ((prev_count - 1) / bits_per_block != (count - 1) / bits_per_block) {
+        select_markers[block_id] = 1;
+        break;
+      }
+      prev_count = count;
+    }
+
+    block_id += stride;
+  }
+}
+
+}  // namespace detail
+}  // namespace experimental
+}  // namespace cuco
diff --git a/include/cuco/detail/utility/cuda.cuh b/include/cuco/detail/utility/cuda.cuh
new file mode 100644
index 000000000..6e5f13ff7
--- /dev/null
+++ b/include/cuco/detail/utility/cuda.cuh
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +#include + +namespace cuco { +namespace detail { + +/** + * @brief Returns the global thread index in a 1D scalar grid + * + * @return The global thread index + */ +__device__ static index_type global_thread_id() noexcept +{ + return index_type{threadIdx.x} + index_type{blockDim.x} * index_type{blockIdx.x}; +} + +/** + * @brief Returns the grid stride of a 1D grid + * + * @return The grid stride + */ +__device__ static index_type grid_stride() noexcept +{ + return index_type{gridDim.x} * index_type{blockDim.x}; +} + +} // namespace detail +} // namespace cuco diff --git a/include/cuco/detail/utility/cuda.hpp b/include/cuco/detail/utility/cuda.hpp new file mode 100644 index 000000000..f6a84df98 --- /dev/null +++ b/include/cuco/detail/utility/cuda.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + */ + +#pragma once + +#include + +namespace cuco { +namespace detail { + +using index_type = int64_t; ///< CUDA thread index type + +/// Default block size +constexpr int32_t default_block_size() noexcept { return 128; } +/// Default stride +constexpr int32_t default_stride() noexcept { return 1; } + +/** + * @brief Computes the desired 1D grid size with the given parameters + * + * @param num Number of elements to handle in the kernel + * @param cg_size Number of threads per CUDA Cooperative Group + * @param stride Number of elements to be handled by each thread + * @param block_size Number of threads in each thread block + * + * @return The resulting grid size + */ +constexpr auto grid_size(index_type num, + int32_t cg_size = 1, + int32_t stride = default_stride(), + int32_t block_size = default_block_size()) noexcept +{ + return int_div_ceil(cg_size * num, stride * block_size); +} + +} // namespace detail +} // namespace cuco diff --git a/include/cuco/detail/utility/math.hpp b/include/cuco/detail/utility/math.hpp new file mode 100644 index 000000000..47484d6ad --- /dev/null +++ b/include/cuco/detail/utility/math.hpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + */ + +#pragma once + +#include + +namespace cuco { +namespace detail { + +/** + * @brief Ceiling of an integer division + * + * @tparam T Type of dividend + * @tparam U Type of divisor + * + * @throw If `T` is not an integral type + * @throw If `U` is not an integral type + * + * @param dividend Numerator + * @param divisor Denominator + * + * @return Ceiling of the integer division + */ +template +constexpr T int_div_ceil(T dividend, U divisor) noexcept +{ + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); + return (dividend + divisor - 1) / divisor; +} + +} // namespace detail +} // namespace cuco diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh index 5b02cef96..22675d496 100644 --- a/include/cuco/detail/utils.cuh +++ b/include/cuco/detail/utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,8 +15,14 @@ #pragma once +#include + #include +#include +#include +#include + namespace cuco { namespace detail { @@ -59,7 +65,7 @@ struct slot_to_tuple { */ template struct slot_is_filled { - Key empty_key_sentinel; ///< The value of the empty key sentinel + Key empty_key_sentinel_; ///< The value of the empty key sentinel /** * @brief Indicates if the target slot `s` is filled. @@ -72,8 +78,144 @@ struct slot_is_filled { template __device__ bool operator()(S const& s) { - return thrust::get<0>(s) != empty_key_sentinel; + return not cuco::detail::bitwise_compare(thrust::get<0>(s), empty_key_sentinel_); + } +}; + +/** + * @brief A strong type wrapper. + * + * @tparam T Type of the mapped values + */ +template +struct strong_type { + /** + * @brief Constructs a strong type. 
+ * + * @param v Value to be wrapped as a strong type + */ + __host__ __device__ explicit constexpr strong_type(T v) : value{v} {} + + /** + * @brief Implicit conversion operator to the underlying value. + * + * @return Underlying value + */ + __host__ __device__ constexpr operator T() const noexcept { return value; } + + T value; ///< Underlying value +}; + +/** + * @brief Converts a given hash value into a valid (positive) size type. + * + * @tparam SizeType The target type + * @tparam HashType The input type + * + * @return Converted hash value + */ +template +__host__ __device__ constexpr SizeType sanitize_hash(HashType hash) noexcept +{ + if constexpr (cuda::std::is_signed_v) { + return cuda::std::abs(static_cast(hash)); + } else { + return static_cast(hash); } +} + +/** + * @brief Gives value to use as alignment for a pair type that is at least the + * size of the sum of the size of the first type and second type, or 16, + * whichever is smaller. + */ +template +constexpr std::size_t pair_alignment() +{ + return std::min(std::size_t{16}, cuda::std::bit_ceil(sizeof(First) + sizeof(Second))); +} + +/** + * @brief Denotes the equivalent packed type based on the size of the object. + * + * @tparam N The size of the object + */ +template +struct packed { + using type = void; ///< `void` type by default +}; + +/** + * @brief Denotes the packed type when the size of the object is 8. + */ +template <> +struct packed { + using type = uint64_t; ///< Packed type as `uint64_t` if the size of the object is 8 +}; + +/** + * @brief Denotes the packed type when the size of the object is 4. + */ +template <> +struct packed { + using type = uint32_t; ///< Packed type as `uint32_t` if the size of the object is 4 +}; + +template +using packed_t = typename packed::type; + +/** + * @brief Indicates if a pair type can be packed. 
+ * + * When the size of the key,value pair being inserted into the hash table is + * equal in size to a type where atomicCAS is natively supported, it is more + * efficient to "pack" the pair and insert it with a single atomicCAS. + * + * Pair types whose key and value have the same object representation may be + * packed. Also, the `Pair` must not contain any padding bits otherwise + * accessing the packed value would be undefined. + * + * @tparam Pair The pair type that will be packed + * + * @return true If the pair type can be packed + * @return false If the pair type cannot be packed + */ +template +constexpr bool is_packable() +{ + return not std::is_void>::value and std::has_unique_object_representations_v; +} + +/** + * @brief Allows viewing a pair in a packed representation. + * + * Used as an optimization for inserting when a pair can be inserted with a + * single atomicCAS + */ +template +union pair_converter { + using packed_type = packed_t; ///< The packed pair type + packed_type packed; ///< The pair in the packed representation + Pair pair; ///< The pair in the pair representation + + /** + * @brief Constructs a pair converter by copying from `p` + * + * @tparam T Type that is convertible to `Pair` + * + * @param p The pair to copy from + */ + template + __device__ pair_converter(T&& p) : pair{p} + { + } + + /** + * @brief Constructs a pair converter by copying from `p` + * + * @param p The packed data to copy from + */ + __device__ pair_converter(packed_type p) : packed{p} {} }; } // namespace detail diff --git a/include/cuco/detail/utils.hpp b/include/cuco/detail/utils.hpp index 40697ff5c..86c045e3b 100644 --- a/include/cuco/detail/utils.hpp +++ b/include/cuco/detail/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,38 +15,60 @@ #pragma once +#include +#include + +#include +#include + namespace cuco { namespace detail { +template +constexpr inline index_type distance(Iterator begin, Iterator end) +{ + using category = typename std::iterator_traits::iterator_category; + static_assert(std::is_base_of_v, + "Input iterator should be a random access iterator."); + // `int64_t` instead of arch-dependant `long int` + return static_cast(std::distance(begin, end)); +} + /** - * @brief Compute the number of bits of a simple type. + * @brief C++17 constexpr backport of `std::lower_bound`. * - * @tparam T The type we want to infer its size in bits + * @tparam ForwardIt Type of input iterator + * @tparam T Type of `value` * - * @return Size of type T in bits + * @param first Iterator defining the start of the range to examine + * @param last Iterator defining the start of the range to examine + * @param value Value to compare the elements to + * + * @return Iterator pointing to the first element in the range [first, last) that does not satisfy + * element < value */ -template -static constexpr std::size_t type_bits() noexcept +template +constexpr ForwardIt lower_bound(ForwardIt first, ForwardIt last, const T& value) { - return sizeof(T) * CHAR_BIT; -} + using diff_type = typename std::iterator_traits::difference_type; -// safe division -#ifndef SDIV -#define SDIV(x, y) (((x) + (y)-1) / (y)) -#endif + ForwardIt it{}; + diff_type count = std::distance(first, last); + diff_type step{}; -template -auto get_grid_size(Kernel kernel, std::size_t block_size, std::size_t dynamic_smem_bytes = 0) -{ - int grid_size{-1}; - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&grid_size, kernel, block_size, dynamic_smem_bytes); - int dev_id{-1}; - cudaGetDevice(&dev_id); - int num_sms{-1}; - cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id); - grid_size *= num_sms; - return grid_size; + while (count > 0) { + it = first; + step = count / 2; + std::advance(it, step); + + if 
(static_cast(*it) < value) { + first = ++it; + count -= step + 1; + } else + count = step; + } + + return first; } } // namespace detail diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index a75512d3c..998ff3647 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include #include @@ -43,8 +43,8 @@ namespace cuco { * concurrent insert and find) from threads in device code. * * Current limitations: - * - Requires keys that are Arithmetic - * - Does not support erasing keys + * - Requires keys and values that where `cuco::is_bitwise_comparable_v` is true + * - Comparisons against the "sentinel" values will always be done with bitwise comparisons. * - Capacity does not shrink automatically * - Requires the user to specify sentinel values for both key and mapped value * to indicate empty slots @@ -66,8 +66,8 @@ namespace cuco { * // within the second insert. * * dynamic_map m{100'000, - * sentinel::empty_key{empty_key_sentinel}, - * sentinel::empty_value{empty_value_sentinel}}; + * empty_key{empty_key_sentinel}, + * empty_value{empty_value_sentinel}}; * * // Create a sequence of pairs {{0,0}, {1,1}, ... 
{i,i}} * thrust::device_vector> pairs_0(50'000); @@ -101,22 +101,25 @@ class dynamic_map { static_assert(std::is_arithmetic::value, "Unsupported, non-arithmetic key type."); public: - using value_type = cuco::pair_type; ///< Type of key/value pairs + using value_type = cuco::pair; ///< Type of key/value pairs using key_type = Key; ///< Key type using mapped_type = Value; ///< Type of mapped values - using atomic_ctr_type = cuda::atomic; ///< Type of atomic counters - using view_type = typename static_map::device_view; ///< Device view type - using mutable_view_type = typename static_map::device_mutable_view; - ///< Device mutable view type + using atomic_ctr_type = cuda::atomic; ///< Atomic counter type + using view_type = + typename static_map::device_view; ///< Type for submap device view + using mutable_view_type = + typename static_map::device_mutable_view; ///< Type for submap mutable + ///< device view dynamic_map(dynamic_map const&) = delete; dynamic_map(dynamic_map&&) = delete; + dynamic_map& operator=(dynamic_map const&) = delete; dynamic_map& operator=(dynamic_map&&) = delete; /** - * @brief Construct a dynamically-sized map with the specified initial capacity, growth factor and - * sentinel values. + * @brief Constructs a dynamically-sized map with the specified initial capacity, growth factor + * and sentinel values. * * The capacity of the map will automatically increase as the user adds key/value pairs using * `insert`. 
@@ -133,17 +136,50 @@ class dynamic_map { * @param empty_key_sentinel The reserved key value for empty slots * @param empty_value_sentinel The reserved mapped value for empty slots * @param alloc Allocator used to allocate submap device storage + * @param stream Stream used for executing the kernels */ dynamic_map(std::size_t initial_capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - Allocator const& alloc = Allocator{}); + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + Allocator const& alloc = Allocator{}, + cudaStream_t stream = nullptr); /** - * @brief Destroy the map and frees its contents + * @brief Constructs a dynamically-sized map with erase capability. + * + * The capacity of the map will automatically increase as the user adds key/value pairs using + * `insert`. + * + * Capacity increases by a factor of growth_factor each time the size of the map exceeds a + * threshold occupancy. The performance of `find` and `contains` decreases somewhat each time the + * map's capacity grows. + * + * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and + * undefined behavior results from attempting to insert any key/value pair + * that contains either. 
* + * @param initial_capacity The initial number of slots in the map + * @param empty_key_sentinel The reserved key value for empty slots + * @param empty_value_sentinel The reserved mapped value for empty slots + * @param erased_key_sentinel The reserved key value for erased slots + * @param alloc Allocator used to allocate submap device storage + * @param stream Stream used for executing the kernels + * + * @throw std::runtime error if the empty key sentinel and erased key sentinel + * are the same value */ - ~dynamic_map(); + dynamic_map(std::size_t initial_capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel, + Allocator const& alloc = Allocator{}, + cudaStream_t stream = nullptr); + + /** + * @brief Destroys the map and frees its contents + * + */ + ~dynamic_map() {} /** * @brief Grows the capacity of the map so there is enough space for `n` key/value pairs. @@ -151,8 +187,9 @@ class dynamic_map { * If there is already enough space for `n` key/value pairs, the capacity remains the same. * * @param n The number of key value pairs for which there must be space + * @param stream Stream used for executing the kernels */ - void reserve(std::size_t n); + void reserve(std::size_t n, cudaStream_t stream = nullptr); /** * @brief Inserts all key/value pairs in the range `[first, last)`. 
@@ -168,11 +205,55 @@ class dynamic_map { * @param last End of the sequence of key/value pairs * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality + * @param stream Stream used for executing the kernels */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> - void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); + void insert(InputIt first, + InputIt last, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = nullptr); + + /** + * @brief Erases keys in the range `[first, last)`. + * + * For each key `k` in `[first, last)`, if `contains(k) == true), removes `k` and it's + * associated value from the map. Else, no effect. + * + * Side-effects: + * - `contains(k) == false` + * - `find(k) == end()` + * - `insert({k,v}) == true` + * - `get_size()` is reduced by the total number of erased keys + * + * This function synchronizes `stream`. + * + * Keep in mind that `erase` does not cause the map to shrink its memory allocation. 
+ * + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `value_type` + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + * @param stream Stream used for executing the kernels + * + * @throw std::runtime_error if a unique erased key sentinel value was not + * provided at construction + */ + template , + typename KeyEqual = thrust::equal_to> + void erase(InputIt first, + InputIt last, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = nullptr); /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. @@ -186,21 +267,24 @@ class dynamic_map { * convertible to the map's `mapped_type` * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of values retrieved for each key * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality + * @param stream Stream used for executing the kernels */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void find(InputIt first, InputIt last, OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}); + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = nullptr); /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. 
@@ -213,21 +297,24 @@ class dynamic_map { * convertible to the map's `mapped_type` * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of booleans for the presence of each key * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality + * @param stream Stream used for executing the kernels */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void contains(InputIt first, InputIt last, OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}); + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = nullptr); /** * @brief Gets the current number of elements in the map @@ -253,18 +340,22 @@ class dynamic_map { private: key_type empty_key_sentinel_{}; ///< Key value that represents an empty slot mapped_type empty_value_sentinel_{}; ///< Initial value of empty slot - std::size_t size_{}; ///< Number of keys in the map - std::size_t capacity_{}; ///< Maximum number of keys that can be inserted - float max_load_factor_{}; ///< Max load factor before capacity growth + key_type erased_key_sentinel_{}; ///< Key value that represents an erased slot + + // TODO: initialize this + std::size_t size_{}; ///< Number of keys in the map + std::size_t capacity_{}; ///< Maximum number of keys that can be inserted + float max_load_factor_{}; ///< Max load factor before capacity growth std::vector>> submaps_; ///< vector of pointers to each submap thrust::device_vector submap_views_; ///< vector of device views for each submap thrust::device_vector - submap_mutable_views_; ///< vector of mutable device views for each submap - std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert - atomic_ctr_type* num_successes_; ///< number of 
successfully inserted keys on insert - Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage + submap_mutable_views_; ///< vector of mutable device views for each submap + std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert + thrust::device_vector + submap_num_successes_; ///< Number of successfully erased keys for each submap + Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage }; } // namespace cuco diff --git a/include/cuco/extent.cuh b/include/cuco/extent.cuh new file mode 100644 index 000000000..50e7ae4aa --- /dev/null +++ b/include/cuco/extent.cuh @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cuco { +namespace experimental { +static constexpr std::size_t dynamic_extent = static_cast(-1); + +/** + * @brief Static extent class. + * + * @tparam SizeType Size type + * @tparam N Extent + */ +template +struct extent { + using value_type = SizeType; ///< Extent value type + + constexpr extent() = default; + + /// Constructs from `SizeType` + __host__ __device__ constexpr extent(SizeType) noexcept {} + + /** + * @brief Conversion to value_type. + * + * @return Extent size + */ + __host__ __device__ constexpr operator value_type() const noexcept { return N; } +}; + +/** + * @brief Dynamic extent class. 
+ * + * @tparam SizeType Size type + */ +template +struct extent { + using value_type = SizeType; ///< Extent value type + + /** + * @brief Constructs extent from a given `size`. + * + * @param size The extent size + */ + __host__ __device__ constexpr extent(SizeType size) noexcept : value_{size} {} + + /** + * @brief Conversion to value_type. + * + * @return Extent size + */ + __host__ __device__ constexpr operator value_type() const noexcept { return value_; } + + private: + value_type value_; ///< Extent value +}; + +/** + * @brief Window extent strong type. + * + * @note This type is used internally and can only be constructed using the `make_window_extent` + * factory method. + * + * @tparam SizeType Size type + * @tparam N Extent + * + */ +template +struct window_extent; + +/** + * @brief Computes a valid window extent/capacity for a given container type. + * + * @note The actual capacity of a container (map/set) should be exclusively determined by the return + * value of this utility since the output depends on the requested low-bound size, the probing + * scheme, and the storage. This utility is used internally during container constructions while for + * container ref constructions, it would be users' responsibility to use this function to determine + * the capacity ctor argument for the container. + * + * @tparam Container Container type to compute the extent for + * @tparam SizeType Size type + * @tparam N Extent + * + * @param ext The input extent + * + * @throw If the input extent is invalid + * + * @return Resulting valid `window extent` + */ +template +[[nodiscard]] auto constexpr make_window_extent(extent ext); + +/** + * @brief Computes a valid capacity for a given container type. + * + * @note The actual capacity of a container (map/set) should be exclusively determined by the return + * value of this utility since the output depends on the requested low-bound size, the probing + * scheme, and the storage. 
This utility is used internally during container constructions while for + * container ref constructions, it would be users' responsibility to use this function to determine + * the capacity ctor argument for the container. + * + * @tparam Container Container type to compute the extent for + * @tparam SizeType Size type + * + * @param size The input size + * + * @throw If the input size is invalid + * + * @return Resulting valid extent + */ +template +[[nodiscard]] auto constexpr make_window_extent(SizeType size); + +/** + * @brief Computes valid window extent based on given parameters. + * + * @note The actual capacity of a container (map/set) should be exclusively determined by the return + * value of this utility since the output depends on the requested low-bound size, the probing + * scheme, and the storage. This utility is used internally during container constructions while for + * container ref constructions, it would be users' responsibility to use this function to determine + * the input size of the ref. + * + * @tparam CGSize Number of elements handled per CG + * @tparam WindowSize Number of elements handled per Window + * @tparam SizeType Size type + * @tparam N Extent + * + * @param ext The input extent + * + * @throw If the input extent is invalid + * + * @return Resulting valid extent + */ +template +[[nodiscard]] auto constexpr make_window_extent(extent ext); + +/** + * @brief Computes valid window extent/capacity based on given parameters. + * + * @note The actual capacity of a container (map/set) should be exclusively determined by the return + * value of this utility since the output depends on the requested low-bound size, the probing + * scheme, and the storage. This utility is used internally during container constructions while for + * container ref constructions, it would be users' responsibility to use this function to determine + * the capacity ctor argument for the container. 
+ * + * @tparam CGSize Number of elements handled per CG + * @tparam WindowSize Number of elements handled per Window + * @tparam SizeType Size type + * + * @param size The input size + * + * @throw If the input size is invalid + * + * @return Resulting valid extent + */ +template +[[nodiscard]] auto constexpr make_window_extent(SizeType size); + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/hash_functions.cuh b/include/cuco/hash_functions.cuh new file mode 100644 index 000000000..000f46fef --- /dev/null +++ b/include/cuco/hash_functions.cuh @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cuco { + +/** + * @brief The 32-bit integer finalizer function of `MurmurHash3` to hash the given argument on host + * and device. + * + * @throw Key type must be 4 bytes in size + * + * @tparam Key The type of the values to hash + */ +template +using murmurhash3_fmix_32 = detail::MurmurHash3_fmix32; + +/** + * @brief The 64-bit integer finalizer function of `MurmurHash3` to hash the given argument on host + * and device. + * + * @throw Key type must be 8 bytes in size + * + * @tparam Key The type of the values to hash + */ +template +using murmurhash3_fmix_64 = detail::MurmurHash3_fmix64; + +/** + * @brief A 32-bit `MurmurHash3` hash function to hash the given argument on host and device. 
+ * + * @tparam Key The type of the values to hash + */ +template +using murmurhash3_32 = detail::MurmurHash3_32; + +/** + * @brief A 32-bit `XXH32` hash function to hash the given argument on host and device. + * + * @tparam Key The type of the values to hash + */ +template +using xxhash_32 = detail::XXHash_32; + +/** + * @brief A 64-bit `XXH64` hash function to hash the given argument on host and device. + * + * @tparam Key The type of the values to hash + */ +template +using xxhash_64 = detail::XXHash_64; + +/** + * @brief Default hash function. + * + * @tparam Key The type of the values to hash + */ +template +using default_hash_function = xxhash_32; + +} // namespace cuco diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp new file mode 100644 index 000000000..77cf2c133 --- /dev/null +++ b/include/cuco/operator.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +namespace cuco { +namespace experimental { +inline namespace op { +// TODO enum class of int32_t instead of struct +// https://github.com/NVIDIA/cuCollections/issues/239 +/** + * @brief `insert` operator tag + */ +struct insert_tag { +} inline constexpr insert; + +/** + * @brief `insert_and_find` operator tag + */ +struct insert_and_find_tag { +} inline constexpr insert_and_find; + +/** + * @brief `insert_or_assign` operator tag + */ +struct insert_or_assign_tag { +} inline constexpr insert_or_assign; + +/** + * @brief `contains` operator tag + */ +struct contains_tag { +} inline constexpr contains; + +/** + * @brief `find` operator tag + */ +struct find_tag { +} inline constexpr find; + +} // namespace op +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/pair.cuh b/include/cuco/pair.cuh new file mode 100644 index 000000000..0a804cc04 --- /dev/null +++ b/include/cuco/pair.cuh @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include +#include + +namespace cuco { + +/** + * @brief Custom pair type + * + * @note This is necessary because `thrust::pair` is under aligned. 
+ * + * @tparam First Type of the first value in the pair + * @tparam Second Type of the second value in the pair + */ +template +struct alignas(detail::pair_alignment()) pair { + using first_type = First; ///< Type of the first value in the pair + using second_type = Second; ///< Type of the second value in the pair + + pair() = default; + ~pair() = default; + pair(pair const&) = default; ///< Copy constructor + pair(pair&&) = default; ///< Move constructor + + /** + * @brief Replaces the contents of the pair with another pair. + * + * @return Reference of the current pair object + */ + pair& operator=(pair const&) = default; + + /** + * @brief Replaces the contents of the pair with another pair. + * + * @return Reference of the current pair object + */ + pair& operator=(pair&&) = default; + + /** + * @brief Constructs a pair from objects `f` and `s`. + * + * @param f The object to copy into `first` + * @param s The object to copy into `second` + */ + __host__ __device__ constexpr pair(First const& f, Second const& s); + + /** + * @brief Constructs a pair by copying from the given pair `p`. + * + * @tparam F Type of the first value of `p` + * @tparam S Type of the second value of `p` + * + * @param p The pair to copy from + */ + template + __host__ __device__ constexpr pair(pair const& p); + + /** + * @brief Constructs a pair from the given std::pair-like `p`. + * + * @tparam T Type of the pair to copy from + * + * @param p The input pair to copy from + */ + template ::value>* = nullptr> + __host__ __device__ constexpr pair(T const& p) + : pair{std::get<0>(thrust::raw_reference_cast(p)), std::get<1>(thrust::raw_reference_cast(p))} + { + } + + /** + * @brief Constructs a pair from the given thrust::pair-like `p`. 
+ * + * @tparam T Type of the pair to copy from + * + * @param p The input pair to copy from + */ + template ::value>* = nullptr> + __host__ __device__ constexpr pair(T const& p) + : pair{thrust::get<0>(thrust::raw_reference_cast(p)), + thrust::get<1>(thrust::raw_reference_cast(p))} + { + } + + First first; ///< The first value in the pair + Second second; ///< The second value in the pair +}; + +/** + * @brief Creates a pair with the given first and second elements + * + * @tparam F Type of first element + * @tparam S Type of second element + * + * @param f First element + * @param s Second element + * + * @return A pair with first element `f` and second element `s`. + */ +template +__host__ __device__ constexpr pair, std::decay_t> make_pair(F&& f, + S&& s) noexcept; + +/** + * @brief Tests if both elements of lhs and rhs are equal + * + * @tparam T1 Type of the first element of the left-hand side pair + * @tparam T2 Type of the second element of the left-hand side pair + * @tparam U1 Type of the first element of the right-hand side pair + * @tparam U2 Type of the second element of the right-hand side pair + * + * @param lhs Left-hand side pair + * @param rhs Right-hand side pair + * + * @return True if two pairs are equal. 
False otherwise + */ +template +__host__ __device__ constexpr bool operator==(cuco::pair const& lhs, + cuco::pair const& rhs) noexcept; + +} // namespace cuco + +#include diff --git a/include/cuco/probe_sequences.cuh b/include/cuco/probe_sequences.cuh index 071b0921e..7921b6629 100644 --- a/include/cuco/probe_sequences.cuh +++ b/include/cuco/probe_sequences.cuh @@ -60,7 +60,7 @@ class linear_probing : public detail::probe_sequence_base { * @tparam Hash1 Unary callable type * @tparam Hash2 Unary callable type */ -template +template class double_hashing : public detail::probe_sequence_base { public: using probe_sequence_base_type = diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh new file mode 100644 index 000000000..039433cef --- /dev/null +++ b/include/cuco/probing_scheme.cuh @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +/** + * @brief Public linear probing scheme class. + * + * @note Linear probing is efficient when few collisions are present, e.g., low occupancy or low + * multiplicity. + * + * @note `Hash` should be callable object type. 
+ * + * @tparam CGSize Size of CUDA Cooperative Groups + * @tparam Hash Unary callable type + */ +template +class linear_probing : private detail::probing_scheme_base { + public: + using probing_scheme_base_type = + detail::probing_scheme_base; ///< The base probe scheme type + using probing_scheme_base_type::cg_size; + + /** + *@brief Constructs linear probing scheme with the hasher callable. + * + * @param hash Hasher + */ + __host__ __device__ constexpr linear_probing(Hash const& hash = {}); + + /** + * @brief Operator to return a probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()(ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + /** + * @brief Operator to return a CG-based probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param g the Cooperative Group to generate probing iterator + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + private: + Hash hash_; +}; + +/** + * @brief Public double hashing scheme class. + * + * @note Default probing scheme for cuco data structures. It shows superior performance over linear + * probing especially when dealing with high multiplicity and/or high occupancy use cases. + * + * @note `Hash1` and `Hash2` should be callable object types. + * + * @note `Hash2` needs to be able to construct from an integer value to avoid secondary clustering. 
+ * + * @tparam CGSize Size of CUDA Cooperative Groups + * @tparam Hash1 Unary callable type + * @tparam Hash2 Unary callable type + */ +template +class double_hashing : private detail::probing_scheme_base { + public: + using probing_scheme_base_type = + detail::probing_scheme_base; ///< The base probe scheme type + using probing_scheme_base_type::cg_size; + + /** + *@brief Constructs double hashing probing scheme with the two hasher callables. + * + * @param hash1 First hasher + * @param hash2 Second hasher + */ + __host__ __device__ constexpr double_hashing(Hash1 const& hash1 = {}, Hash2 const& hash2 = {1}); + + /** + * @brief Operator to return a probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()(ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + /** + * @brief Operator to return a CG-based probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param g the Cooperative Group to generate probing iterator + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + private: + Hash1 hash1_; + Hash2 hash2_; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/sentinel.cuh b/include/cuco/sentinel.cuh index 58317d179..a440e5b2c 100644 --- a/include/cuco/sentinel.cuh +++ b/include/cuco/sentinel.cuh @@ -16,22 +16,24 @@ #pragma once +#include + namespace cuco { -namespace sentinel { +inline namespace sentinel 
{ + /** * @brief A strong type wrapper used to denote the empty key sentinel. * * @tparam T Type of the key values */ template -struct empty_key { +struct empty_key : public cuco::detail::strong_type { /** * @brief Constructs an empty key sentinel with the given `v`. * * @param v The empty key sentinel value */ - __host__ __device__ explicit constexpr empty_key(T v) : value{v} {} - T value; ///< Empty key sentinel + __host__ __device__ explicit constexpr empty_key(T v) : cuco::detail::strong_type(v) {} }; /** @@ -40,14 +42,13 @@ struct empty_key { * @tparam T Type of the mapped values */ template -struct empty_value { +struct empty_value : public cuco::detail::strong_type { /** * @brief Constructs an empty value sentinel with the given `v`. * * @param v The empty value sentinel value */ - __host__ __device__ explicit constexpr empty_value(T v) : value{v} {} - T value; ///< Empty value sentinel + __host__ __device__ explicit constexpr empty_value(T v) : cuco::detail::strong_type(v) {} }; /** @@ -56,14 +57,13 @@ struct empty_value { * @tparam T Type of the key values */ template -struct erased_key { +struct erased_key : public cuco::detail::strong_type { /** * @brief Constructs an erased key sentinel with the given `v`. * * @param v The erased key sentinel value */ - __host__ __device__ explicit constexpr erased_key(T v) : value{v} {} - T value; ///< Erased key sentinel + __host__ __device__ explicit constexpr erased_key(T v) : cuco::detail::strong_type(v) {} }; } // namespace sentinel diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 361e97d37..825f88ab7 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,14 +16,16 @@ #pragma once -#include +#include #include -#include -#include -#include +#include #include +#include +#include #include -#include +#include +#include +#include #include @@ -38,6 +40,512 @@ #include namespace cuco { +namespace experimental { +/** + * @brief A GPU-accelerated, unordered, associative container of key-value pairs with unique keys. + * + * The `static_map` supports two types of operations: + * - Host-side "bulk" operations + * - Device-side "singular" operations + * + * The host-side bulk operations include `insert`, `contains`, etc. These APIs should be used when + * there are a large number of keys to modify or lookup. For example, given a range of keys + * specified by device-accessible iterators, the bulk `insert` function will insert all keys into + * the map. + * + * The singular device-side operations allow individual threads (or cooperative groups) to perform + * independent modify or lookup operations from device code. These operations are accessed through + * non-owning, trivially copyable reference types (or "ref"). User can combine any arbitrary + * operators (see options in `include/cuco/operator.hpp`) when creating the ref. Concurrent modify + * and lookup will be supported if both kinds of operators are specified during the ref + * construction. + * + * @note Allows constant time concurrent modify or lookup operations from threads in device code. + * @note cuCollections data structures always place the slot keys on the left-hand side when + * invoking the key comparison predicate, i.e., `pred(slot_key, query_key)`. Order-sensitive + * `KeyEqual` should be used with caution. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. 
+ * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the size of the given payload type is larger than 8 bytes + * @throw If the size of the given slot type is larger than 16 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the given mapped type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` + * @tparam T Type of the mapped values + * @tparam Extent Data structure size type + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for choices) + * @tparam Allocator Type of allocator used for device storage + * @tparam Storage Slot window storage type + */ + +template , + cuda::thread_scope Scope = cuda::thread_scope_device, + class KeyEqual = thrust::equal_to, + class ProbingScheme = + cuco::experimental::double_hashing<4, // CG size + cuco::default_hash_function>, + class Allocator = cuco::cuda_allocator>, + class Storage = cuco::experimental::storage<1>> +class static_map { + static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + + static_assert(sizeof(T) <= 8, "Container does not support payload types larger than 8 bytes."); + + static_assert(cuco::is_bitwise_comparable_v, + "Mapped type must have unique object representations or have been explicitly " + "declared as safe for bitwise comparison via specialization of " + "cuco::is_bitwise_comparable_v."); + + using impl_type = detail::open_addressing_impl, + Extent, + Scope, + KeyEqual, + ProbingScheme, + Allocator, + Storage>; + + public: + 
static constexpr auto cg_size = impl_type::cg_size; ///< CG size used for probing + static constexpr auto window_size = impl_type::window_size; ///< Window size used for probing + static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope + + using key_type = typename impl_type::key_type; ///< Key type + using value_type = typename impl_type::value_type; ///< Key-value pair type + using extent_type = typename impl_type::extent_type; ///< Extent type + using size_type = typename impl_type::size_type; ///< Size type + using key_equal = typename impl_type::key_equal; ///< Key equality comparator type + using allocator_type = typename impl_type::allocator_type; ///< Allocator type + /// Non-owning window storage ref type + using storage_ref_type = typename impl_type::storage_ref_type; + using probing_scheme_type = typename impl_type::probing_scheme_type; ///< Probing scheme type + + using mapped_type = T; ///< Payload type + template + using ref_type = + cuco::experimental::static_map_ref; ///< Non-owning container ref type + + static_map(static_map const&) = delete; + static_map& operator=(static_map const&) = delete; + + static_map(static_map&&) = default; ///< Move constructor + + /** + * @brief Replaces the contents of the container with another container. + * + * @return Reference of the current map object + */ + static_map& operator=(static_map&&) = default; + ~static_map() = default; + + /** + * @brief Constructs a statically-sized map with the specified initial capacity, sentinel values + * and CUDA stream. + * + * The actual map capacity depends on the given `capacity`, the probing scheme, CG size, and the + * window size and it is computed via the `make_window_extent` factory. Insert operations will not + * automatically grow the map. Attempting to insert more unique keys than the capacity of the map + * results in undefined behavior. 
+ * + * @note Any `*_sentinel`s are reserved and behavior is undefined when attempting to insert + * this sentinel value. + * @note If a non-default CUDA stream is provided, the caller is responsible for synchronizing the + * stream before the object is first used. + * + * @param capacity The requested lower-bound map size + * @param empty_key_sentinel The reserved key value for empty slots + * @param empty_value_sentinel The reserved mapped value for empty slots + * @param pred Key equality binary predicate + * @param probing_scheme Probing scheme + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the map + */ + constexpr static_map(Extent capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + KeyEqual const& pred = {}, + ProbingScheme const& probing_scheme = {}, + Allocator const& alloc = {}, + cuda_stream_ref stream = {}); + + /** + * @brief Erases all elements from the container. After this call, `size()` returns zero. + * Invalidates any references, pointers, or iterators referring to contained elements. + * + * @param stream CUDA stream this operation is executed in + */ + void clear(cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Asynchronously erases all elements from the container. After this call, `size()` returns + * zero. Invalidates any references, pointers, or iterators referring to contained elements. + * + * @param stream CUDA stream this operation is executed in + */ + void clear_async(cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Inserts all keys in the range `[first, last)` and returns the number of successful + * insertions. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `insert_async`. 
+ * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_map::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + * + * @return Number of successful insertions + */ + template + size_type insert(InputIt first, InputIt last, cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously inserts all keys in the range `[first, last)`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_map::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns + * true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * @note This function synchronizes the given stream and returns the number of successful + * insertions. For asynchronous execution use `insert_if_async`. 
+ * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + * + * @return Number of successful insertions + */ + template + size_type insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding + * stencil returns true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. 
+ * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + */ + template + void insert_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda_stream_ref stream = {}) noexcept; + + /** + * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` + * already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`. + * If the key does not exist, inserts the pair as if by insert. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `insert_or_assign_async`. + * @note If multiple pairs in `[first, last)` compare equal, it is unspecified which pair is + * inserted or assigned. 
+ * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_map::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + + /** + * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` + * already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`. + * If the key does not exist, inserts the pair as if by insert. + * + * @note If multiple pairs in `[first, last)` compare equal, it is unspecified which pair is + * inserted or assigned. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_map::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_async`. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains(InputIt first, + InputIt last, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in + * the map. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map if + * `pred` of the corresponding stencil returns true. + * + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present in the map. If `pred( *(stencil + i) )` is + * false, stores false to `(output_begin + i)`. + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_if_async`. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in + * the map if `pred` of the corresponding stencil returns true. + * + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present in the map. If `pred( *(stencil + i) )` is + * false, stores false to `(output_begin + i)`. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief For all keys in the range `[first, last)`, finds a payload with its key equivalent to + * the query key. + * + * @note This function synchronizes the given stream. For asynchronous execution use `find_async`. + * @note If the key `*(first + i)` has a matched `element` in the map, copies the payload of + * `element` to + * `(output_begin + i)`. Else, copies the empty value sentinel. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of payloads retrieved for each key + * @param stream Stream used for executing the kernels + */ + template + void find(InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream = {}) const; + + /** + * @brief For all keys in the range `[first, last)`, asynchronously finds a payload with its key + * equivalent to the query key. + * + * @note If the key `*(first + i)` has a matched `element` in the map, copies the payload of + * `element` to + * `(output_begin + i)`. Else, copies the empty value sentinel. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of payloads retrieved for each key + * @param stream Stream used for executing the kernels + */ + template + void find_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Retrieves all of the keys and their associated values. + * + * @note This API synchronizes the given stream. + * @note The order in which keys are returned is implementation defined and not guaranteed to be + * consistent between subsequent calls to `retrieve_all`. + * @note Behavior is undefined if the range beginning at `keys_out` or `values_out` is smaller + * than the return value of `size()`. + * + * @tparam KeyOut Device accessible random access output iterator whose `value_type` is + * convertible from `key_type`. 
 + * @tparam ValueOut Device accessible random access output iterator whose `value_type` is + * convertible from `mapped_type`. + * + * @param keys_out Beginning output iterator for keys + * @param values_out Beginning output iterator for associated values + * @param stream CUDA stream used for this operation + * + * @return Pair of iterators indicating the last elements in the output + */ + template + std::pair retrieve_all(KeyOut keys_out, + ValueOut values_out, + cuda_stream_ref stream = {}) const; + + /** + * @brief Gets the number of elements in the container. + * + * @note This function synchronizes the given stream. + * + * @param stream CUDA stream used to get the number of inserted elements + * @return The number of elements in the container + */ + [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief Gets the maximum number of elements the hash map can hold. + * + * @return The maximum number of elements the hash map can hold + */ + [[nodiscard]] constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty value slot. + * + * @return The sentinel value used to represent an empty value slot + */ + [[nodiscard]] constexpr mapped_type empty_value_sentinel() const noexcept; + + /** + * @brief Get device ref with operators. + * + * @tparam Operators Set of `cuco::op` to be provided by the ref + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return Device ref of the current `static_map` object + */ + template + [[nodiscard]] auto ref(Operators... 
ops) const noexcept; + + private: + std::unique_ptr impl_; ///< Static map implementation + mapped_type empty_value_sentinel_; ///< Sentinel value that indicates an empty payload +}; +} // namespace experimental template class dynamic_map; @@ -53,7 +561,6 @@ class dynamic_map; * Current limitations: * - Requires keys and values that where `cuco::is_bitwise_comparable_v` is true * - Comparisons against the "sentinel" values will always be done with bitwise comparisons. - * - Does not support erasing keys * - Capacity is fixed and will not grow automatically * - Requires the user to specify sentinel values for both key and mapped value to indicate empty * slots @@ -137,14 +644,14 @@ class static_map { friend class dynamic_map; ///< Dynamic map as friend class public: - using value_type = cuco::pair_type; ///< Type of key/value pairs + using value_type = cuco::pair; ///< Type of key/value pairs using key_type = Key; ///< Key type using mapped_type = Value; ///< Type of mapped values using atomic_key_type = cuda::atomic; ///< Type of atomic keys using atomic_mapped_type = cuda::atomic; ///< Type of atomic mapped values using pair_atomic_type = - cuco::pair_type; ///< Pair type of atomic key and atomic mapped value + cuco::pair; ///< Pair type of atomic key and atomic mapped value using slot_type = pair_atomic_type; ///< Type of hash map slots using atomic_ctr_type = cuda::atomic; ///< Atomic counter type using allocator_type = Allocator; ///< Allocator type @@ -200,8 +707,8 @@ class static_map { * @param stream Stream used for executing the kernels */ static_map(std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, Allocator const& alloc = Allocator{}, cudaStream_t stream = 0); @@ -220,9 +727,9 @@ class static_map { * @param stream Stream used for executing the kernels */ static_map(std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - 
sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel, Allocator const& alloc = Allocator{}, cudaStream_t stream = 0); @@ -253,7 +760,7 @@ class static_map { * @param stream Stream used for executing the kernels */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void insert(InputIt first, InputIt last, @@ -287,7 +794,7 @@ class static_map { template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void insert_if(InputIt first, InputIt last, @@ -325,7 +832,7 @@ class static_map { * provided at construction */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void erase(InputIt first, InputIt last, @@ -354,7 +861,7 @@ class static_map { */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void find(InputIt first, InputIt last, @@ -384,7 +891,7 @@ class static_map { template std::pair retrieve_all(KeyOut keys_out, ValueOut values_out, - cudaStream_t stream = 0); + cudaStream_t stream = 0) const; /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. 
@@ -409,7 +916,7 @@ class static_map { */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void contains(InputIt first, InputIt last, @@ -437,8 +944,8 @@ class static_map { __host__ __device__ device_view_base(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : slots_{slots}, capacity_{capacity}, empty_key_sentinel_{empty_key_sentinel.value}, @@ -449,9 +956,9 @@ class static_map { __host__ __device__ device_view_base(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel) noexcept : slots_{slots}, capacity_{capacity}, empty_key_sentinel_{empty_key_sentinel.value}, @@ -770,11 +1277,10 @@ class static_map { * @param empty_value_sentinel The reserved value for mapped values to * represent empty slots */ - __host__ __device__ - device_mutable_view(pair_atomic_type* slots, - std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + __host__ __device__ device_mutable_view(pair_atomic_type* slots, + std::size_t capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : device_view_base{slots, capacity, empty_key_sentinel, empty_value_sentinel} { } @@ -791,9 +1297,9 @@ class static_map { */ __host__ __device__ device_mutable_view(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel) noexcept : 
device_view_base{ slots, capacity, empty_key_sentinel, empty_value_sentinel, erased_key_sentinel} { @@ -880,8 +1386,8 @@ class static_map { CG const& g, pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept { device_view_base::initialize_slots( g, slots, capacity, empty_key_sentinel.value, empty_value_sentinel.value); @@ -889,7 +1395,7 @@ class static_map { capacity, empty_key_sentinel, empty_value_sentinel, - sentinel::erased_key{empty_key_sentinel.value}}; + erased_key{empty_key_sentinel.value}}; } /** @@ -911,9 +1417,9 @@ class static_map { CG const& g, pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel) noexcept { device_view_base::initialize_slots( g, slots, capacity, empty_key_sentinel, empty_value_sentinel); @@ -932,7 +1438,7 @@ class static_map { * equality * @return `true` if the insert was successful, `false` otherwise. */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ bool insert(value_type const& insert_pair, Hash hash = Hash{}, @@ -963,7 +1469,7 @@ class static_map { * @return a pair consisting of an iterator to the element and a bool, * either `true` if the insert was successful, `false` otherwise. */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ thrust::pair insert_and_find( value_type const& insert_pair, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; @@ -988,7 +1494,7 @@ class static_map { * @return `true` if the insert was successful, `false` otherwise. 
*/ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ bool insert(CG const& g, value_type const& insert_pair, @@ -1009,7 +1515,7 @@ class static_map { * equality * @return `true` if the erasure was successful, `false` otherwise. */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ bool erase(key_type const& k, Hash hash = Hash{}, @@ -1032,7 +1538,7 @@ class static_map { * @return `true` if the erasure was successful, `false` otherwise. */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ bool erase(CG const& g, key_type const& k, @@ -1072,8 +1578,8 @@ class static_map { */ __host__ __device__ device_view(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : device_view_base{slots, capacity, empty_key_sentinel, empty_value_sentinel} { } @@ -1090,9 +1596,9 @@ class static_map { */ __host__ __device__ device_view(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel) noexcept : device_view_base{ slots, capacity, empty_key_sentinel, empty_value_sentinel, erased_key_sentinel} { @@ -1106,9 +1612,9 @@ class static_map { __host__ __device__ explicit device_view(device_mutable_view mutable_map) : device_view_base{mutable_map.get_slots(), mutable_map.get_capacity(), - sentinel::empty_key{mutable_map.get_empty_key_sentinel()}, - sentinel::empty_value{mutable_map.get_empty_value_sentinel()}, - sentinel::erased_key{mutable_map.get_erased_key_sentinel()}} + empty_key{mutable_map.get_empty_key_sentinel()}, + 
empty_value{mutable_map.get_empty_value_sentinel()}, + erased_key{mutable_map.get_erased_key_sentinel()}} { } @@ -1177,12 +1683,11 @@ class static_map { g.sync(); #endif - return device_view( - memory_to_use, - source_device_view.get_capacity(), - sentinel::empty_key{source_device_view.get_empty_key_sentinel()}, - sentinel::empty_value{source_device_view.get_empty_value_sentinel()}, - sentinel::erased_key{source_device_view.get_erased_key_sentinel()}); + return device_view(memory_to_use, + source_device_view.get_capacity(), + empty_key{source_device_view.get_empty_key_sentinel()}, + empty_value{source_device_view.get_empty_value_sentinel()}, + erased_key{source_device_view.get_erased_key_sentinel()}); } /** @@ -1200,7 +1705,7 @@ class static_map { * @return An iterator to the position at which the key/value pair * containing `k` was inserted */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ iterator find(Key const& k, Hash hash = Hash{}, @@ -1220,7 +1725,7 @@ class static_map { * @return An iterator to the position at which the key/value pair * containing `k` was inserted */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ const_iterator find(Key const& k, Hash hash = Hash{}, @@ -1247,7 +1752,7 @@ class static_map { * containing `k` was inserted */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ iterator find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; @@ -1273,7 +1778,7 @@ class static_map { * containing `k` was inserted */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ const_iterator find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) const noexcept; @@ -1302,7 +1807,7 @@ class static_map { * containing `k` was inserted */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ 
bool contains(ProbeKey const& k, Hash hash = Hash{}, @@ -1337,7 +1842,7 @@ class static_map { */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ std::enable_if_t, bool> contains( CG const& g, @@ -1397,9 +1902,9 @@ class static_map { { return device_view(slots_, capacity_, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}, - sentinel::erased_key{erased_key_sentinel_}); + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}, + erased_key{erased_key_sentinel_}); } /** @@ -1411,13 +1916,13 @@ class static_map { { return device_mutable_view(slots_, capacity_, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}, - sentinel::erased_key{erased_key_sentinel_}); + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}, + erased_key{erased_key_sentinel_}); } private: - pair_atomic_type* slots_{nullptr}; ///< Pointer to flat slots storage + pair_atomic_type* slots_{}; ///< Pointer to flat slots storage std::size_t capacity_{}; ///< Total number of slots std::size_t size_{}; ///< Number of keys in map Key empty_key_sentinel_{}; ///< Key value that represents an empty slot @@ -1430,3 +1935,4 @@ class static_map { } // namespace cuco #include +#include diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh new file mode 100644 index 000000000..c41ed88f3 --- /dev/null +++ b/include/cuco/static_map_ref.cuh @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace cuco { +namespace experimental { + +/** + * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary + * operations defined in `include/cuco/operator.hpp` + * + * @note Concurrent modify and lookup will be supported if both kinds of operators are specified + * during the ref construction. + * @note cuCollections data structures always place the slot keys on the left-hand + * side when invoking the key comparison predicate. + * @note Ref types are trivially-copyable and are intended to be passed by value. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the size of the given payload type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the given payload type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam T Type used for mapped values. 
Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for options) + * @tparam StorageRef Storage ref type + * @tparam Operators Device operator options defined in `include/cuco/operator.hpp` + */ +template +class static_map_ref + : public detail::operator_impl< + Operators, + static_map_ref>... { + using impl_type = detail::open_addressing_ref_impl; + + static_assert(sizeof(T) <= 8, "Container does not support payload types larger than 8 bytes."); + + static_assert( + cuco::is_bitwise_comparable_v, + "Key type must have unique object representations or have been explicitly declared as safe for " + "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); + + public: + using key_type = Key; ///< Key type + using mapped_type = T; ///< Mapped type + using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using storage_ref_type = StorageRef; ///< Type of storage ref + using window_type = typename storage_ref_type::window_type; ///< Window type + using value_type = typename storage_ref_type::value_type; ///< Storage element type + using extent_type = typename storage_ref_type::extent_type; ///< Extent type + using size_type = typename storage_ref_type::size_type; ///< Probing scheme size type + using key_equal = KeyEqual; ///< Type of key equality binary callable + using iterator = typename storage_ref_type::iterator; ///< Slot iterator type + using const_iterator = typename storage_ref_type::const_iterator; ///< Const slot iterator type + + static constexpr auto cg_size = probing_scheme_type::cg_size; ///< Cooperative group size + static constexpr auto window_size = + storage_ref_type::window_size; ///< Number of elements handled per window + + /** + * @brief Constructs static_map_ref. 
+ * + * @param empty_key_sentinel Sentinel indicating empty key + * @param empty_value_sentinel Sentinel indicating empty payload + * @param predicate Key equality binary callable + * @param probing_scheme Probing scheme + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr static_map_ref( + cuco::empty_key empty_key_sentinel, + cuco::empty_value empty_value_sentinel, + key_equal const& predicate, + probing_scheme_type const& probing_scheme, + storage_ref_type storage_ref) noexcept; + + /** + * @brief Operator-agnostic move constructor. + * + * @tparam OtherOperators Operator set of the `other` object + * + * @param other Object to construct `*this` from + */ + template + __host__ __device__ explicit constexpr static_map_ref( + static_map_ref&& + other) noexcept; + + /** + * @brief Gets the maximum number of elements the container can hold. + * + * @return The maximum number of elements the container can hold + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] __host__ __device__ constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty payload slot. + * + * @return The sentinel value used to represent an empty payload slot + */ + [[nodiscard]] __host__ __device__ constexpr mapped_type empty_value_sentinel() const noexcept; + + /** + * @brief Creates a reference with new operators from the current object. + * + * Note that this function uses move semantics and thus invalidates the current object. + * + * @warning Using two or more reference objects to the same container but with + * a different operator set at the same time results in undefined behavior.
+ * + * @tparam NewOperators List of `cuco::op::*_tag` types + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return `*this` with `NewOperators...` + */ + template + [[nodiscard]] __host__ __device__ auto with(NewOperators... ops) && noexcept; + + private: + struct predicate_wrapper; + + impl_type impl_; ///< Static map ref implementation + predicate_wrapper predicate_; ///< Key equality binary callable + mapped_type empty_value_sentinel_; ///< Empty value sentinel + + // Mixins need to be friends with this class in order to access private members + template + friend class detail::operator_impl; + + // Refs with other operator sets need to be friends too + template + friend class static_map_ref; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index ef43b2175..9e2a2e280 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,13 +16,13 @@ #pragma once -#include #include -#include #include +#include #include #include -#include +#include +#include #include @@ -130,8 +130,7 @@ template , - class ProbeSequence = - cuco::double_hashing<8, detail::MurmurHash3_32, detail::MurmurHash3_32>> + class ProbeSequence = cuco::double_hashing<8, cuco::default_hash_function>> class static_multimap { static_assert( cuco::is_bitwise_comparable_v, @@ -149,14 +148,14 @@ class static_multimap { "cuco::linear_probing."); public: - using value_type = cuco::pair_type; ///< Type of key/value pairs + using value_type = cuco::pair; ///< Type of key/value pairs using key_type = Key; ///< Key type using mapped_type = Value; ///< Type of mapped values using atomic_key_type = cuda::atomic; ///< Type of atomic keys using atomic_mapped_type = cuda::atomic; ///< Type of atomic mapped values using pair_atomic_type = - cuco::pair_type; ///< Pair type of atomic key and atomic mapped value + cuco::pair; ///< Pair type of atomic key and atomic mapped value using atomic_ctr_type = cuda::atomic; ///< Atomic counter type using allocator_type = Allocator; ///< Allocator type using slot_allocator_type = typename std::allocator_traits::rebind_alloc< @@ -224,8 +223,8 @@ class static_multimap { * @param alloc Allocator used for allocating device storage */ static_multimap(std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, cudaStream_t stream = 0, Allocator const& alloc = Allocator{}); @@ -610,8 +609,8 @@ class static_multimap { __host__ __device__ device_view_base(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : impl_{slots, capacity, empty_key_sentinel.value, empty_value_sentinel.value} { } @@ -713,11 +712,10 @@ class 
static_multimap { * @param empty_value_sentinel The reserved value for mapped values to * represent empty slots */ - __host__ __device__ - device_mutable_view(pair_atomic_type* slots, - std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + __host__ __device__ device_mutable_view(pair_atomic_type* slots, + std::size_t capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : view_base_type{slots, capacity, empty_key_sentinel, empty_value_sentinel} { } @@ -769,8 +767,8 @@ class static_multimap { */ __host__ __device__ device_view(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : view_base_type{slots, capacity, empty_key_sentinel, empty_value_sentinel} { } @@ -1324,8 +1322,8 @@ class static_multimap { { return device_view(slots_.get(), capacity_, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}); + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}); } /** @@ -1338,8 +1336,8 @@ class static_multimap { { return device_mutable_view(slots_.get(), capacity_, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}); + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}); } private: diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh new file mode 100644 index 000000000..613a99bd4 --- /dev/null +++ b/include/cuco/static_set.cuh @@ -0,0 +1,469 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#if defined(CUCO_HAS_CUDA_BARRIER) +#include +#endif + +#include +#include + +namespace cuco { +namespace experimental { +/** + * @brief A GPU-accelerated, unordered, associative container of unique keys. + * + * The `static_set` supports two types of operations: + * - Host-side "bulk" operations + * - Device-side "singular" operations + * + * The host-side bulk operations include `insert`, `contains`, etc. These APIs should be used when + * there are a large number of keys to modify or lookup. For example, given a range of keys + * specified by device-accessible iterators, the bulk `insert` function will insert all keys into + * the set. + * + * The singular device-side operations allow individual threads (or cooperative groups) to perform + * independent modify or lookup operations from device code. These operations are accessed through + * non-owning, trivially copyable reference types (or "ref"). User can combine any arbitrary + * operators (see options in `include/cuco/operator.hpp`) when creating the ref. Concurrent modify + * and lookup will be supported if both kinds of operators are specified during the ref + * construction. + * + * @note Allows constant time concurrent modify or lookup operations from threads in device code. 
+ * @note cuCollections data structures always place the slot keys on the left-hand side when + * invoking the key comparison predicate, i.e., `pred(slot_key, query_key)`. Order-sensitive + * `KeyEqual` should be used with caution. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` + * @tparam Extent Data structure size type + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for choices) + * @tparam Allocator Type of allocator used for device storage + * @tparam Storage Slot window storage type + */ + +template , + cuda::thread_scope Scope = cuda::thread_scope_device, + class KeyEqual = thrust::equal_to, + class ProbingScheme = experimental::double_hashing<4, // CG size + cuco::default_hash_function>, + class Allocator = cuco::cuda_allocator, + class Storage = cuco::experimental::storage<1>> +class static_set { + using impl_type = detail:: + open_addressing_impl; + + public: + static constexpr auto cg_size = impl_type::cg_size; ///< CG size used for probing + static constexpr auto window_size = impl_type::window_size; ///< Window size used for probing + static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope + + using key_type = typename impl_type::key_type; ///< Key type + using value_type = typename impl_type::value_type; ///< Key type + using 
extent_type = typename impl_type::extent_type; ///< Extent type + using size_type = typename impl_type::size_type; ///< Size type + using key_equal = typename impl_type::key_equal; ///< Key equality comparator type + using allocator_type = typename impl_type::allocator_type; ///< Allocator type + /// Non-owning window storage ref type + using storage_ref_type = typename impl_type::storage_ref_type; + using probing_scheme_type = typename impl_type::probing_scheme_type; ///< Probing scheme type + + template + using ref_type = + cuco::experimental::static_set_ref; ///< Non-owning container ref type + + static_set(static_set const&) = delete; + static_set& operator=(static_set const&) = delete; + + static_set(static_set&&) = default; ///< Move constructor + + /** + * @brief Replaces the contents of the container with another container. + * + * @return Reference of the current map object + */ + static_set& operator=(static_set&&) = default; + ~static_set() = default; + + /** + * @brief Constructs a statically-sized set with the specified initial capacity, sentinel values + * and CUDA stream. + * + * The actual set capacity depends on the given `capacity`, the probing scheme, CG size, and the + * window size and it is computed via the `make_window_extent` factory. Insert operations will not + * automatically grow the set. Attempting to insert more unique keys than the capacity of the map + * results in undefined behavior. + * + * @note Any `*_sentinel`s are reserved and behavior is undefined when attempting to insert + * this sentinel value. + * @note If a non-default CUDA stream is provided, the caller is responsible for synchronizing the + * stream before the object is first used. 
+ * + * @param capacity The requested lower-bound set size + * @param empty_key_sentinel The reserved key value for empty slots + * @param pred Key equality binary predicate + * @param probing_scheme Probing scheme + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the set + */ + constexpr static_set(Extent capacity, + empty_key empty_key_sentinel, + KeyEqual const& pred = {}, + ProbingScheme const& probing_scheme = {}, + Allocator const& alloc = {}, + cuda_stream_ref stream = {}); + + /** + * @brief Erases all elements from the container. After this call, `size()` returns zero. + * Invalidates any references, pointers, or iterators referring to contained elements. + * + * @param stream CUDA stream this operation is executed in + */ + void clear(cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Asynchronously erases all elements from the container. After this call, `size()` returns + * zero. Invalidates any references, pointers, or iterators referring to contained elements. + * + * @param stream CUDA stream this operation is executed in + */ + void clear_async(cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Inserts all keys in the range `[first, last)` and returns the number of successful + * insertions. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `insert_async`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_set::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + * + * @return Number of successfully inserted keys + */ + template + size_type insert(InputIt first, InputIt last, cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously inserts all keys in the range `[first, last)`. 
+ * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_set::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns + * true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * @note This function synchronizes the given stream and returns the number of successful + * insertions. For asynchronous execution use `insert_if_async`. + * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + * + * @return Number of successfully inserted keys + */ + template + size_type insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding + * stencil returns true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. 
+ * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + */ + template + void insert_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the set. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_async`. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains(InputIt first, + InputIt last, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in + * the set. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the set if + * `pred` of the corresponding stencil returns true. + * + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present in the set. If `pred( *(stencil + i) )` is + * false, stores false to `(output_begin + i)`. + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_if_async`. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in + * the set if `pred` of the corresponding stencil returns true. + * + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present in the set. If `pred( *(stencil + i) )` is + * false, stores false to `(output_begin + i)`. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief For all keys in the range `[first, last)`, finds an element with key equivalent to the + * query key. + * + * @note This function synchronizes the given stream. For asynchronous execution use `find_async`. + * @note If the key `*(first + i)` has a matched `element` in the set, copies `element` to + * `(output_begin + i)`. Else, copies the empty key sentinel. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the set's `key_type` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of elements retrieved for each key + * @param stream Stream used for executing the kernels + */ + template + void find(InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream = {}) const; + + /** + * @brief For all keys in the range `[first, last)`, asynchronously finds an element with key + * equivalent to the query key. + * + * @note If the key `*(first + i)` has a matched `element` in the set, copies `element` to + * `(output_begin + i)`. Else, copies the empty key sentinel. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the set's `key_type` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of elements retrieved for each key + * @param stream Stream used for executing the kernels + */ + template + void find_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Retrieves all keys contained in the set. + * + * @note This API synchronizes the given stream. + * @note The order in which keys are returned is implementation defined and not guaranteed to be + * consistent between subsequent calls to `retrieve_all`. + * @note Behavior is undefined if the range beginning at `output_begin` is smaller than the return + * value of `size()`. + * + * @tparam OutputIt Device accessible random access output iterator whose `value_type` is + * convertible from the container's `key_type`. 
+ * + * @param output_begin Beginning output iterator for keys + * @param stream CUDA stream used for this operation + * + * @return Iterator indicating the end of the output + */ + template + OutputIt retrieve_all(OutputIt output_begin, cuda_stream_ref stream = {}) const; + + /** + * @brief Gets the number of elements in the container. + * + * @note This function synchronizes the given stream. + * + * @param stream CUDA stream used to get the number of inserted elements + * @return The number of elements in the container + */ + [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief Gets the maximum number of elements the hash set can hold. + * + * @return The maximum number of elements the hash set can hold + */ + [[nodiscard]] constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Get device ref with operators. + * + * @tparam Operators Set of `cuco::op` to be provided by the ref + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return Device ref of the current `static_set` object + */ + template + [[nodiscard]] auto ref(Operators... ops) const noexcept; + + private: + std::unique_ptr impl_; +}; +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh new file mode 100644 index 000000000..b2c8158e7 --- /dev/null +++ b/include/cuco/static_set_ref.cuh @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuco { +namespace experimental { + +/** + * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary + * operations defined in `include/cuco/operator.hpp` + * + * @note Concurrent modify and lookup will be supported if both kinds of operators are specified + * during the ref construction. + * @note cuCollections data structures always place the slot keys on the left-hand + * side when invoking the key comparison predicate. + * @note Ref types are trivially-copyable and are intended to be passed by value. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam Scope The scope in which operations will be performed by individual threads. 
+ * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for options) + * @tparam StorageRef Storage ref type + * @tparam Operators Device operator options defined in `include/cuco/operator.hpp` + */ +template +class static_set_ref + : public detail::operator_impl< + Operators, + static_set_ref>... { + using impl_type = detail::open_addressing_ref_impl; + + public: + using key_type = Key; ///< Key Type + using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using storage_ref_type = StorageRef; ///< Type of storage ref + using window_type = typename storage_ref_type::window_type; ///< Window type + using value_type = typename storage_ref_type::value_type; ///< Storage element type + using extent_type = typename storage_ref_type::extent_type; ///< Extent type + using size_type = typename storage_ref_type::size_type; ///< Probing scheme size type + using key_equal = KeyEqual; ///< Type of key equality binary callable + using iterator = typename storage_ref_type::iterator; ///< Slot iterator type + using const_iterator = typename storage_ref_type::const_iterator; ///< Const slot iterator type + + static constexpr auto cg_size = probing_scheme_type::cg_size; ///< Cooperative group size + static constexpr auto window_size = + storage_ref_type::window_size; ///< Number of elements handled per window + + /** + * @brief Constructs static_set_ref. + * + * @param empty_key_sentinel Sentinel indicating empty key + * @param predicate Key equality binary callable + * @param probing_scheme Probing scheme + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr static_set_ref( + cuco::empty_key empty_key_sentinel, + key_equal const& predicate, + probing_scheme_type const& probing_scheme, + storage_ref_type storage_ref) noexcept; + + /** + * @brief Operator-agnostic move constructor. 
+ * + * @tparam OtherOperators Operator set of the `other` object + * + * @param other Object to construct `*this` from + */ + template + __host__ __device__ explicit constexpr static_set_ref( + static_set_ref&& + other) noexcept; + + /** + * @brief Gets the maximum number of elements the container can hold. + * + * @return The maximum number of elements the container can hold + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] __host__ __device__ constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Creates a reference with new operators from the current object. + * + * Note that this function uses move semantics and thus invalidates the current object. + * + * @warning Using two or more reference objects to the same container but with + * a different operator set at the same time results in undefined behavior. + * + * @tparam NewOperators List of `cuco::op::*_tag` types + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return `*this` with `NewOperators...` + */ + template + [[nodiscard]] __host__ __device__ auto with(NewOperators... ops) && noexcept; + + private: + impl_type impl_; + detail::equal_wrapper predicate_; ///< Key equality binary callable + + // Mixins need to be friends with this class in order to access private members + template + friend class detail::operator_impl; + + // Refs with other operator sets need to be friends too + template + friend class static_set_ref; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/storage.cuh b/include/cuco/storage.cuh new file mode 100644 index 000000000..e2e0c6f46 --- /dev/null +++ b/include/cuco/storage.cuh @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { + +/** + * @brief Public storage class. + * + * @note This is a public interface used to control storage window size. A window consists of a + * number of contiguous slots. The window size defines the workload granularity for each CUDA + * thread, i.e., how many slots a thread would concurrently operate on when performing modify or + * lookup operations. cuCollections uses the AoW storage to supersede the raw flat slot storage due + * to its superior granularity control: When window size equals one, AoW performs the same as the + * flat storage. If the underlying operation is more memory bandwidth bound, e.g., high occupancy + * multimap operations, a larger window size can reduce the length of probing sequences thus improve + * runtime performance. 
+ * + * @tparam WindowSize Number of elements per window storage + */ +template +class storage { + public: + /// Number of slots per window storage + static constexpr int32_t window_size = WindowSize; + + /// Type of implementation details + template + using impl = aow_storage; +}; + +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/allocator.hpp b/include/cuco/utility/allocator.hpp similarity index 97% rename from include/cuco/allocator.hpp rename to include/cuco/utility/allocator.hpp index c19552963..583571620 100644 --- a/include/cuco/allocator.hpp +++ b/include/cuco/utility/allocator.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/include/cuco/utility/error.hpp b/include/cuco/utility/error.hpp new file mode 100644 index 000000000..eb6a5f2e3 --- /dev/null +++ b/include/cuco/utility/error.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cuco { +/** + * @brief Exception thrown when logical precondition is violated. + * + * This exception should not be thrown directly and is instead thrown by the + * CUCO_EXPECTS macro. 
+ */ +struct logic_error : public std::logic_error { + /** + * @brief Constructs a logic_error with the error message. + * + * @param message Message to be associated with the exception + */ + logic_error(char const* const message) : std::logic_error(message) {} + + /** + * @brief Construct a new logic error object with error message + * + * @param message Message to be associated with the exception + */ + logic_error(std::string const& message) : std::logic_error(message) {} +}; +/** + * @brief Exception thrown when a CUDA error is encountered. + * + */ +struct cuda_error : public std::runtime_error { + /** + * @brief Constructs a `cuda_error` object with the given `message`. + * + * @param message The error char array used to construct `cuda_error` + */ + cuda_error(const char* message) : std::runtime_error(message) {} + /** + * @brief Constructs a `cuda_error` object with the given `message` string. + * + * @param message The `std::string` used to construct `cuda_error` + */ + cuda_error(std::string const& message) : cuda_error{message.c_str()} {} +}; +} // namespace cuco diff --git a/include/cuco/utility/fast_int.cuh b/include/cuco/utility/fast_int.cuh new file mode 100644 index 000000000..6616e2c5c --- /dev/null +++ b/include/cuco/utility/fast_int.cuh @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace cuco::utility { + +/** + * @brief Integer type with optimized division and modulo operators. + * + * @tparam T Underlying integer type + */ +template +struct fast_int { + static_assert(cuda::std::is_same_v or cuda::std::is_same_v +#if defined(CUCO_HAS_INT128) + or cuda::std::is_same_v or cuda::std::is_same_v +#endif + , + "Unsupported integer type"); + + using value_type = T; ///< Underlying integer type + + /** + * @brief Constructs a fast_int from an integer value. + * + * @param value Integer value + */ + __host__ __device__ explicit constexpr fast_int(T value) noexcept : value_{value} + { + evaluate_magic_numbers(); + } + + /** + * @brief Get the underlying integer value. + * + * @return Underlying value + */ + __host__ __device__ constexpr value_type value() const noexcept { return value_; } + + /** + * @brief Explicit conversion operator to the underlying value type. + * + * @return Underlying value + */ + __host__ __device__ explicit constexpr operator value_type() const noexcept { return value_; } + + private: + using intermediate_type = + cuda::std::conditional_t; ///< Intermediate type for multiplication + using unsigned_value_type = cuda::std::make_unsigned_t; ///< Unsigned value type + using signed_value_type = cuda::std::make_signed_t; ///< Signed value type + + static constexpr value_type value_bits = + CHAR_BIT * sizeof(value_type); ///< Number of bits required to represent the value + + /** + * @brief Computes the high bits of the multiplication of two unsigned integers. 
+ * + * @param lhs Left-hand side of the multiplication + * @param rhs Right-hand side of the multiplication + * + * @return High bits of the multiplication + */ + __host__ __device__ constexpr value_type mulhi(unsigned_value_type lhs, + unsigned_value_type rhs) const noexcept + { +#if defined(__CUDA_ARCH__) + if constexpr (sizeof(value_type) == 4) { + return __umulhi(lhs, rhs); + } else { + return __umul64hi(lhs, rhs); + } +#else + return (intermediate_type(lhs) * intermediate_type(rhs)) >> value_bits; +#endif + } + + /** + * @brief Computes the log2 of an unsigned integer. + * + * @param v Unsigned integer + * + * @return Log2 of the unsigned integer + */ + __host__ __device__ constexpr value_type log2(value_type v) const noexcept + { + return cuda::std::bit_width(unsigned_value_type(v)) - 1; + } + + /** + * @brief Computes the magic numbers for the fast division. + */ + __host__ __device__ constexpr void evaluate_magic_numbers() noexcept + { + // TODO assert(value_ > 0); + auto const val_log2 = this->log2(value_); + + // if value_ is a power of 2, we can use a simple shift + if (cuda::std::has_single_bit(unsigned_value_type(value_))) { + magic_ = 0; + shift_ = val_log2; + } else { + auto upper = intermediate_type(1) << value_bits; + auto lower = intermediate_type(1); + auto const lval = intermediate_type(value_); + + // compute the magic number and shift; see "Hacker's Delight" by Henry S. 
Warren, Jr., 10-2 + for (shift_ = 0; shift_ < val_log2; ++shift_, upper <<= 1, lower <<= 1) { + if ((upper % lval) <= lower) { break; } + } + magic_ = upper / lval; + } + } + + value_type value_; ///< Underlying integer value + value_type magic_; ///< Magic number for fast division + value_type shift_; ///< Shift for fast division + + template + friend __host__ __device__ constexpr value_type operator/(Lhs lhs, fast_int const& rhs) noexcept + { + static_assert(cuda::std::is_same_v, + "Left-hand side operand must be of type value_type."); + if (rhs.value_ == 1) { return lhs; } // edge case for value_ == 1 + if (rhs.magic_ == 0) { return lhs >> rhs.shift_; } // edge case for value_ == pow2 + auto const mul = (lhs == cuda::std::numeric_limits::max()) ? lhs : lhs + 1; + return rhs.mulhi(rhs.magic_, mul) >> rhs.shift_; + } + + template + friend __host__ __device__ constexpr value_type operator%(Lhs lhs, fast_int const& rhs) noexcept + { + return lhs - (lhs / rhs) * rhs.value_; + } +}; +} // namespace cuco::utility \ No newline at end of file diff --git a/include/cuco/utility/key_generator.hpp b/include/cuco/utility/key_generator.hpp new file mode 100644 index 000000000..deea62a62 --- /dev/null +++ b/include/cuco/utility/key_generator.hpp @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cuco::utility { + +namespace distribution { + +/** + * @brief Tag struct representing a random distribution of unique keys. + */ +struct unique { +}; + +/** + * @brief Tag struct representing a uniform distribution. + */ +struct uniform : public cuco::detail::strong_type { + /** + * @param multiplicity Average key multiplicity of the distribution. + */ + uniform(int64_t multiplicity) : cuco::detail::strong_type{multiplicity} + { + CUCO_EXPECTS(multiplicity > 0, "Multiplicity must be greater than 0"); + } +}; + +/** + * @brief Tag struct representing a gaussian distribution. + */ +struct gaussian : public cuco::detail::strong_type { + /** + * @param skew 0 represents a uniform distribution; ∞ represents a Dirac delta distribution. + */ + gaussian(double skew) : cuco::detail::strong_type{skew} + { + CUCO_EXPECTS(skew > 0, "Skew must be greater than 0"); + } +}; + +} // namespace distribution + +/** + * @brief Random key generator. + * + * @tparam RNG Pseudo-random number generator + */ +template +class key_generator { + public: + /** + * @brief Construct a new key generator object. + * + * @param seed Seed for the random number generator + */ + key_generator(uint32_t seed = static_cast(time(nullptr))) : rng_(seed) {} + + /** + * @brief Generates a sequence of random keys in the interval [0, N). 
+ * + * @tparam Dist Key distribution type + * @tparam OutputIt Ouput iterator typy which value type is the desired key type + * @tparam ExecPolicy Thrust execution policy + * @tparam Enable SFINAE helper + * + * @param dist Random distribution to use + * @param out_begin Start of the output sequence + * @param out_end End of the output sequence + * @param exec_policy Thrust execution policy this operation will be executed with + */ + template ::value>> + void generate(Dist dist, OutputIt out_begin, OutputIt out_end, ExecPolicy exec_policy) + { + using value_type = typename std::iterator_traits::value_type; + + if constexpr (std::is_same_v) { + thrust::sequence(exec_policy, out_begin, out_end, 0); + thrust::shuffle(exec_policy, out_begin, out_end, this->rng_); + } else if constexpr (std::is_same_v) { + size_t num_keys = thrust::distance(out_begin, out_end); + + thrust::counting_iterator seeds(this->rng_()); + + thrust::transform(exec_policy, + seeds, + seeds + num_keys, + out_begin, + [*this, dist, num_keys] __host__ __device__(size_t const seed) { + RNG rng; + thrust::uniform_int_distribution uniform_dist( + 1, num_keys / dist.value); + rng.seed(seed); + return uniform_dist(rng); + }); + } else if constexpr (std::is_same_v) { + size_t num_keys = thrust::distance(out_begin, out_end); + + thrust::counting_iterator seq(this->rng_()); + + thrust::transform(exec_policy, + seq, + seq + num_keys, + out_begin, + [*this, dist, num_keys] __host__ __device__(size_t const seed) { + RNG rng; + thrust::normal_distribution<> normal_dist( + static_cast(num_keys / 2), num_keys * dist.value); + rng.seed(seed); + auto val = normal_dist(rng); + while (val < 0 or val >= num_keys) { + // Re-sample if the value is outside the range [0, N) + // This is necessary because the normal distribution is not bounded + // might be a better way to do this, e.g., discard(n) + val = normal_dist(rng); + } + return val; + }); + } else { + CUCO_FAIL("Unexpected distribution type"); + } + } + + /** + * 
@brief Overload of 'generate' which automatically selects a suitable execution policy
+ *
+ * @tparam Dist Key distribution type
+ * @tparam OutputIt Output iterator type whose value type is the desired key type
+ *
+ * @param dist Random distribution to use
+ * @param out_begin Start of the output sequence
+ * @param out_end End of the output sequence
+ */
+ template
+ void generate(Dist dist, OutputIt out_begin, OutputIt out_end)
+ {
+ using thrust::system::detail::generic::select_system;
+
+ typedef typename thrust::iterator_system::type System;
+ System system;
+
+ generate(dist, out_begin, out_end, select_system(system));
+ }
+
+ /**
+ * @brief Overload of 'generate' which uses 'thrust::cuda::par_nosync' execution policy on CUDA
+ * stream 'stream'
+ *
+ * @tparam Dist Key distribution type
+ * @tparam OutputIt Output iterator type whose value type is the desired key type
+ *
+ * @param dist Random distribution to use
+ * @param out_begin Start of the output sequence
+ * @param out_end End of the output sequence
+ * @param stream CUDA stream in which this operation is executed
+ */
+ template
+ void generate(Dist dist, OutputIt out_begin, OutputIt out_end, cudaStream_t stream)
+ {
+ generate(dist, out_begin, out_end, thrust::cuda::par_nosync.on(stream));
+ }
+
+ /**
+ * @brief Randomly replaces previously generated keys with new keys outside the input
+ * distribution.
+ * + * @tparam InOutIt Input/Ouput iterator typy which value type is the desired key type + * @tparam ExecPolicy Thrust execution policy + * @tparam Enable SFINAE helper + * + * @param begin Start of the key sequence + * @param end End of the key sequence + * @param keep_prob Probability that a key is kept + * @param exec_policy Thrust execution policy this operation will be executed with + */ + template ::value>> + void dropout(InOutIt begin, InOutIt end, double keep_prob, ExecPolicy exec_policy) + { + using value_type = typename std::iterator_traits::value_type; + + CUCO_EXPECTS(keep_prob >= 0.0 and keep_prob <= 1.0, "Probability needs to be between 0 and 1"); + + if (keep_prob < 1.0) { + size_t num_keys = thrust::distance(begin, end); + + thrust::counting_iterator seeds(rng_()); + + thrust::transform_if( + exec_policy, + seeds, + seeds + num_keys, + begin, + [num_keys] __host__ __device__(size_t const seed) { + RNG rng; + thrust::uniform_int_distribution non_match_dist{ + static_cast(num_keys), std::numeric_limits::max()}; + rng.seed(seed); + return non_match_dist(rng); + }, + [keep_prob] __host__ __device__(size_t const seed) { + RNG rng; + thrust::uniform_real_distribution rate_dist(0.0, 1.0); + rng.seed(seed); + return (rate_dist(rng) > keep_prob); + }); + } + + thrust::shuffle(exec_policy, begin, end, rng_); + } + + /** + * @brief Overload of 'dropout' which automatically selects a suitable execution policy + * + * @tparam InOutIt Input/Ouput iterator typy which value type is the desired key type + * + * @param begin Start of the key sequence + * @param end End of the key sequence + * @param keep_prob Probability that a key is kept + */ + template + void dropout(InOutIt begin, InOutIt end, double keep_prob) + { + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + System system; + + dropout(begin, end, keep_prob, select_system(system)); + } + + /** + * @brief Overload of 'dropout' which uses 
'thrust::cuda::par_nosync' execution policy on CUDA + * stream 'stream' + * + * @tparam InOutIt Input/Ouput iterator typy which value type is the desired key type + * + * @param begin Start of the key sequence + * @param end End of the key sequence + * @param keep_prob Probability that a key is kept + * @param stream CUDA stream in which this operation is executed in + */ + template + void dropout(InOutIt begin, InOutIt end, double keep_prob, cudaStream_t stream) + { + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + System system; + + dropout(begin, end, keep_prob, thrust::cuda::par_nosync.on(stream)); + } + + private: + RNG rng_; ///< Random number generator +}; + +} // namespace cuco::utility diff --git a/include/cuco/traits.hpp b/include/cuco/utility/traits.hpp similarity index 87% rename from include/cuco/traits.hpp rename to include/cuco/utility/traits.hpp index 445a40daf..1a6252dcb 100644 --- a/include/cuco/traits.hpp +++ b/include/cuco/utility/traits.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,9 @@ #pragma once +#include +#include + #include namespace cuco { @@ -58,4 +61,10 @@ inline constexpr bool is_bitwise_comparable_v = is_bitwise_comparable::value; }; \ } +template +inline constexpr bool dependent_bool_value = value; + +template +inline constexpr bool dependent_false = dependent_bool_value; + } // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2d1d25526..3deeeddf1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. #============================================================================= -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(CTest) @@ -23,35 +23,49 @@ include(CTest) CPMAddPackage( NAME Catch2 GITHUB_REPOSITORY catchorg/Catch2 - VERSION 2.13.9 + VERSION 3.3.0 ) +# Header for catch_discover_tests if(Catch2_ADDED) - include(${Catch2_SOURCE_DIR}/contrib/Catch.cmake) + include(${Catch2_SOURCE_DIR}/extras/Catch.cmake) endif() -# catch_main.cpp defines `CATCH_CONFIG_MAIN` which provides main() -# Compiles it to be linked into test executables -add_library(CatchMain OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/catch_main.cpp) -target_link_libraries(CatchMain Catch2::Catch2) - ################################################################################################### function(ConfigureTest TEST_NAME) - add_executable(${TEST_NAME} ${ARGN} - $) # Link in the CatchMain object file - target_link_libraries(${TEST_NAME} Catch2::Catch2 cuco CUDA::cudart) + add_executable(${TEST_NAME} ${ARGN}) + target_link_libraries(${TEST_NAME} PRIVATE Catch2::Catch2WithMain cuco CUDA::cudart) target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) set_target_properties(${TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests") target_compile_options(${TEST_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) - catch_discover_tests(${TEST_NAME}) + catch_discover_tests(${TEST_NAME} EXTRA_ARGS --allow-running-no-tests) endfunction(ConfigureTest) ################################################################################################### ### 
test sources ################################################################################## ################################################################################################### +################################################################################################### +# - utility tests --------------------------------------------------------------------------------- +ConfigureTest(UTILITY_TEST + utility/extent_test.cu + utility/storage_test.cu + utility/fast_int_test.cu + utility/hash_test.cu) + +################################################################################################### +# - static_set tests ------------------------------------------------------------------------------ +ConfigureTest(STATIC_SET_TEST + static_set/capacity_test.cu + static_set/heterogeneous_lookup_test.cu + static_set/insert_and_find_test.cu + static_set/large_input_test.cu + static_set/retrieve_all_test.cu + static_set/size_test.cu + static_set/unique_sequence_test.cu) + ################################################################################################### # - static_map tests ------------------------------------------------------------------------------ ConfigureTest(STATIC_MAP_TEST @@ -60,6 +74,7 @@ ConfigureTest(STATIC_MAP_TEST static_map/erase_test.cu static_map/heterogeneous_lookup_test.cu static_map/insert_and_find_test.cu + static_map/insert_or_assign_test.cu static_map/key_sentinel_test.cu static_map/shared_memory_test.cu static_map/stream_test.cu @@ -68,7 +83,8 @@ ConfigureTest(STATIC_MAP_TEST ################################################################################################### # - dynamic_map tests ----------------------------------------------------------------------------- ConfigureTest(DYNAMIC_MAP_TEST - dynamic_map/unique_sequence_test.cu) + dynamic_map/unique_sequence_test.cu + dynamic_map/erase_test.cu) ################################################################################################### # 
- static_multimap tests ------------------------------------------------------------------------- @@ -80,3 +96,12 @@ ConfigureTest(STATIC_MULTIMAP_TEST static_multimap/multiplicity_test.cu static_multimap/non_match_test.cu static_multimap/pair_function_test.cu) + +################################################################################################### +# - dynamic_bitset tests -------------------------------------------------------------------------- +ConfigureTest(DYNAMIC_BITSET_TEST + dynamic_bitset/find_next_test.cu + dynamic_bitset/get_test.cu + dynamic_bitset/rank_test.cu + dynamic_bitset/select_test.cu + dynamic_bitset/size_test.cu) diff --git a/tests/catch_main.cpp b/tests/catch_main.cpp deleted file mode 100644 index a7cc18e23..000000000 --- a/tests/catch_main.cpp +++ /dev/null @@ -1,6 +0,0 @@ -// In a Catch project with multiple files, dedicate one file to compile the -// source code of Catch itself and reuse the resulting object file for linking. - -// Let Catch provide main(): -#define CATCH_CONFIG_MAIN -#include diff --git a/tests/dynamic_bitset/find_next_test.cu b/tests/dynamic_bitset/find_next_test.cu new file mode 100644 index 000000000..97ba366ea --- /dev/null +++ b/tests/dynamic_bitset/find_next_test.cu @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include + +#include + +template +__global__ void find_next_kernel(BitsetRef ref, size_type num_elements, OutputIt output) +{ + cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; + while (index < num_elements) { + output[index] = ref.find_next(index); + index += stride; + } +} + +extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu + +TEST_CASE("Find next set test", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + + using size_type = std::size_t; + constexpr size_type num_elements{400}; + + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(modulo_bitgen(i)); + } + + thrust::device_vector device_result(num_elements); + auto ref = bv.ref(); + find_next_kernel<<<1, 1024>>>(ref, num_elements, device_result.data()); + + thrust::host_vector host_result = device_result; + size_type num_matches = 0; + + size_type next_set_pos = -1lu; + do { + next_set_pos++; + } while (next_set_pos < num_elements and !modulo_bitgen(next_set_pos)); + + for (size_type key = 0; key < num_elements; key++) { + num_matches += host_result[key] == next_set_pos; + + if (key == next_set_pos) { + do { + next_set_pos++; + } while (next_set_pos < num_elements and !modulo_bitgen(next_set_pos)); + } + } + REQUIRE(num_matches == num_elements); +} diff --git a/tests/dynamic_bitset/get_test.cu b/tests/dynamic_bitset/get_test.cu new file mode 100644 index 000000000..10f81a116 --- /dev/null +++ b/tests/dynamic_bitset/get_test.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +template +__global__ void test_kernel(BitsetRef ref, size_type num_elements, OutputIt output) +{ + cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; + while (index < num_elements) { + output[index] = ref.test(index); + index += stride; + } +} + +bool modulo_bitgen(uint64_t i) { return i % 7 == 0; } + +TEST_CASE("Get test", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + + using size_type = std::size_t; + constexpr size_type num_elements{400}; + + size_type num_set_ref = 0; + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(modulo_bitgen(i)); + num_set_ref += modulo_bitgen(i); + } + + // Host-bulk test + thrust::device_vector keys(num_elements); + thrust::sequence(keys.begin(), keys.end(), 0); + + thrust::device_vector test_result(num_elements); + thrust::fill(test_result.begin(), test_result.end(), 0); + + bv.test(keys.begin(), keys.end(), test_result.begin()); + + size_type num_set = thrust::reduce(thrust::device, test_result.begin(), test_result.end(), 0); + REQUIRE(num_set == num_set_ref); + + // Device-ref test + auto ref = bv.ref(); + thrust::fill(test_result.begin(), test_result.end(), 0); + test_kernel<<<1, 1024>>>(ref, num_elements, test_result.data()); + + num_set = thrust::reduce(thrust::device, test_result.begin(), test_result.end(), 0); + REQUIRE(num_set == num_set_ref); +} diff --git a/tests/dynamic_bitset/rank_test.cu b/tests/dynamic_bitset/rank_test.cu new file 
mode 100644 index 000000000..3b4d17cca --- /dev/null +++ b/tests/dynamic_bitset/rank_test.cu @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include + +#include + +extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu + +TEST_CASE("Rank test", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + + using size_type = std::size_t; + constexpr size_type num_elements{4000}; + + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(modulo_bitgen(i)); + } + + thrust::device_vector keys(num_elements); + thrust::sequence(keys.begin(), keys.end(), 0); + + thrust::device_vector d_ranks(num_elements); + + bv.rank(keys.begin(), keys.end(), d_ranks.begin()); + + thrust::host_vector h_ranks = d_ranks; + + size_type cur_rank = 0; + size_type num_matches = 0; + for (size_type i = 0; i < num_elements; i++) { + num_matches += cur_rank == h_ranks[i]; + if (modulo_bitgen(i)) { cur_rank++; } + } + REQUIRE(num_matches == num_elements); +} diff --git a/tests/dynamic_bitset/select_test.cu b/tests/dynamic_bitset/select_test.cu new file mode 100644 index 000000000..3dc0d74da --- /dev/null +++ b/tests/dynamic_bitset/select_test.cu @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include + +#include + +template +__global__ void select_false_kernel(BitsetRef ref, size_type num_elements, OutputIt output) +{ + cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; + while (index < num_elements) { + output[index] = ref.select_false(index); + index += stride; + } +} + +extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu + +TEST_CASE("Select test", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + + using size_type = std::size_t; + constexpr size_type num_elements{4000}; + + size_type num_set = 0; + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(modulo_bitgen(i)); + num_set += modulo_bitgen(i); + } + + // Check select + { + thrust::device_vector keys(num_set); + thrust::sequence(keys.begin(), keys.end(), 0); + + thrust::device_vector d_selects(num_set); + + bv.select(keys.begin(), keys.end(), d_selects.begin()); + + thrust::host_vector h_selects = d_selects; + + size_type num_matches = 0; + size_type cur_set_pos = -1lu; + for (size_type i = 0; i < num_set; i++) { + do { + cur_set_pos++; + } while (cur_set_pos < num_elements and !modulo_bitgen(cur_set_pos)); + + num_matches += cur_set_pos == h_selects[i]; + } + REQUIRE(num_matches == num_set); + } + + // Check select_false + { + size_type num_not_set = num_elements - num_set; + + auto ref = bv.ref(); + thrust::device_vector device_result(num_not_set); + select_false_kernel<<<1, 1024>>>(ref, num_not_set, 
device_result.data()); + thrust::host_vector host_result = device_result; + + size_type num_matches = 0; + size_type cur_not_set_pos = -1lu; + for (size_type i = 0; i < num_not_set; i++) { + do { + cur_not_set_pos++; + } while (cur_not_set_pos < num_elements and modulo_bitgen(cur_not_set_pos)); + + num_matches += cur_not_set_pos == host_result[i]; + } + REQUIRE(num_matches == num_not_set); + } +} diff --git a/tests/dynamic_bitset/size_test.cu b/tests/dynamic_bitset/size_test.cu new file mode 100644 index 000000000..611159dc3 --- /dev/null +++ b/tests/dynamic_bitset/size_test.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +TEST_CASE("Size computation", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + using size_type = std::size_t; + constexpr size_type num_elements{400}; + + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(i % 2 == 0); // Alternate 0s and 1s pattern + } + + auto size = bv.size(); + REQUIRE(size == num_elements); +} diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu new file mode 100644 index 000000000..1a60b49b6 --- /dev/null +++ b/tests/dynamic_map/erase_test.cu @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include + +#include + +TEMPLATE_TEST_CASE_SIG("erase key", + "", + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t), + (int32_t, int64_t), + (int64_t, int32_t), + (int64_t, int64_t)) +{ + constexpr std::size_t num_keys = 1'000'000; + cuco::dynamic_map map{num_keys * 2, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + + SECTION("Check single submap insert/erase") + { + thrust::device_vector d_keys(num_keys); + thrust::device_vector d_values(num_keys); + thrust::device_vector d_keys_exist(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1); + thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1); + + auto pairs_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); + + map.insert(pairs_begin, pairs_begin + num_keys); + + REQUIRE(map.get_size() == num_keys); + + map.erase(d_keys.begin(), d_keys.end()); + + // delete decreases count correctly + REQUIRE(map.get_size() == 0); + + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + // keys were actaully deleted + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); + + // ensures that map is reusing deleted slots + map.insert(pairs_begin, pairs_begin + num_keys); + + REQUIRE(map.get_size() == num_keys); + + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + REQUIRE(cuco::test::all_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); 
+ + // erase can act selectively + map.erase(d_keys.begin(), d_keys.begin() + num_keys / 2); + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + REQUIRE(cuco::test::none_of( + d_keys_exist.begin(), d_keys_exist.begin() + num_keys / 2, thrust::identity{})); + + REQUIRE(cuco::test::all_of( + d_keys_exist.begin() + num_keys / 2, d_keys_exist.end(), thrust::identity{})); + + // clear map + map.erase(d_keys.begin() + num_keys / 2, d_keys.end()); + } + + SECTION("Check multiple submaps insert/erase") + { + constexpr std::size_t num = 4 * num_keys; + + thrust::device_vector d_keys(num); + thrust::device_vector d_values(num); + thrust::device_vector d_keys_exist(num); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1); + thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1); + + auto pairs_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); + + map.insert(pairs_begin, pairs_begin + num); + + // map should resize twice if the erased slots are successfully reused + REQUIRE(map.get_capacity() == 2 * num); + // check that keys can be successfully deleted from only the first and second submaps + map.erase(d_keys.begin(), d_keys.begin() + 2 * num_keys); + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + REQUIRE(cuco::test::none_of( + d_keys_exist.begin(), d_keys_exist.begin() + 2 * num_keys, thrust::identity{})); + + REQUIRE(cuco::test::all_of( + d_keys_exist.begin() + 2 * num_keys, d_keys_exist.end(), thrust::identity{})); + + REQUIRE(map.get_size() == 2 * num_keys); + // check that keys can be successfully deleted from all submaps (some will be unsuccessful + // erases) + map.erase(d_keys.begin(), d_keys.end()); + + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); + + REQUIRE(map.get_size() == 0); + } +} diff --git 
a/tests/dynamic_map/unique_sequence_test.cu b/tests/dynamic_map/unique_sequence_test.cu index de26bb3dc..aa01ca51a 100644 --- a/tests/dynamic_map/unique_sequence_test.cu +++ b/tests/dynamic_map/unique_sequence_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ #include #include -#include +#include TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", @@ -38,8 +38,9 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", (int64_t, int64_t)) { constexpr std::size_t num_keys{50'000'000}; + cuco::dynamic_map map{ - 30'000'000, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + 30'000'000, cuco::empty_key{-1}, cuco::empty_value{-1}}; thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); @@ -47,9 +48,9 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); thrust::sequence(thrust::device, d_values.begin(), d_values.end()); - auto pairs_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto pairs_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); thrust::device_vector d_results(num_keys); thrust::device_vector d_contained(num_keys); diff --git a/tests/static_map/custom_type_test.cu b/tests/static_map/custom_type_test.cu index e587613d4..e23216ca3 100644 --- a/tests/static_map/custom_type_test.cu +++ b/tests/static_map/custom_type_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ #include #include -#include +#include #include @@ -113,9 +113,8 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", constexpr std::size_t num = 100; constexpr std::size_t capacity = num * 2; - cuco::static_map map{capacity, - cuco::sentinel::empty_key{sentinel_key}, - cuco::sentinel::empty_value{sentinel_value}}; + cuco::static_map map{ + capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; thrust::device_vector insert_keys(num); thrust::device_vector insert_values(num); @@ -132,9 +131,9 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", insert_values.begin(), [] __device__(auto i) { return Value{i}; }); - auto insert_pairs = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto insert_pairs = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); SECTION("All inserted keys-value pairs should be correctly recovered during find") { @@ -213,7 +212,7 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", map.insert(insert_pairs, insert_pairs + num, hash_custom_key{}, custom_key_equals{}); auto view = map.get_device_view(); REQUIRE(cuco::test::all_of( - insert_pairs, insert_pairs + num, [view] __device__(cuco::pair_type const& pair) { + insert_pairs, insert_pairs + num, [view] __device__(cuco::pair const& pair) { return view.contains(pair.first, hash_custom_key{}, custom_key_equals{}); })); } @@ -221,12 +220,11 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", SECTION("Inserting unique keys should return insert success.") { auto m_view = map.get_device_mutable_view(); - REQUIRE( - cuco::test::all_of(insert_pairs, - insert_pairs + num, - [m_view] __device__(cuco::pair_type const& pair) mutable { - return 
m_view.insert(pair, hash_custom_key{}, custom_key_equals{}); - })); + REQUIRE(cuco::test::all_of(insert_pairs, + insert_pairs + num, + [m_view] __device__(cuco::pair const& pair) mutable { + return m_view.insert(pair, hash_custom_key{}, custom_key_equals{}); + })); } SECTION("Cannot find any key in an empty hash map") @@ -237,7 +235,7 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", REQUIRE(cuco::test::all_of( insert_pairs, insert_pairs + num, - [view] __device__(cuco::pair_type const& pair) mutable { + [view] __device__(cuco::pair const& pair) mutable { return view.find(pair.first, hash_custom_key{}, custom_key_equals{}) == view.end(); })); } @@ -246,9 +244,7 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", { auto const view = map.get_device_view(); REQUIRE(cuco::test::all_of( - insert_pairs, - insert_pairs + num, - [view] __device__(cuco::pair_type const& pair) { + insert_pairs, insert_pairs + num, [view] __device__(cuco::pair const& pair) { return view.find(pair.first, hash_custom_key{}, custom_key_equals{}) == view.end(); })); } diff --git a/tests/static_map/duplicate_keys_test.cu b/tests/static_map/duplicate_keys_test.cu index 34a315a1c..5620fa4e9 100644 --- a/tests/static_map/duplicate_keys_test.cu +++ b/tests/static_map/duplicate_keys_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,7 +27,7 @@ #include #include -#include +#include TEMPLATE_TEST_CASE_SIG("Duplicate keys", "", @@ -39,7 +39,7 @@ TEMPLATE_TEST_CASE_SIG("Duplicate keys", { constexpr std::size_t num_keys{500'000}; cuco::static_map map{ - num_keys * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); @@ -49,7 +49,7 @@ TEMPLATE_TEST_CASE_SIG("Duplicate keys", auto pairs_begin = thrust::make_transform_iterator( thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i / 2, i / 2); }); + [] __device__(auto i) { return cuco::pair(i / 2, i / 2); }); thrust::device_vector d_results(num_keys); thrust::device_vector d_contained(num_keys); diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index b5641539c..26cbd3fd3 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,16 +14,18 @@ * limitations under the License. 
*/ -#include +#include + +#include + #include #include +#include #include #include #include -#include - -#include +#include TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t)) { @@ -33,10 +35,8 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t)) constexpr std::size_t num_keys = 1'000'000; constexpr std::size_t capacity = 1'100'000; - cuco::static_map map{capacity, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}, - cuco::sentinel::erased_key{-2}}; + cuco::static_map map{ + capacity, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); @@ -60,9 +60,7 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t)) map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - REQUIRE(cuco::test::none_of(d_keys_exist.begin(), - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); map.insert(pairs_begin, pairs_begin + num_keys); @@ -70,20 +68,16 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t)) map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - REQUIRE(cuco::test::all_of(d_keys_exist.begin(), - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::all_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); map.erase(d_keys.begin(), d_keys.begin() + num_keys / 2); map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - REQUIRE(cuco::test::none_of(d_keys_exist.begin(), - d_keys_exist.begin() + num_keys / 2, - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::none_of( + d_keys_exist.begin(), d_keys_exist.begin() + num_keys / 2, thrust::identity{})); - REQUIRE(cuco::test::all_of(d_keys_exist.begin() 
+ num_keys / 2, - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::all_of( + d_keys_exist.begin() + num_keys / 2, d_keys_exist.end(), thrust::identity{})); map.erase(d_keys.begin() + num_keys / 2, d_keys.end()); REQUIRE(map.get_size() == 0); diff --git a/tests/static_map/heterogeneous_lookup_test.cu b/tests/static_map/heterogeneous_lookup_test.cu index 766fa9e1f..e842612b1 100644 --- a/tests/static_map/heterogeneous_lookup_test.cu +++ b/tests/static_map/heterogeneous_lookup_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ #include #include -#include +#include #include @@ -96,13 +96,12 @@ TEMPLATE_TEST_CASE("Heterogeneous lookup", constexpr std::size_t num = 100; constexpr std::size_t capacity = num * 2; - cuco::static_map map{capacity, - cuco::sentinel::empty_key{sentinel_key}, - cuco::sentinel::empty_value{sentinel_value}}; + cuco::static_map map{ + capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; - auto insert_pairs = thrust::make_transform_iterator( - thrust::counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto insert_pairs = + thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); auto probe_keys = thrust::make_transform_iterator(thrust::counting_iterator(0), [] __device__(auto i) { return ProbeKey(i); }); diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index ec3339c4f..5784f786f 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -1,6 +1,6 @@ /* * Copyright (c) 2022, Jonas Hahnfeld, CERN. - * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ #include #include -#include +#include static constexpr int Iters = 10'000; @@ -59,14 +59,14 @@ TEMPLATE_TEST_CASE_SIG("Parallel insert-or-update", (int64_t, int32_t), (int64_t, int64_t)) { - cuco::sentinel::empty_key empty_key_sentinel{-1}; - cuco::sentinel::empty_value empty_value_sentinel{-1}; + cuco::empty_key empty_key_sentinel{-1}; + cuco::empty_value empty_value_sentinel{-1}; cuco::static_map m(10 * Iters, empty_key_sentinel, empty_value_sentinel); static constexpr int Blocks = 1024; static constexpr int Threads = 128; parallel_sum<<>>(m.get_device_mutable_view()); - cudaDeviceSynchronize(); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); thrust::device_vector d_keys(Iters); thrust::device_vector d_values(Iters); diff --git a/tests/static_map/insert_or_assign_test.cu b/tests/static_map/insert_or_assign_test.cu new file mode 100644 index 000000000..90c6553ce --- /dev/null +++ b/tests/static_map/insert_or_assign_test.cu @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +using size_type = std::size_t; + +template +__inline__ void test_insert_or_assign(Map& map, size_type num_keys) +{ + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + + // Insert pairs + auto pairs_begin = + thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); + + auto const initial_size = map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(initial_size == num_keys); // all keys should be inserted + + // Query pairs have the same keys but different payloads + auto query_pairs_begin = thrust::make_transform_iterator( + thrust::counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i * 2); }); + + map.insert_or_assign(query_pairs_begin, query_pairs_begin + num_keys); + + auto const updated_size = map.size(); + // all keys are present in the map so the size shouldn't change + REQUIRE(updated_size == initial_size); + + thrust::device_vector d_keys(num_keys); + thrust::device_vector d_values(num_keys); + map.retrieve_all(d_keys.begin(), d_values.begin()); + + auto gold_values_begin = thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return i * 2; }); + + thrust::sort(thrust::device, d_values.begin(), d_values.end()); + REQUIRE(cuco::test::equal( + d_values.begin(), d_values.end(), gold_values_begin, thrust::equal_to{})); +} + +TEMPLATE_TEST_CASE_SIG( + "Insert or assign", + "", + ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize), + Key, + Value, + Probe, + CGSize), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int32_t, 
cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr size_type num_keys{400}; + + using probe = + std::conditional_t>, + cuco::experimental::double_hashing, + cuco::murmurhash3_32>>; + + auto map = cuco::experimental::static_map, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::experimental::storage<2>>{ + num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + test_insert_or_assign(map, num_keys); +} diff --git a/tests/static_map/key_sentinel_test.cu b/tests/static_map/key_sentinel_test.cu index e52c1405e..74a1badd1 100644 --- a/tests/static_map/key_sentinel_test.cu +++ b/tests/static_map/key_sentinel_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,7 +22,7 @@ #include #include -#include +#include #define SIZE 10 __device__ int A[SIZE]; @@ -40,7 +40,7 @@ TEMPLATE_TEST_CASE_SIG( constexpr std::size_t num_keys{SIZE}; cuco::static_map map{ - SIZE * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + SIZE * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; auto m_view = map.get_device_mutable_view(); auto view = map.get_device_view(); @@ -49,21 +49,21 @@ TEMPLATE_TEST_CASE_SIG( for (int i = 0; i < SIZE; i++) { h_A[i] = i; } - cudaMemcpyToSymbol(A, h_A, SIZE * sizeof(int)); + CUCO_CUDA_TRY(cudaMemcpyToSymbol(A, h_A, SIZE * sizeof(int))); - auto pairs_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto pairs_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); SECTION( "Tests of non-CG insert: The custom `key_equal` can never be used to compare against sentinel") { - REQUIRE(cuco::test::all_of( - pairs_begin, - pairs_begin + num_keys, - [m_view] __device__(cuco::pair_type const& pair) mutable { - return m_view.insert(pair, cuco::detail::MurmurHash3_32{}, custom_equals{}); - })); + REQUIRE(cuco::test::all_of(pairs_begin, + pairs_begin + num_keys, + [m_view] __device__(cuco::pair const& pair) mutable { + return m_view.insert( + pair, cuco::default_hash_function{}, custom_equals{}); + })); } SECTION( @@ -71,16 +71,14 @@ TEMPLATE_TEST_CASE_SIG( { map.insert(pairs_begin, pairs_begin + num_keys, - cuco::detail::MurmurHash3_32{}, + cuco::default_hash_function{}, custom_equals{}); // All keys inserted via custom `key_equal` should be found - REQUIRE(cuco::test::all_of(pairs_begin, - pairs_begin + num_keys, - [view] __device__(cuco::pair_type const& pair) { - auto const found = view.find(pair.first); - return (found != view.end()) and - (found->first.load() == pair.first and - found->second.load() == pair.second); - })); + 
REQUIRE(cuco::test::all_of( + pairs_begin, pairs_begin + num_keys, [view] __device__(cuco::pair const& pair) { + auto const found = view.find(pair.first); + return (found != view.end()) and + (found->first.load() == pair.first and found->second.load() == pair.second); + })); } } diff --git a/tests/static_map/shared_memory_test.cu b/tests/static_map/shared_memory_test.cu index 67ae88d88..444f1c7e7 100644 --- a/tests/static_map/shared_memory_test.cu +++ b/tests/static_map/shared_memory_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ #include #include -#include +#include #include @@ -95,7 +95,7 @@ TEMPLATE_TEST_CASE_SIG("Shared memory static map", std::vector> maps; for (std::size_t map_id = 0; map_id < number_of_maps; ++map_id) { maps.push_back(std::make_unique( - map_capacity, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1})); + map_capacity, cuco::empty_key{-1}, cuco::empty_value{-1})); } thrust::device_vector d_keys_exist(number_of_maps * elements_in_map); @@ -148,9 +148,7 @@ TEMPLATE_TEST_CASE_SIG("Shared memory static map", d_keys_exist.data().get(), d_keys_and_values_correct.data().get()); - REQUIRE(cuco::test::none_of(d_keys_exist.begin(), - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); } } @@ -161,11 +159,8 @@ __global__ void shared_memory_hash_table_kernel(bool* key_found) using map_type = typename cuco::static_map::device_mutable_view; using find_map_type = typename cuco::static_map::device_view; __shared__ typename map_type::slot_type slots[N]; - auto map = map_type::make_from_uninitialized_slots(cg::this_thread_block(), - &slots[0], - N, - cuco::sentinel::empty_key{-1}, - 
cuco::sentinel::empty_value{-1}); + auto map = map_type::make_from_uninitialized_slots( + cg::this_thread_block(), &slots[0], N, cuco::empty_key{-1}, cuco::empty_value{-1}); auto g = cg::this_thread_block(); std::size_t index = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/tests/static_map/stream_test.cu b/tests/static_map/stream_test.cu index 5f816410e..6121cbd62 100644 --- a/tests/static_map/stream_test.cu +++ b/tests/static_map/stream_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ #include #include -#include +#include TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", "", @@ -38,12 +38,12 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", (int64_t, int64_t)) { cudaStream_t stream; - cudaStreamCreate(&stream); + CUCO_CUDA_TRY(cudaStreamCreate(&stream)); constexpr std::size_t num_keys{500'000}; cuco::static_map map{1'000'000, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}, + cuco::empty_key{-1}, + cuco::empty_value{-1}, cuco::cuda_allocator{}, stream}; @@ -53,11 +53,11 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); thrust::sequence(thrust::device, d_values.begin(), d_values.end()); - auto pairs_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto pairs_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); - auto hash_fn = cuco::detail::MurmurHash3_32{}; + auto hash_fn = cuco::default_hash_function{}; auto equal_fn = thrust::equal_to{}; // bulk function test cases @@ -67,7 +67,6 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys 
on given stream", map.insert(pairs_begin, pairs_begin + num_keys, hash_fn, equal_fn, stream); map.find(d_keys.begin(), d_keys.end(), d_results.begin(), hash_fn, equal_fn, stream); - // cudaStreamSynchronize(stream); auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin())); REQUIRE(cuco::test::all_of( @@ -87,5 +86,5 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{}, stream)); } - cudaStreamDestroy(stream); + CUCO_CUDA_TRY(cudaStreamDestroy(stream)); } diff --git a/tests/static_map/unique_sequence_test.cu b/tests/static_map/unique_sequence_test.cu index 75bb67d61..6a0165cc2 100644 --- a/tests/static_map/unique_sequence_test.cu +++ b/tests/static_map/unique_sequence_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -26,9 +26,10 @@ #include #include #include +#include #include -#include +#include TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", @@ -40,7 +41,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", { constexpr std::size_t num_keys{500'000}; cuco::static_map map{ - 1'000'000, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + 1'000'000, cuco::empty_key{-1}, cuco::empty_value{-1}}; auto m_view = map.get_device_mutable_view(); auto view = map.get_device_view(); @@ -51,9 +52,9 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); thrust::sequence(thrust::device, d_values.begin(), d_values.end()); - auto pairs_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto pairs_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); thrust::device_vector d_results(num_keys); thrust::device_vector d_contained(num_keys); @@ -87,68 +88,217 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", SECTION("Inserting unique keys should return insert success.") { - REQUIRE( - cuco::test::all_of(pairs_begin, - pairs_begin + num_keys, - [m_view] __device__(cuco::pair_type const& pair) mutable { - return m_view.insert(pair); - })); + REQUIRE(cuco::test::all_of(pairs_begin, + pairs_begin + num_keys, + [m_view] __device__(cuco::pair const& pair) mutable { + return m_view.insert(pair); + })); } SECTION("Cannot find any key in an empty hash map with non-const view") { SECTION("non-const view") - { - REQUIRE( - cuco::test::all_of(pairs_begin, - pairs_begin + num_keys, - [view] __device__(cuco::pair_type const& pair) mutable { - return view.find(pair.first) == view.end(); - })); - } - SECTION("const view") { REQUIRE(cuco::test::all_of(pairs_begin, pairs_begin + num_keys, - [view] __device__(cuco::pair_type const& pair) { + [view] 
__device__(cuco::pair const& pair) mutable { return view.find(pair.first) == view.end(); })); } + SECTION("const view") + { + REQUIRE(cuco::test::all_of( + pairs_begin, pairs_begin + num_keys, [view] __device__(cuco::pair const& pair) { + return view.find(pair.first) == view.end(); + })); + } } SECTION("Keys are all found after inserting many keys.") { // Bulk insert keys - thrust::for_each(thrust::device, - pairs_begin, - pairs_begin + num_keys, - [m_view] __device__(cuco::pair_type const& pair) mutable { - m_view.insert(pair); - }); + thrust::for_each( + thrust::device, + pairs_begin, + pairs_begin + num_keys, + [m_view] __device__(cuco::pair const& pair) mutable { m_view.insert(pair); }); SECTION("non-const view") - { - // All keys should be found - REQUIRE(cuco::test::all_of( - pairs_begin, - pairs_begin + num_keys, - [view] __device__(cuco::pair_type const& pair) mutable { - auto const found = view.find(pair.first); - return (found != view.end()) and - (found->first.load() == pair.first and found->second.load() == pair.second); - })); - } - SECTION("const view") { // All keys should be found REQUIRE(cuco::test::all_of(pairs_begin, pairs_begin + num_keys, - [view] __device__(cuco::pair_type const& pair) { + [view] __device__(cuco::pair const& pair) mutable { auto const found = view.find(pair.first); return (found != view.end()) and (found->first.load() == pair.first and found->second.load() == pair.second); })); } + SECTION("const view") + { + // All keys should be found + REQUIRE(cuco::test::all_of( + pairs_begin, pairs_begin + num_keys, [view] __device__(cuco::pair const& pair) { + auto const found = view.find(pair.first); + return (found != view.end()) and + (found->first.load() == pair.first and found->second.load() == pair.second); + })); + } + } +} + +using size_type = int32_t; + +template +__inline__ void test_unique_sequence(Map& map, size_type num_keys) +{ + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + + 
thrust::device_vector d_keys(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + + auto keys_begin = d_keys.begin(); + auto pairs_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); + thrust::device_vector d_contained(num_keys); + + auto zip_equal = [] __device__(auto const& p) { return thrust::get<0>(p) == thrust::get<1>(p); }; + auto is_even = [] __device__(auto const& i) { return i % 2 == 0; }; + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(map.size() == 0); + + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } + + SECTION("Non-inserted keys have no matches") + { + thrust::device_vector d_results(num_keys); + + map.find(keys_begin, keys_begin + num_keys, d_results.begin()); + auto zip = thrust::make_zip_iterator(thrust::make_tuple( + d_results.begin(), thrust::constant_iterator{map.empty_key_sentinel()})); + + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } + + SECTION("All conditionally inserted keys should be contained") + { + auto const inserted = map.insert_if( + pairs_begin, pairs_begin + num_keys, thrust::counting_iterator(0), is_even); + REQUIRE(inserted == num_keys / 2); + REQUIRE(map.size() == num_keys / 2); + + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::equal(d_contained.begin(), + d_contained.end(), + thrust::counting_iterator(0), + [] __device__(auto const& idx_contained, auto const& idx) { + return ((idx % 2) == 0) == idx_contained; + })); + } + + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + + SECTION("All inserted keys should be contained.") + { + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), 
thrust::identity{})); + } + + SECTION("Conditional contains should return true on even inputs.") + { + map.contains_if(keys_begin, + keys_begin + num_keys, + thrust::counting_iterator(0), + is_even, + d_contained.begin()); + auto gold_iter = + thrust::make_transform_iterator(thrust::counting_iterator(0), is_even); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_contained.begin(), gold_iter)); + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } + + SECTION("All inserted keys should be correctly recovered during find") + { + thrust::device_vector d_results(num_keys); + + map.find(keys_begin, keys_begin + num_keys, d_results.begin()); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), keys_begin)); + + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); } + + SECTION("All inserted key-values should be properly retrieved") + { + thrust::device_vector d_values(num_keys); + + auto const [keys_end, values_end] = map.retrieve_all(keys_begin, d_values.begin()); + REQUIRE(std::distance(keys_begin, keys_end) == num_keys); + REQUIRE(std::distance(d_values.begin(), values_end) == num_keys); + + thrust::sort(thrust::device, d_values.begin(), values_end); + REQUIRE(cuco::test::equal(d_values.begin(), + values_end, + thrust::make_counting_iterator(0), + thrust::equal_to{})); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Unique sequence", + "", + ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize), + Key, + Value, + Probe, + CGSize), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 
2), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr size_type num_keys{400}; + constexpr size_type gold_capacity = CGSize == 1 ? 422 // 211 x 1 x 2 + : 412; // 103 x 2 x 2 + + using probe = + std::conditional_t>, + cuco::experimental::double_hashing, + cuco::murmurhash3_32>>; + + auto map = cuco::experimental::static_map, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::experimental::storage<2>>{ + num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + REQUIRE(map.capacity() == gold_capacity); + + test_unique_sequence(map, num_keys); } diff --git a/tests/static_multimap/custom_pair_retrieve_test.cu b/tests/static_multimap/custom_pair_retrieve_test.cu index 5d0329382..7856b9e20 100644 --- a/tests/static_multimap/custom_pair_retrieve_test.cu +++ b/tests/static_multimap/custom_pair_retrieve_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -28,15 +28,15 @@ #include #include -#include +#include #include // Custom pair equal template struct pair_equal { - __device__ bool operator()(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) const + __device__ bool operator()(const cuco::pair& lhs, + const cuco::pair& rhs) const { return lhs.first == rhs.first; } @@ -86,7 +86,7 @@ void test_non_shmem_pair_retrieve(Map& map, std::size_t const num_pairs) using Key = typename Map::key_type; using Value = typename Map::mapped_type; - thrust::device_vector> d_pairs(num_pairs); + thrust::device_vector> d_pairs(num_pairs); // pair multiplicity = 2 thrust::transform(thrust::device, @@ -94,7 +94,7 @@ void test_non_shmem_pair_retrieve(Map& map, std::size_t const num_pairs) thrust::counting_iterator(num_pairs), d_pairs.begin(), [] __device__(auto i) { - return cuco::pair_type{i / 2, i}; + return cuco::pair{i / 2, i}; }); auto pair_begin = d_pairs.begin(); @@ -107,7 +107,7 @@ void test_non_shmem_pair_retrieve(Map& map, std::size_t const num_pairs) thrust::counting_iterator(num_pairs), pair_begin, [] __device__(auto i) { - return cuco::pair_type{i, i}; + return cuco::pair{i, i}; }); // create an array of prefix sum @@ -196,19 +196,11 @@ TEMPLATE_TEST_CASE_SIG( { constexpr std::size_t num_pairs{200}; - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{ - num_pairs * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_non_shmem_pair_retrieve(map, num_pairs); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{ - num_pairs * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_non_shmem_pair_retrieve(map, num_pairs); - } + using probe = std::conditional_t>, + cuco::double_hashing<8, cuco::default_hash_function>>; + + cuco::static_multimap, probe> + map{num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; + 
test_non_shmem_pair_retrieve(map, num_pairs); } diff --git a/tests/static_multimap/custom_type_test.cu b/tests/static_multimap/custom_type_test.cu index 40bdbe8ba..f53719205 100644 --- a/tests/static_multimap/custom_type_test.cu +++ b/tests/static_multimap/custom_type_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ #include #include -#include +#include #include @@ -39,7 +39,10 @@ struct key_pair { }; struct hash_key_pair { - __device__ uint32_t operator()(key_pair k) const { return k.a; }; + __host__ __device__ hash_key_pair() : hash_key_pair{0} {} + __host__ __device__ hash_key_pair(uint32_t offset) : offset_(offset) {} + __device__ uint32_t operator()(key_pair k) const { return k.a + offset_; }; + uint32_t offset_; }; struct key_pair_equals { @@ -95,7 +98,7 @@ __inline__ void test_custom_key_value_type(Map& map, std::size_t num_pairs) auto count = map.count(key_begin, key_begin + num_pairs, stream, key_pair_equals{}); REQUIRE(count == num_pairs); - thrust::device_vector> found_pairs(num_pairs); + thrust::device_vector> found_pairs(num_pairs); auto output_end = map.retrieve( key_begin, key_begin + num_pairs, found_pairs.begin(), stream, key_pair_equals{}); std::size_t const size = std::distance(found_pairs.begin(), output_end); @@ -107,16 +110,17 @@ __inline__ void test_custom_key_value_type(Map& map, std::size_t num_pairs) thrust::device, found_pairs.begin(), found_pairs.end(), - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { return lhs.first.a < rhs.first.a; }); - - REQUIRE(cuco::test::equal( - pair_begin, - pair_begin + num_pairs, - found_pairs.begin(), - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first.a == rhs.first.a; - })); + [] __device__(const cuco::pair& lhs, const 
cuco::pair& rhs) { + return lhs.first.a < rhs.first.a; + }); + + REQUIRE( + cuco::test::equal(pair_begin, + pair_begin + num_pairs, + found_pairs.begin(), + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first.a == rhs.first.a; + })); } SECTION("Non-matches are not included in the output") @@ -138,7 +142,7 @@ __inline__ void test_custom_key_value_type(Map& map, std::size_t num_pairs) auto count = map.count(query_key_begin, query_key_begin + num, stream, key_pair_equals{}); REQUIRE(count == num_pairs); - thrust::device_vector> found_pairs(num_pairs); + thrust::device_vector> found_pairs(num_pairs); auto output_end = map.retrieve( query_key_begin, query_key_begin + num, found_pairs.begin(), stream, key_pair_equals{}); std::size_t const size = std::distance(found_pairs.begin(), output_end); @@ -150,15 +154,16 @@ __inline__ void test_custom_key_value_type(Map& map, std::size_t num_pairs) thrust::device, found_pairs.begin(), found_pairs.end(), - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { return lhs.first.a < rhs.first.a; }); - REQUIRE(cuco::test::equal( - pair_begin, - pair_begin + num_pairs, - found_pairs.begin(), - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first.a == rhs.first.a; - })); + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + return lhs.first.a < rhs.first.a; + }); + REQUIRE( + cuco::test::equal(pair_begin, + pair_begin + num_pairs, + found_pairs.begin(), + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first.a == rhs.first.a; + })); } SECTION("Outer functions include non-matches in the output") @@ -180,7 +185,7 @@ __inline__ void test_custom_key_value_type(Map& map, std::size_t num_pairs) map.count_outer(query_key_begin, query_key_begin + num, stream, key_pair_equals{}); REQUIRE(count_outer == num); - thrust::device_vector> found_pairs(num); + thrust::device_vector> found_pairs(num); auto output_end = map.retrieve_outer( query_key_begin, query_key_begin + 
num, found_pairs.begin(), stream, key_pair_equals{}); std::size_t const size_outer = std::distance(found_pairs.begin(), output_end); @@ -228,21 +233,11 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", constexpr std::size_t num_pairs = 100; constexpr std::size_t capacity = num_pairs * 2; - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, hash_key_pair>> - map{capacity, - cuco::sentinel::empty_key{sentinel_key}, - cuco::sentinel::empty_value{sentinel_value}}; - test_custom_key_value_type(map, num_pairs); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{capacity, - cuco::sentinel::empty_key{sentinel_key}, - cuco::sentinel::empty_value{sentinel_value}}; - test_custom_key_value_type(map, num_pairs); - } + using probe = std::conditional_t, + cuco::double_hashing<8, hash_key_pair, hash_key_pair>>; + + cuco::static_multimap, probe> + map{capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; + test_custom_key_value_type(map, num_pairs); } diff --git a/tests/static_multimap/heterogeneous_lookup_test.cu b/tests/static_multimap/heterogeneous_lookup_test.cu index dca3de826..5a5b8b242 100644 --- a/tests/static_multimap/heterogeneous_lookup_test.cu +++ b/tests/static_multimap/heterogeneous_lookup_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,7 +25,7 @@ #include #include -#include +#include #include @@ -101,13 +101,11 @@ TEMPLATE_TEST_CASE("Heterogeneous lookup", cuda::thread_scope_device, cuco::cuda_allocator, cuco::linear_probing<1, custom_hasher>> - map{capacity, - cuco::sentinel::empty_key{sentinel_key}, - cuco::sentinel::empty_value{sentinel_value}}; + map{capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; - auto insert_pairs = thrust::make_transform_iterator( - thrust::counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto insert_pairs = + thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); auto probe_keys = thrust::make_transform_iterator(thrust::counting_iterator(0), [] __device__(auto i) { return ProbeKey(i); }); diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu index 506563502..5d5648e71 100644 --- a/tests/static_multimap/insert_if_test.cu +++ b/tests/static_multimap/insert_if_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,7 +24,7 @@ #include #include -#include +#include template __inline__ void test_insert_if(Map& map, PairIt pair_begin, KeyIt key_begin, std::size_t size) @@ -55,7 +55,7 @@ TEMPLATE_TEST_CASE_SIG( constexpr std::size_t num_keys{1'000}; thrust::device_vector d_keys(num_keys); - thrust::device_vector> d_pairs(num_keys); + thrust::device_vector> d_pairs(num_keys); thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); // multiplicity = 1 @@ -64,21 +64,14 @@ TEMPLATE_TEST_CASE_SIG( thrust::counting_iterator(num_keys), d_pairs.begin(), [] __device__(auto i) { - return cuco::pair_type{i, i}; + return cuco::pair{i, i}; }); - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{num_keys * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{ - num_keys * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); - } + using probe = std::conditional_t>, + cuco::double_hashing<8, cuco::default_hash_function>>; + + cuco::static_multimap, probe> + map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; + test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); } diff --git a/tests/static_multimap/multiplicity_test.cu b/tests/static_multimap/multiplicity_test.cu index 3f5581b03..5de83a042 100644 --- a/tests/static_multimap/multiplicity_test.cu +++ b/tests/static_multimap/multiplicity_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,7 +27,7 @@ #include #include -#include +#include template __inline__ void test_multiplicity_two(Map& map, std::size_t num_items) @@ -36,7 +36,7 @@ __inline__ void test_multiplicity_two(Map& map, std::size_t num_items) using Value = typename Map::mapped_type; thrust::device_vector d_keys(num_items / 2); - thrust::device_vector> d_pairs(num_items); + thrust::device_vector> d_pairs(num_items); thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); // multiplicity = 2 @@ -45,10 +45,10 @@ __inline__ void test_multiplicity_two(Map& map, std::size_t num_items) thrust::counting_iterator(num_items), d_pairs.begin(), [] __device__(auto i) { - return cuco::pair_type{i / 2, i}; + return cuco::pair{i / 2, i}; }); - thrust::device_vector> d_results(num_items); + thrust::device_vector> d_results(num_items); auto key_begin = d_keys.begin(); auto pair_begin = d_pairs.begin(); @@ -91,22 +91,22 @@ __inline__ void test_multiplicity_two(Map& map, std::size_t num_items) REQUIRE(size == num_items); // sort before compare - thrust::sort(thrust::device, - d_results.begin(), - d_results.end(), - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { - if (lhs.first != rhs.first) { return lhs.first < rhs.first; } - return lhs.second < rhs.second; - }); - - REQUIRE(cuco::test::equal( - pair_begin, - pair_begin + num_items, - output_begin, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); + thrust::sort( + thrust::device, + d_results.begin(), + d_results.end(), + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + if (lhs.first != rhs.first) { return lhs.first < rhs.first; } + return lhs.second < rhs.second; + }); + + REQUIRE( + cuco::test::equal(pair_begin, + pair_begin + num_items, + output_begin, + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + })); } SECTION("count and count_outer should return the 
same value.") @@ -129,22 +129,22 @@ __inline__ void test_multiplicity_two(Map& map, std::size_t num_items) REQUIRE(size == size_outer); // sort before compare - thrust::sort(thrust::device, - d_results.begin(), - d_results.end(), - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { - if (lhs.first != rhs.first) { return lhs.first < rhs.first; } - return lhs.second < rhs.second; - }); - - REQUIRE(cuco::test::equal( - pair_begin, - pair_begin + num_items, - output_begin, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); + thrust::sort( + thrust::device, + d_results.begin(), + d_results.end(), + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + if (lhs.first != rhs.first) { return lhs.first < rhs.first; } + return lhs.second < rhs.second; + }); + + REQUIRE( + cuco::test::equal(pair_begin, + pair_begin + num_items, + output_begin, + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + })); } } @@ -161,18 +161,11 @@ TEMPLATE_TEST_CASE_SIG( { constexpr std::size_t num_items{4}; - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{5, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_multiplicity_two(map, num_items); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{ - 5, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_multiplicity_two(map, num_items); - } + using probe = std::conditional_t>, + cuco::double_hashing<8, cuco::default_hash_function>>; + + cuco::static_multimap, probe> + map{5, cuco::empty_key{-1}, cuco::empty_value{-1}}; + test_multiplicity_two(map, num_items); } diff --git a/tests/static_multimap/non_match_test.cu b/tests/static_multimap/non_match_test.cu index ef0042012..94023af56 
100644 --- a/tests/static_multimap/non_match_test.cu +++ b/tests/static_multimap/non_match_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ #include #include -#include +#include template __inline__ void test_non_matches(Map& map, PairIt pair_begin, KeyIt key_begin, std::size_t num_keys) @@ -39,77 +39,77 @@ __inline__ void test_non_matches(Map& map, PairIt pair_begin, KeyIt key_begin, s SECTION("Output of count and retrieve should be coherent.") { auto num = map.count(key_begin, key_begin + num_keys); - thrust::device_vector> d_results(num); + thrust::device_vector> d_results(num); REQUIRE(num == num_keys); - auto output_begin = d_results.data().get(); + auto output_begin = d_results.begin(); auto output_end = map.retrieve(key_begin, key_begin + num_keys, output_begin); std::size_t const size = thrust::distance(output_begin, output_end); REQUIRE(size == num_keys); // sort before compare - thrust::sort(thrust::device, - output_begin, - output_end, - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { - if (lhs.first != rhs.first) { return lhs.first < rhs.first; } - return lhs.second < rhs.second; - }); - - REQUIRE(cuco::test::equal( - pair_begin, - pair_begin + num_keys, + thrust::sort( + thrust::device, output_begin, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); + output_end, + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + if (lhs.first != rhs.first) { return lhs.first < rhs.first; } + return lhs.second < rhs.second; + }); + + REQUIRE( + cuco::test::equal(pair_begin, + pair_begin + num_keys, + output_begin, + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + 
})); } SECTION("Output of count_outer and retrieve_outer should be coherent.") { auto num = map.count_outer(key_begin, key_begin + num_keys); - thrust::device_vector> d_results(num); + thrust::device_vector> d_results(num); REQUIRE(num == (num_keys + num_keys / 2)); - auto output_begin = d_results.data().get(); + auto output_begin = d_results.begin(); auto output_end = map.retrieve_outer(key_begin, key_begin + num_keys, output_begin); std::size_t const size = thrust::distance(output_begin, output_end); REQUIRE(size == (num_keys + num_keys / 2)); // sort before compare - thrust::sort(thrust::device, - output_begin, - output_end, - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { - if (lhs.first != rhs.first) { return lhs.first < rhs.first; } - return lhs.second < rhs.second; - }); + thrust::sort( + thrust::device, + output_begin, + output_end, + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + if (lhs.first != rhs.first) { return lhs.first < rhs.first; } + return lhs.second < rhs.second; + }); // create gold reference - thrust::device_vector> gold(size); + thrust::device_vector> gold(size); auto gold_begin = gold.begin(); thrust::transform(thrust::device, thrust::counting_iterator(0), thrust::counting_iterator(size), gold_begin, [num_keys] __device__(auto i) { - if (i < num_keys) { return cuco::pair_type{i / 2, i}; } - return cuco::pair_type{i - num_keys / 2, -1}; + if (i < num_keys) { return cuco::pair{i / 2, i}; } + return cuco::pair{i - num_keys / 2, -1}; }); - REQUIRE(cuco::test::equal( - gold_begin, - gold_begin + size, - output_begin, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); + REQUIRE( + cuco::test::equal(gold_begin, + gold_begin + size, + output_begin, + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + })); } } @@ -127,7 +127,7 @@ TEMPLATE_TEST_CASE_SIG( constexpr 
std::size_t num_keys{1'000}; thrust::device_vector d_keys(num_keys); - thrust::device_vector> d_pairs(num_keys); + thrust::device_vector> d_pairs(num_keys); thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); // multiplicity = 2 @@ -136,21 +136,18 @@ TEMPLATE_TEST_CASE_SIG( thrust::counting_iterator(num_keys), d_pairs.begin(), [] __device__(auto i) { - return cuco::pair_type{i / 2, i}; + return cuco::pair{i / 2, i}; }); - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{num_keys * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{ - num_keys * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); - } + using probe = std::conditional_t>, + cuco::double_hashing<8, cuco::default_hash_function>>; + + cuco::static_multimap, + cuco::linear_probing<1, cuco::default_hash_function>> + map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; + test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); } diff --git a/tests/static_multimap/pair_function_test.cu b/tests/static_multimap/pair_function_test.cu index c5442533b..3ef49377d 100644 --- a/tests/static_multimap/pair_function_test.cu +++ b/tests/static_multimap/pair_function_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,13 +27,13 @@ #include #include -#include +#include // Custom pair equal template struct pair_equal { - __device__ bool operator()(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) const + __device__ bool operator()(const cuco::pair& lhs, + const cuco::pair& rhs) const { return lhs.first == rhs.first; } @@ -43,7 +43,7 @@ template __inline__ void test_pair_functions(Map& map, PairIt pair_begin, std::size_t num_pairs) { map.insert(pair_begin, pair_begin + num_pairs); - cudaStreamSynchronize(0); + CUCO_CUDA_TRY(cudaStreamSynchronize(0)); auto res = map.get_size(); REQUIRE(res == num_pairs); @@ -54,7 +54,7 @@ __inline__ void test_pair_functions(Map& map, PairIt pair_begin, std::size_t num thrust::counting_iterator(num_pairs), pair_begin, [] __device__(auto i) { - return cuco::pair_type{i, i}; + return cuco::pair{i, i}; }); SECTION("pair_contains returns true for all inserted pairs and false for non-inserted ones.") @@ -121,7 +121,7 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int64_t, cuco::test::probe_sequence::double_hashing)) { constexpr std::size_t num_pairs{4}; - thrust::device_vector> d_pairs(num_pairs); + thrust::device_vector> d_pairs(num_pairs); // pair multiplicity = 2 thrust::transform(thrust::device, @@ -129,22 +129,14 @@ TEMPLATE_TEST_CASE_SIG( thrust::counting_iterator(num_pairs), d_pairs.begin(), [] __device__(auto i) { - return cuco::pair_type{i / 2, i}; + return cuco::pair{i / 2, i}; }); - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{ - num_pairs * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_pair_functions(map, d_pairs.begin(), num_pairs); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{ - num_pairs * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_pair_functions(map, d_pairs.begin(), num_pairs); - } + using probe = 
std::conditional_t>, + cuco::double_hashing<8, cuco::default_hash_function>>; + + cuco::static_multimap, probe> + map{num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; + test_pair_functions(map, d_pairs.begin(), num_pairs); } diff --git a/tests/static_set/capacity_test.cu b/tests/static_set/capacity_test.cu new file mode 100644 index 000000000..4c66a7ccc --- /dev/null +++ b/tests/static_set/capacity_test.cu @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +TEST_CASE("Static set capacity", "") +{ + using Key = int32_t; + using ProbeT = cuco::experimental::double_hashing<1, cuco::default_hash_function>; + using Equal = thrust::equal_to; + using AllocatorT = cuco::cuda_allocator; + using StorageT = cuco::experimental::storage<2>; + + SECTION("zero capacity is allowed.") + { + auto constexpr gold_capacity = 4; + + using extent_type = cuco::experimental::extent; + cuco::experimental:: + static_set + set{extent_type{}, cuco::empty_key{-1}}; + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } + + SECTION("negative capacity (ikr -_-||) is also allowed.") + { + auto constexpr gold_capacity = 4; + + using extent_type = cuco::experimental::extent; + cuco::experimental:: + static_set + set{extent_type{-10}, cuco::empty_key{-1}}; + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } + + constexpr std::size_t num_keys{400}; + + SECTION("Dynamic extent is evaluated at run time.") + { + auto constexpr gold_capacity = 422; // 211 x 2 + + using extent_type = cuco::experimental::extent; + cuco::experimental:: + static_set + set{num_keys, cuco::empty_key{-1}}; + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } + + SECTION("Dynamic extent is evaluated at run time.") + { + auto constexpr gold_capacity = 412; // 103 x 2 x 2 + + using probe = cuco::experimental::linear_probing<2, cuco::default_hash_function>; + auto set = cuco::experimental::static_set, + cuda::thread_scope_device, + Equal, + probe, + AllocatorT, + 
StorageT>{num_keys, cuco::empty_key{-1}}; + + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } +} diff --git a/tests/static_set/heterogeneous_lookup_test.cu b/tests/static_set/heterogeneous_lookup_test.cu new file mode 100644 index 000000000..cbc0efac3 --- /dev/null +++ b/tests/static_set/heterogeneous_lookup_test.cu @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +// insert key type +template +struct key_pair { + T a; + T b; + + __host__ __device__ key_pair() {} + __host__ __device__ key_pair(T x) : a{x}, b{x} {} + + // Device equality operator is mandatory due to libcudacxx bug: + // https://github.com/NVIDIA/libcudacxx/issues/223 + __device__ bool operator==(key_pair const& other) const { return a == other.a and b == other.b; } +}; + +// probe key type +template +struct key_triplet { + T a; + T b; + T c; + + __host__ __device__ key_triplet() {} + __host__ __device__ key_triplet(T x) : a{x}, b{x}, c{x} {} + + // Device equality operator is mandatory due to libcudacxx bug: + // https://github.com/NVIDIA/libcudacxx/issues/223 + __device__ bool operator==(key_triplet const& other) const + { + return a == other.a and b == other.b and c == other.c; + } +}; + +// User-defined device hasher +struct custom_hasher { + template + __device__ uint32_t operator()(CustomKey const& k) const + { + return thrust::raw_reference_cast(k).a; + }; +}; + +// User-defined device key equality +struct custom_key_equal { + template + __device__ bool operator()(LHS const& lhs, RHS const& rhs) const + { + return thrust::raw_reference_cast(lhs).a == thrust::raw_reference_cast(rhs).a; + } +}; + +TEMPLATE_TEST_CASE_SIG( + "Heterogeneous lookup", "", ((typename T, int CGSize), T, CGSize), (int32_t, 1), (int32_t, 2)) +{ + using Key = key_pair; + using ProbeKey = key_triplet; + using probe_type = cuco::experimental::double_hashing; + + auto const sentinel_key = Key{-1}; + + constexpr std::size_t num = 100; + constexpr std::size_t capacity = num * 2; + auto const probe = probe_type{custom_hasher{}, custom_hasher{}}; + auto my_set = cuco::experimental::static_set, + cuda::thread_scope_device, + custom_key_equal, + probe_type>{ + capacity, cuco::empty_key{sentinel_key}, custom_key_equal{}, probe}; + + auto insert_pairs = 
thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return Key{i}; }); + auto probe_keys = thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return ProbeKey(i); }); + + SECTION("All inserted keys should be contained") + { + thrust::device_vector contained(num); + my_set.insert(insert_pairs, insert_pairs + num); + my_set.contains(probe_keys, probe_keys + num, contained.begin()); + REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), thrust::identity{})); + } + + SECTION("Non-inserted keys should not be contained") + { + thrust::device_vector contained(num); + my_set.contains(probe_keys, probe_keys + num, contained.begin()); + REQUIRE(cuco::test::none_of(contained.begin(), contained.end(), thrust::identity{})); + } +} diff --git a/tests/static_set/insert_and_find_test.cu b/tests/static_set/insert_and_find_test.cu new file mode 100644 index 000000000..278510e08 --- /dev/null +++ b/tests/static_set/insert_and_find_test.cu @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include + +#include + +template +__inline__ void test_insert_and_find(Set& set, std::size_t num_keys) +{ + using Key = typename Set::key_type; + static auto constexpr cg_size = Set::cg_size; + + auto const keys_begin = [&]() { + if constexpr (cg_size == 1) { + return thrust::counting_iterator(0); + } else { + return thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return i / cg_size; }); + } + }(); + auto const keys_end = [&]() { + if constexpr (cg_size == 1) { + return keys_begin + num_keys; + } else { + return keys_begin + num_keys * cg_size; + } + }(); + + auto ref = set.ref(cuco::experimental::op::insert_and_find); + + REQUIRE(cuco::test::all_of(keys_begin, keys_end, [ref] __device__(Key key) mutable { + auto [iter, inserted] = [&]() { + if constexpr (cg_size == 1) { + return ref.insert_and_find(key); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + return ref.insert_and_find(tile, key); + } + }(); + return inserted == true; + })); + + SECTION("Inserting elements for the second time will always fail.") + { + REQUIRE(cuco::test::all_of(keys_begin, keys_end, [ref] __device__(Key key) mutable { + auto [iter, inserted] = [&]() { + if constexpr (cg_size == 1) { + return ref.insert_and_find(key); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + return ref.insert_and_find(tile, key); + } + }(); + return inserted == false and key == *iter; + })); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Insert and find", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, 
cuco::test::probe_sequence::linear_probing, 1), + (int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr std::size_t num_keys{400}; + + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::experimental::linear_probing>, + cuco::experimental::double_hashing>>; + + auto set = cuco::experimental::static_set, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::experimental::storage<2>>{ + num_keys, cuco::empty_key{-1}}; + test_insert_and_find(set, num_keys); +} diff --git a/tests/static_set/large_input_test.cu b/tests/static_set/large_input_test.cu new file mode 100644 index 000000000..5015ca750 --- /dev/null +++ b/tests/static_set/large_input_test.cu @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +template +__inline__ void test_unique_sequence(Set& set, bool* res_begin, std::size_t num_keys) +{ + using Key = typename Set::key_type; + + auto const keys_begin = thrust::counting_iterator(0); + auto const keys_end = thrust::counting_iterator(num_keys); + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(set.size() == 0); + + set.contains(keys_begin, keys_end, res_begin); + REQUIRE(cuco::test::none_of(res_begin, res_begin + num_keys, thrust::identity{})); + } + + set.insert(keys_begin, keys_end); + REQUIRE(set.size() == num_keys); + + SECTION("All inserted key/value pairs should be contained.") + { + set.contains(keys_begin, keys_end, res_begin); + REQUIRE(cuco::test::all_of(res_begin, res_begin + num_keys, thrust::identity{})); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Large input", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2)) +{ + constexpr std::size_t num_keys{1'200'000'000}; + + using extent_type = cuco::experimental::extent; + using probe = cuco::experimental::double_hashing>; + + try { + auto set = cuco::experimental:: + static_set, probe>{ + num_keys * 2, cuco::empty_key{-1}}; + + thrust::device_vector d_contained(num_keys); + test_unique_sequence(set, d_contained.data().get(), num_keys); + } catch (cuco::cuda_error&) { + SKIP("Out of memory"); + } catch (std::bad_alloc&) { + SKIP("Out of memory"); + } +} diff --git a/tests/static_set/retrieve_all_test.cu b/tests/static_set/retrieve_all_test.cu new file mode 100644 index 000000000..616e35138 --- /dev/null +++ b/tests/static_set/retrieve_all_test.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 
2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +template +__inline__ void test_unique_sequence(Set& set, std::size_t num_keys) +{ + using Key = typename Set::key_type; + + thrust::device_vector d_keys(num_keys); + thrust::sequence(d_keys.begin(), d_keys.end()); + auto keys_begin = d_keys.begin(); + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(set.size() == 0); + + auto keys_end = set.retrieve_all(keys_begin); + REQUIRE(std::distance(keys_begin, keys_end) == 0); + } + + set.insert(keys_begin, keys_begin + num_keys); + REQUIRE(set.size() == num_keys); + + SECTION("All inserted key/value pairs should be contained.") + { + thrust::device_vector d_res(num_keys); + auto d_res_end = set.retrieve_all(d_res.begin()); + thrust::sort(d_res.begin(), d_res_end); + REQUIRE(cuco::test::equal( + d_res.begin(), d_res_end, thrust::counting_iterator(0), thrust::equal_to{})); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Retrieve all", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, 
cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr std::size_t num_keys{400}; + auto constexpr gold_capacity = CGSize == 1 ? 409 // 409 x 1 x 1 + : 422 // 211 x 2 x 1 + ; + + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::experimental::linear_probing>, + cuco::experimental::double_hashing>>; + + auto set = cuco::experimental::static_set, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::experimental::storage<1>>{ + num_keys, cuco::empty_key{-1}}; + + REQUIRE(set.capacity() == gold_capacity); + + test_unique_sequence(set, num_keys); +} diff --git a/tests/static_set/size_test.cu b/tests/static_set/size_test.cu new file mode 100644 index 000000000..2e2bfd6c2 --- /dev/null +++ b/tests/static_set/size_test.cu @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include + +TEST_CASE("Size computation", "") +{ + constexpr std::size_t num_keys{400}; + + cuco::experimental::static_set set{cuco::experimental::extent{400}, + cuco::empty_key{-1}}; + + thrust::device_vector d_keys(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + + auto const num_successes = set.insert(d_keys.begin(), d_keys.end()); + + REQUIRE(set.size() == num_keys); + REQUIRE(num_successes == num_keys); + + set.clear(); + + REQUIRE(set.size() == 0); +} diff --git a/tests/static_set/unique_sequence_test.cu b/tests/static_set/unique_sequence_test.cu new file mode 100644 index 000000000..53ede7524 --- /dev/null +++ b/tests/static_set/unique_sequence_test.cu @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using size_type = int32_t; + +template +__inline__ void test_unique_sequence(Set& set, size_type num_keys) +{ + using Key = typename Set::key_type; + + thrust::device_vector d_keys(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + + auto keys_begin = d_keys.begin(); + thrust::device_vector d_contained(num_keys); + + auto zip_equal = [] __device__(auto const& p) { return thrust::get<0>(p) == thrust::get<1>(p); }; + auto is_even = [] __device__(auto const& i) { return i % 2 == 0; }; + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(set.size() == 0); + + set.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } + + SECTION("Non-inserted keys have no matches") + { + thrust::device_vector d_results(num_keys); + + set.find(keys_begin, keys_begin + num_keys, d_results.begin()); + auto zip = thrust::make_zip_iterator(thrust::make_tuple( + d_results.begin(), thrust::constant_iterator{set.empty_key_sentinel()})); + + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } + + SECTION("All conditionally inserted keys should be contained") + { + auto const inserted = set.insert_if( + keys_begin, keys_begin + num_keys, thrust::counting_iterator(0), is_even); + REQUIRE(inserted == num_keys / 2); + REQUIRE(set.size() == num_keys / 2); + + set.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::equal(d_contained.begin(), + d_contained.end(), + thrust::counting_iterator(0), + [] __device__(auto const& idx_contained, auto const& idx) { + return ((idx % 2) == 0) == idx_contained; + })); + } + + set.insert(keys_begin, keys_begin + num_keys); + REQUIRE(set.size() == num_keys); + + SECTION("All inserted keys should be contained.") + { + set.contains(keys_begin, 
keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } + + SECTION("Conditional contains should return true on even inputs.") + { + set.contains_if(keys_begin, + keys_begin + num_keys, + thrust::counting_iterator(0), + is_even, + d_contained.begin()); + auto gold_iter = + thrust::make_transform_iterator(thrust::counting_iterator(0), is_even); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_contained.begin(), gold_iter)); + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } + + SECTION("All inserted keys should be correctly recovered during find") + { + thrust::device_vector d_results(num_keys); + + set.find(keys_begin, keys_begin + num_keys, d_results.begin()); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), keys_begin)); + + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Unique sequence", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr size_type num_keys{400}; + constexpr size_type gold_capacity = CGSize == 1 ? 
422 // 211 x 1 x 2 + : 412 // 103 x 2 x 2 + ; + + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::experimental::linear_probing>, + cuco::experimental::double_hashing>>; + + auto set = cuco::experimental::static_set, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::experimental::storage<2>>{ + num_keys, cuco::empty_key{-1}}; + + REQUIRE(set.capacity() == gold_capacity); + + test_unique_sequence(set, num_keys); +} diff --git a/tests/utility/extent_test.cu b/tests/utility/extent_test.cu new file mode 100644 index 000000000..d44e20368 --- /dev/null +++ b/tests/utility/extent_test.cu @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include + +TEMPLATE_TEST_CASE_SIG( + "Extent tests", "", ((typename SizeType), SizeType), (int32_t), (int64_t), (std::size_t)) +{ + SizeType constexpr num = 1234; + SizeType constexpr gold_reference = 314; // 157 x 2 + auto constexpr cg_size = 2; + auto constexpr window_size = 4; + + SECTION("Static extent must be evaluated at compile time.") + { + auto const size = cuco::experimental::extent{}; + STATIC_REQUIRE(num == size); + } + + SECTION("Dynamic extent is evaluated at run time.") + { + auto const size = cuco::experimental::extent(num); + REQUIRE(size == num); + } + + SECTION("Compute static valid extent at compile time.") + { + auto constexpr size = cuco::experimental::extent{}; + auto constexpr res = cuco::experimental::make_window_extent(size); + STATIC_REQUIRE(gold_reference == res.value()); + } + + SECTION("Compute dynamic valid extent at run time.") + { + auto const size = cuco::experimental::extent{num}; + auto const res = cuco::experimental::make_window_extent(size); + REQUIRE(gold_reference == res.value()); + } +} diff --git a/tests/utility/fast_int_test.cu b/tests/utility/fast_int_test.cu new file mode 100644 index 000000000..c780293f9 --- /dev/null +++ b/tests/utility/fast_int_test.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include + +#include +#include + +TEMPLATE_TEST_CASE( + "utility::fast_int tests", "", std::int32_t, std::uint32_t, std::int64_t, std::uint64_t) +{ + TestType value = GENERATE(1, 2, 9, 32, 4123, 8192, 4312456); + TestType lhs = GENERATE(1, 2, 9, 32, 4123, 8192, 4312456); + constexpr auto max_value = std::numeric_limits::max(); + + cuco::utility::fast_int fast_value{value}; + + SECTION("Should be explicitly convertible to the underlying integer type.") + { + REQUIRE(static_cast(fast_value) == value); + } + + SECTION("Fast div/mod should produce correct result.") + { + INFO(lhs << " /% " << value); + REQUIRE(lhs / fast_value == lhs / value); + REQUIRE(lhs % fast_value == lhs % value); + } + + SECTION("Fast div/mod with maximum rhs value should produce correct result.") + { + INFO(lhs << " /% " << max_value); + cuco::utility::fast_int fast_max{max_value}; + REQUIRE(lhs / fast_max == lhs / max_value); + REQUIRE(lhs % fast_max == lhs % max_value); + } + + SECTION("Fast div/mod with maximum lhs value should produce correct result.") + { + INFO(max_value << " /% " << value); + REQUIRE(max_value / fast_value == max_value / value); + REQUIRE(max_value % fast_value == max_value % value); + } +} diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu new file mode 100644 index 000000000..3e8880860 --- /dev/null +++ b/tests/utility/hash_test.cu @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +#include +#include + +#include + +template +struct large_key { + constexpr __host__ __device__ large_key(int32_t value) noexcept + { + for (int32_t i = 0; i < Words; ++i) { + data_[i] = value; + } + } + + private: + int32_t data_[Words]; +}; + +template +__host__ __device__ bool check_hash_result(typename Hash::argument_type const& key, + typename Hash::result_type seed, + typename Hash::result_type expected) noexcept +{ + Hash h(seed); + return (h(key) == expected); +} + +template +__global__ void check_hash_result_kernel_64(OutputIter result) +{ + int i = 0; + + result[i++] = check_hash_result>(0, 0, 16804241149081757544); + result[i++] = check_hash_result>(42, 0, 765293966243412708); + result[i++] = check_hash_result>(0, 42, 9486749600008296231); + + result[i++] = check_hash_result>(0, 0, 4246796580750024372); + result[i++] = check_hash_result>(0, 42, 3614696996920510707); + result[i++] = check_hash_result>(42, 0, 15516826743637085169); + result[i++] = check_hash_result>(123456789, 0, 9462334144942111946); + + result[i++] = check_hash_result>(0, 0, 3803688792395291579); + result[i++] = check_hash_result>(0, 42, 13194218611613725804); + result[i++] = check_hash_result>(42, 0, 13066772586158965587); + result[i++] = check_hash_result>(123456789, 0, 14662639848940634189); + +#if defined(CUCO_HAS_INT128) + result[i++] = check_hash_result>(123456789, 0, 7986913354431084250); +#endif + + result[i++] = + check_hash_result>>(123456789, 0, 2031761887105658523); +} + +TEST_CASE("Test cuco::xxhash_64", "") +{ + // Reference hash values were computed using https://github.com/Cyan4973/xxHash + SECTION("Check if host-generated hash values match the reference implementation.") + { + CHECK(check_hash_result>(0, 0, 16804241149081757544)); + CHECK(check_hash_result>(42, 0, 765293966243412708)); + 
CHECK(check_hash_result>(0, 42, 9486749600008296231)); + + CHECK(check_hash_result>(0, 0, 4246796580750024372)); + CHECK(check_hash_result>(0, 42, 3614696996920510707)); + CHECK(check_hash_result>(42, 0, 15516826743637085169)); + CHECK(check_hash_result>(123456789, 0, 9462334144942111946)); + + CHECK(check_hash_result>(0, 0, 3803688792395291579)); + CHECK(check_hash_result>(0, 42, 13194218611613725804)); + CHECK(check_hash_result>(42, 0, 13066772586158965587)); + CHECK(check_hash_result>(123456789, 0, 14662639848940634189)); + +#if defined(CUCO_HAS_INT128) + CHECK(check_hash_result>(123456789, 0, 7986913354431084250)); +#endif + + // 32*4=128-byte key to test the pipelined outermost hashing loop + CHECK(check_hash_result>>(123456789, 0, 2031761887105658523)); + } + + SECTION("Check if device-generated hash values match the reference implementation.") + { + thrust::device_vector result(10); + + check_hash_result_kernel_64<<<1, 1>>>(result.begin()); + + CHECK(cuco::test::all_of(result.begin(), result.end(), [] __device__(bool v) { return v; })); + } +} + +template +__global__ void check_hash_result_kernel_32(OutputIter result) +{ + int i = 0; + + result[i++] = check_hash_result>(0, 0, 3479547966); + result[i++] = check_hash_result>(42, 0, 3774771295); + result[i++] = check_hash_result>(0, 42, 2099223482); + + result[i++] = check_hash_result>(0, 0, 148298089); + result[i++] = check_hash_result>(0, 42, 2132181312); + result[i++] = check_hash_result>(42, 0, 1161967057); + result[i++] = check_hash_result>(123456789, 0, 2987034094); + + result[i++] = check_hash_result>(0, 0, 3736311059); + result[i++] = check_hash_result>(0, 42, 1076387279); + result[i++] = check_hash_result>(42, 0, 2332451213); + result[i++] = check_hash_result>(123456789, 0, 1561711919); + +#if defined(CUCO_HAS_INT128) + result[i++] = check_hash_result>(123456789, 0, 1846633701); +#endif + + result[i++] = check_hash_result>>(123456789, 0, 3715432378); +} + +TEST_CASE("Test cuco::xxhash_32", "") +{ + // 
Reference hash values were computed using https://github.com/Cyan4973/xxHash + SECTION("Check if host-generated hash values match the reference implementation.") + { + CHECK(check_hash_result>(0, 0, 3479547966)); + CHECK(check_hash_result>(42, 0, 3774771295)); + CHECK(check_hash_result>(0, 42, 2099223482)); + + CHECK(check_hash_result>(0, 0, 148298089)); + CHECK(check_hash_result>(0, 42, 2132181312)); + CHECK(check_hash_result>(42, 0, 1161967057)); + CHECK(check_hash_result>(123456789, 0, 2987034094)); + + CHECK(check_hash_result>(0, 0, 3736311059)); + CHECK(check_hash_result>(0, 42, 1076387279)); + CHECK(check_hash_result>(42, 0, 2332451213)); + CHECK(check_hash_result>(123456789, 0, 1561711919)); + +#if defined(CUCO_HAS_INT128) + CHECK(check_hash_result>(123456789, 0, 1846633701)); +#endif + + // 32*4=128-byte key to test the pipelined outermost hashing loop + CHECK(check_hash_result>>(123456789, 0, 3715432378)); + } + + SECTION("Check if device-generated hash values match the reference implementation.") + { + thrust::device_vector result(20, true); + + check_hash_result_kernel_32<<<1, 1>>>(result.begin()); + + CHECK(cuco::test::all_of(result.begin(), result.end(), [] __device__(bool v) { return v; })); + } +} + +TEMPLATE_TEST_CASE_SIG("Static vs. 
dynamic key hash test", + "", + ((typename Hash), Hash), + (cuco::murmurhash3_32), + (cuco::murmurhash3_32), + (cuco::xxhash_32), + (cuco::xxhash_32), + (cuco::xxhash_64), + (cuco::xxhash_64)) +{ + using key_type = typename Hash::argument_type; + + Hash hash; + key_type key = 42; + + SECTION("Identical keys with static and dynamic key size should have the same hash value.") + { + CHECK(hash(key) == + hash.compute_hash(reinterpret_cast(&key), sizeof(key_type))); + } +} \ No newline at end of file diff --git a/tests/utility/storage_test.cu b/tests/utility/storage_test.cu new file mode 100644 index 000000000..b776f628c --- /dev/null +++ b/tests/utility/storage_test.cu @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include + +#include + +TEMPLATE_TEST_CASE_SIG("Storage tests", + "", + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t), + (int32_t, int64_t), + (int64_t, int64_t)) +{ + constexpr std::size_t size{1'000}; + constexpr int window_size{2}; + constexpr std::size_t gold_capacity{2'000}; + + using allocator_type = cuco::cuda_allocator; + auto allocator = allocator_type{}; + + SECTION("Allocate array of pairs with AoS storage.") + { + auto s = + cuco::experimental::aow_storage, + window_size, + cuco::experimental::extent, + allocator_type>(cuco::experimental::extent{size}, allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + REQUIRE(num_windows == size); + REQUIRE(capacity == gold_capacity); + } + + SECTION("Allocate array of pairs with AoS storage with static extent.") + { + using extent_type = cuco::experimental::extent; + auto s = cuco::experimental:: + aow_storage, window_size, extent_type, allocator_type>(extent_type{}, + allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + STATIC_REQUIRE(num_windows == size); + STATIC_REQUIRE(capacity == gold_capacity); + } + + SECTION("Allocate array of keys with AoS storage.") + { + auto s = cuco::experimental:: + aow_storage, allocator_type>( + cuco::experimental::extent{size}, allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + REQUIRE(num_windows == size); + REQUIRE(capacity == gold_capacity); + } + + SECTION("Allocate array of keys with AoS storage with static extent.") + { + using extent_type = cuco::experimental::extent; + auto s = cuco::experimental::aow_storage( + extent_type{}, allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + STATIC_REQUIRE(num_windows == size); + STATIC_REQUIRE(capacity == gold_capacity); + } +} diff --git a/tests/utils.hpp b/tests/utils.hpp index 
dd2f6545f..3325027a9 100644 --- a/tests/utils.hpp +++ b/tests/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,10 +18,14 @@ #include +#include + #include #include +#include + namespace cuco { namespace test { @@ -35,23 +39,23 @@ enum class probe_sequence { linear_probing, double_hashing }; template int count_if(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0) { - auto const size = end - begin; + auto const size = std::distance(begin, end); auto const grid_size = (size + block_size - 1) / block_size; int* count; - cudaMallocManaged(&count, sizeof(int)); + CUCO_CUDA_TRY(cudaMallocManaged(&count, sizeof(int))); *count = 0; int device_id; - cudaGetDevice(&device_id); - cudaMemPrefetchAsync(count, sizeof(int), device_id, stream); + CUCO_CUDA_TRY(cudaGetDevice(&device_id)); + CUCO_CUDA_TRY(cudaMemPrefetchAsync(count, sizeof(int), device_id, stream)); detail::count_if<<>>(begin, end, count, p); - cudaStreamSynchronize(stream); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - auto res = *count; + auto const res = *count; - cudaFree(count); + CUCO_CUDA_TRY(cudaFree(count)); return res; } @@ -59,7 +63,7 @@ int count_if(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0) template bool all_of(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0) { - auto const size = end - begin; + auto const size = std::distance(begin, end); auto const count = count_if(begin, end, p, stream); return size == count; @@ -81,23 +85,23 @@ bool none_of(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0) template bool equal(Iterator1 begin1, Iterator1 end1, Iterator2 begin2, Predicate p, cudaStream_t stream = 0) { - auto const size = end1 - begin1; + auto const size = std::distance(begin1, end1); auto const grid_size = 
(size + block_size - 1) / block_size; int* count; - cudaMallocManaged(&count, sizeof(int)); + CUCO_CUDA_TRY(cudaMallocManaged(&count, sizeof(int))); *count = 0; int device_id; - cudaGetDevice(&device_id); - cudaMemPrefetchAsync(count, sizeof(int), device_id, stream); + CUCO_CUDA_TRY(cudaGetDevice(&device_id)); + CUCO_CUDA_TRY(cudaMemPrefetchAsync(count, sizeof(int), device_id, stream)); detail::count_if<<>>(begin1, end1, begin2, count, p); - cudaStreamSynchronize(stream); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - auto res = *count; + auto const res = *count; - cudaFree(count); + CUCO_CUDA_TRY(cudaFree(count)); return res == size; }