diff --git a/.devcontainer/cuda12.2-gcc12/devcontainer.json b/.devcontainer/cuda12.2-gcc12/devcontainer.json new file mode 100644 index 000000000..199ce44f4 --- /dev/null +++ b/.devcontainer/cuda12.2-gcc12/devcontainer.json @@ -0,0 +1,39 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:23.08-cpp-gcc12-cuda12.2-ubuntu22.04", + "hostRequirements": { + "gpu": true + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history", + "DEVCONTAINER_NAME": "cuda12.2-gcc12" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd" + ], + "settings": { + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}/build/latest" + ] + } + } + }, + "name": "cuda12.2-gcc12" +} diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 000000000..84cfa82cc --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,37 @@ +{ + "shutdownAction": "stopContainer", + "image": "rapidsai/devcontainers:23.08-cpp-gcc12-cuda12.2-ubuntu22.04", + "hostRequirements": { + "gpu": true + }, + "initializeCommand": [ + "/bin/bash", + "-c", + "mkdir -m 0755 -p 
${localWorkspaceFolder}/.{aws,cache,config}" + ], + "containerEnv": { + "SCCACHE_REGION": "us-east-2", + "SCCACHE_BUCKET": "rapids-sccache-devs", + "VAULT_HOST": "https://vault.ops.k8s.rapids.ai", + "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history" + }, + "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd" + ], + "settings": { + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}/build/latest" + ] + } + } + } +} \ No newline at end of file diff --git a/.devcontainer/launch.sh b/.devcontainer/launch.sh new file mode 100755 index 000000000..157a49bef --- /dev/null +++ b/.devcontainer/launch.sh @@ -0,0 +1,58 @@ +#! /usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +launch_devcontainer() { + + # Ensure we're in the repo root + cd "$( cd "$( dirname "$(realpath -m "${BASH_SOURCE[0]}")" )" && pwd )/.."; + + if [[ -z $1 ]] || [[ -z $2 ]]; then + echo "Usage: $0 [CUDA version] [Host compiler]" + echo "Example: $0 12.1 gcc12" + return 1 + fi + + local cuda_version="$1" + local host_compiler="$2" + local workspace="$(basename "$(pwd)")"; + local tmpdir="$(mktemp -d)/${workspace}"; + local path="$(pwd)/.devcontainer/cuda${cuda_version}-${host_compiler}"; + + mkdir -p "${tmpdir}"; + mkdir -p "${tmpdir}/.devcontainer"; + cp -arL "$path/devcontainer.json" "${tmpdir}/.devcontainer"; + sed -i "s@\${localWorkspaceFolder}@$(pwd)@g" "${tmpdir}/.devcontainer/devcontainer.json"; + path="${tmpdir}"; + + local hash="$(echo -n "${path}" | xxd -pu - | tr -d '[:space:]')"; + local url="vscode://vscode-remote/dev-container+${hash}/home/coder/cuCollections"; + + echo "devcontainer URL: ${url}"; + + local launch=""; + if type open >/dev/null 2>&1; then + launch="open"; + elif type xdg-open >/dev/null 2>&1; then + launch="xdg-open"; + fi + + if [ -n "${launch}" ]; then + code --new-window "${tmpdir}"; + exec "${launch}" "${url}" >/dev/null 2>&1; + fi +} + +launch_devcontainer "$@"; \ No newline at end of file diff --git a/.devcontainer/make_devcontainers.sh b/.devcontainer/make_devcontainers.sh new file mode 100755 index 000000000..700dc3713 --- /dev/null +++ b/.devcontainer/make_devcontainers.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script parses the CI matrix.yaml file and generates a devcontainer.json file for each unique combination of +# CUDA version, compiler name/version, and Ubuntu version. The devcontainer.json files are written to the +# .devcontainer directory to a subdirectory named after the CUDA version and compiler name/version. +# GitHub docs on using multiple devcontainer.json files: +# https://docs.github.com/en/codespaces/setting-up-your-project-for-codespaces/adding-a-dev-container-configuration/introduction-to-dev-containers#devcontainerjson + +# Ensure the script is being executed in its containing directory +cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"; + +# The root devcontainer.json file is used as a template for all other devcontainer.json files +# by replacing the `image:` field with the appropriate image name +base_devcontainer_file="./devcontainer.json" + + +# Read matrix.yaml and convert it to json +matrix_json=$(yq -o json ../ci/matrix.yml) + + +# Get the devcontainer image version and define image tag root +DEVCONTAINER_VERSION=$(echo "$matrix_json" | jq -r '.devcontainer_version') +IMAGE_ROOT="rapidsai/devcontainers:${DEVCONTAINER_VERSION}-cpp-" + +# Get unique combinations of cuda version, compiler name/version, and Ubuntu version +combinations=$(echo "$matrix_json" | jq -c '[.pull_request.nvcc[] | {cuda: .cuda, compiler_name: .compiler.name, compiler_version: .compiler.version, os: .os}] | unique | .[]') + +# For each unique combination +for combination in $combinations; do + cuda_version=$(echo "$combination" | jq -r '.cuda') + 
compiler_name=$(echo "$combination" | jq -r '.compiler_name') + compiler_version=$(echo "$combination" | jq -r '.compiler_version') + os=$(echo "$combination" | jq -r '.os') + + name="cuda$cuda_version-$compiler_name$compiler_version" + mkdir -p "$name" + devcontainer_file="$name/devcontainer.json" + image="$IMAGE_ROOT$compiler_name$compiler_version-cuda$cuda_version-$os" + + # Use the base_devcontainer.json as a template, plug in the CUDA, compiler names, versions, and Ubuntu version, + # and write the output to the new devcontainer.json file + #jq --arg image "$image" --arg name "$name" '. + {image: $image, name: $name}' $base_devcontainer_file > "$devcontainer_file" + jq --arg image "$image" --arg name "$name" '.image = $image | .name = $name | .containerEnv.DEVCONTAINER_NAME = $name' $base_devcontainer_file > "$devcontainer_file" + + echo "Created $devcontainer_file" +done \ No newline at end of file diff --git a/.github/actions/compute-matrix/action.yml b/.github/actions/compute-matrix/action.yml new file mode 100644 index 000000000..fbbe49b54 --- /dev/null +++ b/.github/actions/compute-matrix/action.yml @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Compute Matrix +description: "Compute the matrix for a given matrix type from the specified matrix file" + +inputs: + matrix_query: + description: "The jq query used to specify the desired matrix. e.g., .pull_request.nvcc" + required: true + matrix_file: + description: 'The file containing the matrix' + required: true +outputs: + matrix: + description: 'The requested matrix' + value: ${{ steps.compute-matrix.outputs.MATRIX }} + +runs: + using: "composite" + steps: + - name: Compute matrix + id: compute-matrix + run: | + MATRIX=$(./.github/actions/compute-matrix/compute-matrix.sh ${{inputs.matrix_file}} ${{inputs.matrix_query}} ) + echo "matrix=$MATRIX" | tee -a $GITHUB_OUTPUT + shell: bash -euxo pipefail {0} \ No newline at end of file diff --git a/.github/actions/compute-matrix/compute-matrix.sh b/.github/actions/compute-matrix/compute-matrix.sh new file mode 100755 index 000000000..64a6f5642 --- /dev/null +++ b/.github/actions/compute-matrix/compute-matrix.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +# Check for the correct number of arguments +if [ $# -ne 2 ]; then + echo "Usage: $0 MATRIX_FILE MATRIX_QUERY" + echo "MATRIX_FILE: The path to the matrix file." + echo "MATRIX_QUERY: The jq query used to specify the desired matrix. 
e.g., '.pull-request.nvcc'" + exit 1 +fi + +# Get realpath before changing directory +MATRIX_FILE=$(realpath "$1") +MATRIX_QUERY="$2" + +# Ensure the script is being executed in its containing directory +cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"; + +echo "Input matrix file:" >&2 +cat "$MATRIX_FILE" >&2 +echo "Query: $MATRIX_QUERY" >&2 +echo $(yq -o=json "$MATRIX_FILE" | jq -c -r "$MATRIX_QUERY | map(. as \$o | {std: .std[]} + del(\$o.std))") \ No newline at end of file diff --git a/.github/actions/configure_cccl_sccache/action.yml b/.github/actions/configure_cccl_sccache/action.yml new file mode 100644 index 000000000..458669688 --- /dev/null +++ b/.github/actions/configure_cccl_sccache/action.yml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Set up AWS credentials and environment variables for sccache +description: "Set up AWS credentials and environment variables for sccache" +runs: + using: "composite" + steps: + - name: Get AWS credentials for sccache bucket + uses: aws-actions/configure-aws-credentials@v2 + with: + role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA + aws-region: us-east-2 + role-duration-seconds: 43200 # 12 hours + - name: Set environment variables + run: | + echo "SCCACHE_BUCKET=rapids-sccache-east" >> $GITHUB_ENV + echo "SCCACHE_REGION=us-east-2" >> $GITHUB_ENV + echo "SCCACHE_IDLE_TIMEOUT=32768" >> $GITHUB_ENV + echo "SCCACHE_S3_USE_SSL=true" >> $GITHUB_ENV + echo "SCCACHE_S3_NO_CREDENTIALS=false" >> $GITHUB_ENV + shell: bash \ No newline at end of file diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml new file mode 100644 index 000000000..895ba83ee --- /dev/null +++ b/.github/copy-pr-bot.yaml @@ -0,0 +1,4 @@ +# Configuration file for `copy-pr-bot` GitHub App +# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ + +enabled: true diff --git a/.github/workflows/add_to_project.yml b/.github/workflows/add_to_project.yml deleted file mode 100644 index 72dd4acd2..000000000 --- a/.github/workflows/add_to_project.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: Add new issue/PR to project - -on: - issues: - types: - - opened - - pull_request_target: - types: - - opened - -jobs: - add-to-project: - name: Add issue or PR to project - runs-on: ubuntu-latest - steps: - - name: Generate token - id: generate_token - uses: tibdex/github-app-token@36464acb844fc53b9b8b2401da68844f6b05ebb0 - with: - app_id: ${{ secrets.CCCL_AUTH_APP_ID }} - private_key: ${{ secrets.CCCL_AUTH_APP_PEM }} - - name: Add to Project - env: - TOKEN: ${{ steps.generate_token.outputs.token }} - uses: actions/add-to-project@v0.3.0 - with: - project-url: https://github.com/orgs/NVIDIA/projects/6 - github-token: ${{ env.TOKEN }} diff --git a/.github/workflows/build-and-test.yml 
b/.github/workflows/build-and-test.yml new file mode 100644 index 000000000..6599e9dcb --- /dev/null +++ b/.github/workflows/build-and-test.yml @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: build and test + +defaults: + run: + shell: bash -eo pipefail {0} + +on: + workflow_call: + inputs: + devcontainer_version: {type: string, required: true} + cuda_version: {type: string, required: true} + compiler: {type: string, required: true} + compiler_exe: {type: string, required: true} + compiler_version: {type: string, required: true} + std: {type: string, required: true} + gpu_build_archs: {type: string, required: true} + cpu: {type: string, required: true} + os: {type: string, required: true} + build_script: {type: string, required: false} + test_script: {type: string, required: false} + run_tests: {type: boolean, required: false, default: true} + +jobs: + devcontainer_image: + name: Devcontainer ${{ inputs.os }}/${{ inputs.compiler }}${{ inputs.compiler_version }} + runs-on: ubuntu-latest + outputs: + image_name: ${{ steps.compute-devcontainer-image-name.outputs.name }} + steps: + - name: Compute devcontainer image name + id: compute-devcontainer-image-name + run: | + COMPILER_SEGMENT="" + if [ "${{ inputs.compiler }}" != "cc" ] && [ "${{ inputs.compiler_exe }}" != "c++" ]; then + COMPILER_SEGMENT="${{ 
inputs.compiler }}${{ inputs.compiler_version }}-" + fi + DEVCONTAINER_IMAGE="rapidsai/devcontainers:${{inputs.devcontainer_version}}-cpp-${COMPILER_SEGMENT}cuda${{inputs.cuda_version}}-${{inputs.os}}" + echo "DEVCONTAINER_IMAGE=$DEVCONTAINER_IMAGE" >> $GITHUB_ENV + echo "name=$DEVCONTAINER_IMAGE" >> $GITHUB_OUTPUT + - name: Check if devcontainer image exists + run: | + docker buildx imagetools inspect $DEVCONTAINER_IMAGE > /dev/null + if [ $? -ne 0 ]; then + echo "Error: Docker image $DEVCONTAINER_IMAGE does not exist." + exit 1 + fi + + build: + needs: devcontainer_image + if: inputs.build_script != '' && needs.devcontainer_image.outputs.image_name != '' + name: Build ${{inputs.compiler}}${{inputs.compiler_version}}/C++${{inputs.std}}/SM${{inputs.gpu_build_archs}} + uses: ./.github/workflows/run-as-coder.yml + with: + name: Build ${{inputs.compiler}}${{inputs.compiler_version}}/C++${{inputs.std}}/SM${{inputs.gpu_build_archs}} + runner: linux-${{inputs.cpu}}-cpu16 + image: ${{ needs.devcontainer_image.outputs.image_name }} + command: | + ${{ inputs.build_script }} "${{inputs.compiler_exe}}" "${{inputs.std}}" "${{inputs.gpu_build_archs}}" + + test: + needs: [devcontainer_image, build] + if: ${{ !cancelled() && ( needs.build.result == 'success' || needs.build.result == 'skipped' ) && inputs.test_script != '' && needs.devcontainer_image.outputs.image_name != '' && inputs.run_tests}} + name: Test ${{inputs.compiler}}${{inputs.compiler_version}}/C++${{inputs.std}}/SM${{inputs.gpu_build_archs}} + uses: ./.github/workflows/run-as-coder.yml + with: + name: Test ${{inputs.compiler}}${{inputs.compiler_version}}/C++${{inputs.std}}/SM${{inputs.gpu_build_archs}} + runner: linux-${{inputs.cpu}}-gpu-v100-latest-1 + image: ${{ needs.devcontainer_image.outputs.image_name }} + command: | + nvidia-smi + ${{ inputs.test_script }} "${{inputs.compiler_exe}}" "${{inputs.std}}" "${{inputs.gpu_build_archs}}" \ No newline at end of file diff --git 
a/.github/workflows/dispatch-build-and-test.yml b/.github/workflows/dispatch-build-and-test.yml new file mode 100644 index 000000000..dea71e00e --- /dev/null +++ b/.github/workflows/dispatch-build-and-test.yml @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Dispatch build and test + +on: + workflow_call: + inputs: + per_cuda_compiler_matrix: {type: string, required: true} + build_script: {type: string, required: false} + test_script: {type: string, required: false} + devcontainer_version: {type: string, required: true} + +jobs: + # Using a matrix to dispatch to the build-and-test reusable workflow for each build configuration + # ensures that the build/test steps can overlap across different configurations. For example, + # the build step for CUDA 12.1 + gcc 9.3 can run at the same time as the test step for CUDA 11.0 + clang 11. 
+ build_and_test: + name: ${{matrix.cpu}} + uses: ./.github/workflows/build-and-test.yml + strategy: + fail-fast: false + matrix: + include: ${{ fromJSON(inputs.per_cuda_compiler_matrix) }} + with: + devcontainer_version: ${{ inputs.devcontainer_version }} + cuda_version: ${{ matrix.cuda }} + compiler: ${{ matrix.compiler.name }} + compiler_exe: ${{ matrix.compiler.exe }} + compiler_version: ${{ matrix.compiler.version }} + std: ${{ matrix.std }} + gpu_build_archs: ${{ matrix.gpu_build_archs }} + cpu: ${{ matrix.cpu }} + os: ${{ matrix.os }} + build_script: ${{ inputs.build_script }} + test_script: ${{ inputs.test_script }} + run_tests: ${{ contains(matrix.jobs, 'test') && !contains(github.event.head_commit.message, 'skip-tests') }} diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml new file mode 100644 index 000000000..061b30a99 --- /dev/null +++ b/.github/workflows/pr.yml @@ -0,0 +1,121 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This is the main workflow that runs on every PR and push to main +name: pr + +defaults: + run: + shell: bash -euo pipefail {0} + +on: + push: + branches: + - main + - dev + - "pull-request/[0-9]+" + +# Only runs one instance of this workflow at a time for a given PR and cancels any in-progress runs when a new one starts. 
+concurrency: + group: ${{ github.workflow }}-on-${{ github.event_name }}-from-${{ github.ref_name }} + cancel-in-progress: true + +jobs: + doxygen-check: + name: Doxygen check + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v3 + - name: Install Doxygen + run: | + sudo apt-get update -q + sudo apt-get install -y doxygen + - name: Check Doxygen docs + run: | + ./ci/pre-commit/doxygen.sh + if [ $? -ne 0 ]; then + echo "Doxygen check failed" + exit 1 + fi + shell: bash -euxo pipefail {0} + + get-devcontainer-version: + name: Get devcontainer version + runs-on: ubuntu-latest + outputs: + DEVCONTAINER_VERSION: ${{ steps.set-outputs.outputs.DEVCONTAINER_VERSION }} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + - name: Get devcontainer version + id: set-outputs + run: | + DEVCONTAINER_VERSION=$(yq -o json ci/matrix.yml | jq -r '.devcontainer_version') + echo "DEVCONTAINER_VERSION=$DEVCONTAINER_VERSION" | tee -a "$GITHUB_OUTPUT" + + compute-nvcc-matrix: + name: Compute NVCC matrix + runs-on: ubuntu-latest + outputs: + FULL_MATRIX: ${{ steps.set-outputs.outputs.FULL_MATRIX }} + CUDA_VERSIONS: ${{ steps.set-outputs.outputs.CUDA_VERSIONS }} + HOST_COMPILERS: ${{ steps.set-outputs.outputs.HOST_COMPILERS }} + PER_CUDA_COMPILER_MATRIX: ${{ steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX }} + steps: + - name: Checkout repo + uses: actions/checkout@v3 + - name: Get full nvcc matrix + id: compute-nvcc-matrix + uses: ./.github/actions/compute-matrix + with: + matrix_file: './ci/matrix.yml' + matrix_query: '.pull_request.nvcc' + - name: Set outputs + id: set-outputs + run: | + FULL_MATRIX='${{steps.compute-nvcc-matrix.outputs.matrix}}' + echo "FULL_MATRIX=$FULL_MATRIX" | tee -a "$GITHUB_OUTPUT" + CUDA_VERSIONS=$(echo $FULL_MATRIX | jq -c '[.[] | .cuda] | unique') + echo "CUDA_VERSIONS=$CUDA_VERSIONS" | tee -a "$GITHUB_OUTPUT" + HOST_COMPILERS=$(echo $FULL_MATRIX | jq -c '[.[] | .compiler.name] | unique') + echo 
"HOST_COMPILERS=$HOST_COMPILERS" | tee -a "$GITHUB_OUTPUT" + PER_CUDA_COMPILER_MATRIX=$(echo $FULL_MATRIX | jq -c ' group_by(.cuda + .compiler.name) | map({(.[0].cuda + "-" + .[0].compiler.name): .}) | add') + echo "PER_CUDA_COMPILER_MATRIX=$PER_CUDA_COMPILER_MATRIX" | tee -a "$GITHUB_OUTPUT" + + ci: + name: CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }} + needs: [compute-nvcc-matrix, get-devcontainer-version] + uses: ./.github/workflows/dispatch-build-and-test.yml + strategy: + fail-fast: false + matrix: + cuda_version: ${{ fromJSON(needs.compute-nvcc-matrix.outputs.CUDA_VERSIONS) }} + compiler: ${{ fromJSON(needs.compute-nvcc-matrix.outputs.HOST_COMPILERS) }} + with: + per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-nvcc-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ format('{0}-{1}', matrix.cuda_version, matrix.compiler) ]) }} + build_script: "./ci/build.sh" + test_script: "./ci/test.sh" + devcontainer_version: ${{ needs.get-devcontainer-version.outputs.DEVCONTAINER_VERSION }} + + # This job is the final job that runs after all other jobs and is used for branch protection status checks. + # See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks + ci-success: + runs-on: ubuntu-latest + name: CI success + needs: + - ci + steps: + - run: echo "CI success" \ No newline at end of file diff --git a/.github/workflows/run-as-coder.yml b/.github/workflows/run-as-coder.yml new file mode 100644 index 000000000..573ef134a --- /dev/null +++ b/.github/workflows/run-as-coder.yml @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Run as coder user + +defaults: + run: + shell: bash -exo pipefail {0} + + +on: + workflow_call: + inputs: + name: {type: string, required: true} + image: {type: string, required: true} + runner: {type: string, required: true} + command: {type: string, required: true} + env: { type: string, required: false, default: "" } + +jobs: + run-as-coder: + name: ${{inputs.name}} + runs-on: ${{inputs.runner}} + container: + options: -u root + image: ${{inputs.image}} + env: + NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} + permissions: + id-token: write + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + path: cuCollections + persist-credentials: false + - name: Move files to coder user home directory + run: | + cp -R cuCollections /home/coder/cuCollections + chown -R coder:coder /home/coder/ + - name: Configure credentials and environment variables for sccache + uses: ./cuCollections/.github/actions/configure_cccl_sccache + - name: Run command + shell: su coder {0} + run: | + set -exo pipefail + cd ~/cuCollections + eval "${{inputs.command}}" || exit_code=$? + if [ ! -z "$exit_code" ]; then + echo "::error::Error! 
To checkout the corresponding code and reproduce locally, run the following commands:" + echo "git clone --branch $GITHUB_REF_NAME --single-branch --recurse-submodules https://github.com/$GITHUB_REPOSITORY.git && cd $(echo $GITHUB_REPOSITORY | cut -d'/' -f2) && git checkout $GITHUB_SHA" + echo "docker run --rm -it --gpus all --pull=always --volume \$PWD:/repo --workdir /repo ${{ inputs.image }} ${{inputs.command}}" + exit $exit_code + fi diff --git a/.gitignore b/.gitignore index 4146530ed..6ccf378c2 100644 --- a/.gitignore +++ b/.gitignore @@ -8,7 +8,6 @@ __pycache__ *.dylib .cache .vscode -.devcontainer *.code-workspace *.swp *.pytest_cache @@ -140,3 +139,6 @@ ENV/ # clang compile_commands.json + +# figures +*.eps diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e2fe04169..5679bf67f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,7 +20,7 @@ repos: hooks: - id: doxygen-check name: doxygen-check - entry: ./ci/checks/doxygen.sh + entry: ./ci/pre-commit/doxygen.sh files: ^include/ types_or: [file] language: system diff --git a/CMakeLists.txt b/CMakeLists.txt index e1b5055d9..f3ca85a8a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
#============================================================================= -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) endif() include(${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) diff --git a/README.md b/README.md index dc8d4db80..93ac04027 100644 --- a/README.md +++ b/README.md @@ -5,13 +5,13 @@ Doxygen Documentation (TODO) -`cuCollections` (`cuco`) is an open-source, header-only library of GPU-accelerated, concurrent data structures. +`cuCollections` (`cuco`) is an open-source, header-only library of GPU-accelerated, concurrent data structures. -Similar to how [Thrust](https://github.com/thrust/thrust) and [CUB](https://github.com/thrust/cub) provide STL-like, GPU accelerated algorithms and primitives, `cuCollections` provides STL-like concurrent data structures. `cuCollections` is not a one-to-one, drop-in replacement for STL data structures like `std::unordered_map`. Instead, it provides functionally similar data structures tailored for efficient use with GPUs. +Similar to how [Thrust](https://github.com/thrust/thrust) and [CUB](https://github.com/thrust/cub) provide STL-like, GPU accelerated algorithms and primitives, `cuCollections` provides STL-like concurrent data structures. `cuCollections` is not a one-to-one, drop-in replacement for STL data structures like `std::unordered_map`. Instead, it provides functionally similar data structures tailored for efficient use with GPUs. ## Development Status -`cuCollections` is still under heavy development. Users should expect breaking changes and refactoring to be common. +`cuCollections` is still under heavy development. 
Users should expect breaking changes and refactoring to be common. ## Getting cuCollections @@ -21,14 +21,14 @@ Similar to how [Thrust](https://github.com/thrust/thrust) and [CUB](https://gith `cuCollections` is designed to make it easy to include within another CMake project. The `CMakeLists.txt` exports a `cuco` target that can be linked[1](#link-footnote) - into a target to setup include directories, dependencies, and compile flags necessary to use `cuCollections` in your project. + into a target to setup include directories, dependencies, and compile flags necessary to use `cuCollections` in your project. We recommend using [CMake Package Manager (CPM)](https://github.com/TheLartians/CPM.cmake) to fetch `cuCollections` into your project. With CPM, getting `cuCollections` is easy: -``` -cmake_minimum_required(VERSION 3.14 FATAL_ERROR) +```cmake +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(path/to/CPM.cmake) @@ -47,12 +47,12 @@ target_link_libraries(my_library cuco) This will take care of downloading `cuCollections` from GitHub and making the headers available in a location that can be found by CMake. Linking against the `cuco` target will provide everything needed for `cuco` to be used by the `my_library` target. -1: `cuCollections` is header-only and therefore there is no binary component to "link" against. The linking terminology comes from CMake's `target_link_libraries` which is still used even for header-only library targets. +1: `cuCollections` is header-only and therefore there is no binary component to "link" against. The linking terminology comes from CMake's `target_link_libraries` which is still used even for header-only library targets. ## Requirements -- `nvcc 11+` +- `nvcc 11.5+` - C++17 -- Volta+ +- Volta+ - Pascal is partially supported. Any data structures that require blocking algorithms are not supported. 
See [libcu++](https://nvidia.github.io/libcudacxx/setup/requirements.html#device-architectures) documentation for more details. ## Dependencies @@ -67,15 +67,15 @@ No action is required from the user to satisfy these dependencies. `cuCollection ## Building cuCollections -Since `cuCollections` is header-only, there is nothing to build to use it. +Since `cuCollections` is header-only, there is nothing to build to use it. To build the tests, benchmarks, and examples: -``` +```bash cd $CUCO_ROOT mkdir -p build cd build -cmake .. +cmake .. make ``` Binaries will be built into: @@ -179,23 +179,32 @@ class example_class { ## Data Structures -We plan to add many GPU-accelerated, concurrent data structures to `cuCollections`. As of now, the two flagships are variants of hash tables. +We plan to add many GPU-accelerated, concurrent data structures to `cuCollections`. As of now, the two flagships are variants of hash tables. + +### `static_set` + +`cuco::static_set` is a fixed-size container that stores unique elements in no particular order. See the Doxygen documentation in `static_set.cuh` for more detailed information. + +#### Examples: +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/Pzf6vabz1)) +- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/sfG3qKqGv)) ### `static_map` `cuco::static_map` is a fixed-size hash table using open addressing with linear probing. See the Doxygen documentation in `static_map.cuh` for more detailed information. 
#### Examples: -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/ervPzqh64)) -- [Device-view APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/device_view_example.cu) (see [live example in godbolt](https://godbolt.org/z/qMWrfE6ET)) -- [Custom data types, key equality operators and hash functions](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/custom_type_example.cu) (see [live example in godbolt](https://godbolt.org/z/oGfYjzMGT)) +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/T49P85Mnd)) +- [Device-view APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/device_view_example.cu) (see [live example in godbolt](https://godbolt.org/z/dh8bMn3G1)) +- [Custom data types, key equality operators and hash functions](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/custom_type_example.cu) (see [live example in godbolt](https://godbolt.org/z/7djKevK6e)) +- [Key histogram](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_map/count_by_key_example.cu) (see [live example in godbolt](https://godbolt.org/z/vecGeYM48)) ### `static_multimap` `cuco::static_multimap` is a fixed-size hash table that supports storing equivalent keys. It uses double hashing by default and supports switching to linear probing. See the Doxygen documentation in `static_multimap.cuh` for more detailed information. 
#### Examples: -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_multimap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/Po4eTEn1a)) +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_multimap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/PrbqG6ae4)) ### `dynamic_map` diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index a037dc603..3635336e8 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,20 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
#============================================================================= -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) - -CPMAddPackage( - NAME benchmark - GITHUB_REPOSITORY google/benchmark - VERSION 1.5.2 - OPTIONS - "BENCHMARK_ENABLE_TESTING Off" - # The REGEX feature test fails when gbench's cmake is run under CPM w/ gcc5.4 because it doesn't assume C++11 - # Additionally, attempting to set the CMAKE_CXX_VERSION here doesn't propogate to the feature test build - # Therefore, we just disable the feature test and assume platforms we care about have a regex impl available - "RUN_HAVE_STD_REGEX 0" # - "BENCHMARK_ENABLE_INSTALL OFF" -) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) CPMAddPackage( NAME nvbench @@ -41,65 +28,58 @@ CPMAddPackage( ################################################################################################### ################################################################################################### -function(ConfigureBench BENCH_NAME BENCH_SRC) - add_executable(${BENCH_NAME} "${BENCH_SRC}") - set_target_properties(${BENCH_NAME} PROPERTIES - POSITION_INDEPENDENT_CODE ON - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gbenchmarks") - target_include_directories(${BENCH_NAME} PRIVATE - "${CMAKE_CURRENT_SOURCE_DIR}") - target_compile_options(${BENCH_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra - --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) - target_link_libraries(${BENCH_NAME} PRIVATE - benchmark benchmark_main - pthread - cuco - CUDA::cudart) -endfunction(ConfigureBench) - -################################################################################################### -function(ConfigureNVBench BENCH_NAME) +function(ConfigureBench BENCH_NAME) add_executable(${BENCH_NAME} ${ARGN}) set_target_properties(${BENCH_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/nvbenchmarks") + RUNTIME_OUTPUT_DIRECTORY 
"${CMAKE_BINARY_DIR}/benchmarks") target_include_directories(${BENCH_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") - #"${NVBench_SOURCE_DIR}") - target_compile_options(${BENCH_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr) + target_compile_options(${BENCH_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr -lineinfo) target_link_libraries(${BENCH_NAME} PRIVATE nvbench::main pthread cuco) -endfunction(ConfigureNVBench) +endfunction(ConfigureBench) ################################################################################################### ### benchmark sources ############################################################################# ################################################################################################### ################################################################################################### -# - dynamic_map benchmarks ------------------------------------------------------------------------ -set(DYNAMIC_MAP_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/dynamic_map_bench.cu") -ConfigureBench(DYNAMIC_MAP_BENCH "${DYNAMIC_MAP_BENCH_SRC}") +# - static_set benchmarks ------------------------------------------------------------------------- +ConfigureBench(STATIC_SET_BENCH + hash_table/static_set/contains_bench.cu + hash_table/static_set/find_bench.cu + hash_table/static_set/insert_bench.cu + hash_table/static_set/retrieve_all_bench.cu + hash_table/static_set/size_bench.cu) ################################################################################################### # - static_map benchmarks ------------------------------------------------------------------------- -set(STATIC_MAP_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_map_bench.cu") -ConfigureBench(STATIC_MAP_BENCH "${STATIC_MAP_BENCH_SRC}") +ConfigureBench(STATIC_MAP_BENCH + hash_table/static_map/insert_bench.cu + hash_table/static_map/find_bench.cu + hash_table/static_map/contains_bench.cu + 
hash_table/static_map/erase_bench.cu) ################################################################################################### # - static_multimap benchmarks -------------------------------------------------------------------- -ConfigureNVBench(STATIC_MULTIMAP_BENCH - hash_table/static_multimap/count_bench.cu +ConfigureBench(STATIC_MULTIMAP_BENCH hash_table/static_multimap/insert_bench.cu - hash_table/static_multimap/pair_retrieve_bench.cu + hash_table/static_multimap/retrieve_bench.cu hash_table/static_multimap/query_bench.cu - hash_table/static_multimap/retrieve_bench.cu) + hash_table/static_multimap/count_bench.cu) -ConfigureNVBench(RETRIEVE_BENCH - hash_table/static_multimap/optimal_retrieve_bench.cu) +################################################################################################### +# - dynamic_map benchmarks ------------------------------------------------------------------------ +ConfigureBench(DYNAMIC_MAP_BENCH + hash_table/dynamic_map/insert_bench.cu + hash_table/dynamic_map/find_bench.cu + hash_table/dynamic_map/contains_bench.cu + hash_table/dynamic_map/erase_bench.cu) ################################################################################################### -# - reduce_by_key benchmarks ---------------------------------------------------------------------- -set(RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/reduce_by_key.cu") -ConfigureBench(RBK_BENCH "${RBK_BENCH_SRC}") +# - hash function benchmarks ---------------------------------------------------------------------- +ConfigureBench(HASH_BENCH + hash_bench.cu) diff --git a/benchmarks/defaults.hpp b/benchmarks/defaults.hpp new file mode 100644 index 000000000..22e4f5338 --- /dev/null +++ b/benchmarks/defaults.hpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace cuco::benchmark::defaults { + +using KEY_TYPE_RANGE = nvbench::type_list; +using VALUE_TYPE_RANGE = nvbench::type_list; + +auto constexpr N = 100'000'000; +auto constexpr OCCUPANCY = 0.5; +auto constexpr MULTIPLICITY = 8; +auto constexpr MATCHING_RATE = 0.5; +auto constexpr MAX_NOISE = 3; +auto constexpr SKEW = 0.5; +auto constexpr BATCH_SIZE = 1'000'000; +auto constexpr INITIAL_SIZE = 50'000'000; + +auto const N_RANGE = nvbench::range(10'000'000, 100'000'000, 20'000'000); +auto const N_RANGE_CACHE = + std::vector{8'000, 80'000, 800'000, 8'000'000, 80'000'000}; +auto const OCCUPANCY_RANGE = nvbench::range(0.1, 0.9, 0.1); +auto const MULTIPLICITY_RANGE = std::vector{1, 2, 4, 8, 16}; +auto const MATCHING_RATE_RANGE = nvbench::range(0.1, 1., 0.1); +auto const SKEW_RANGE = nvbench::range(0.1, 1., 0.1); + +} // namespace cuco::benchmark::defaults diff --git a/benchmarks/hash_bench.cu b/benchmarks/hash_bench.cu new file mode 100644 index 000000000..ec35c186e --- /dev/null +++ b/benchmarks/hash_bench.cu @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include + +#include + +#include + +template +struct large_key { + constexpr __host__ __device__ large_key(int32_t seed) noexcept + { +#pragma unroll Words + for (int32_t i = 0; i < Words; ++i) { + data_[i] = seed; + } + } + + private: + int32_t data_[Words]; +}; + +template +__global__ void hash_bench_kernel(Hasher hash, + cuco::detail::index_type n, + OutputIt out, + bool materialize_result) +{ + cuco::detail::index_type const gid = BlockSize * blockIdx.x + threadIdx.x; + cuco::detail::index_type const loop_stride = gridDim.x * BlockSize; + cuco::detail::index_type idx = gid; + typename Hasher::result_type agg = 0; + + while (idx < n) { + typename Hasher::argument_type key(idx); + for (int32_t i = 0; i < 100; ++i) { // execute hash func 100 times + agg += hash(key); + } + idx += loop_stride; + } + + if (materialize_result) { out[gid] = agg; } +} + +/** + * @brief A benchmark evaluating performance of various hash functions + */ +template +void hash_eval(nvbench::state& state, nvbench::type_list) +{ + bool const materialize_result = false; + constexpr auto block_size = 128; + auto const num_keys = state.get_int64_or_default("NumInputs", cuco::benchmark::defaults::N * 10); + auto const grid_size = (num_keys + block_size * 16 - 1) / block_size * 16; + + thrust::device_vector hash_values((materialize_result) ? 
num_keys + : 1); + + state.add_element_count(num_keys); + + state.exec([&](nvbench::launch& launch) { + hash_bench_kernel<<>>( + Hash{}, num_keys, hash_values.begin(), materialize_result); + }); +} + +NVBENCH_BENCH_TYPES( + hash_eval, + NVBENCH_TYPE_AXES(nvbench::type_list, + cuco::murmurhash3_32, + cuco::murmurhash3_32>, // 32*4bytes + cuco::xxhash_32, + cuco::xxhash_32, + cuco::xxhash_32>, + cuco::xxhash_64, + cuco::xxhash_64, + cuco::xxhash_64>, + cuco::murmurhash3_fmix_32, + cuco::murmurhash3_fmix_64>)) + .set_name("hash_function_eval") + .set_type_axes_names({"Hash"}) + .set_max_noise(cuco::benchmark::defaults::MAX_NOISE); diff --git a/benchmarks/hash_table/dynamic_map/contains_bench.cu b/benchmarks/hash_table/dynamic_map/contains_bench.cu new file mode 100644 index 000000000..ff349bc53 --- /dev/null +++ b/benchmarks/hash_table/dynamic_map/contains_bench.cu @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::dynamic_map::contains` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_contains( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const initial_size = state.get_int64_or_default("InitSize", defaults::INITIAL_SIZE); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + cuco::dynamic_map map{ + static_cast(initial_size), cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs.begin(), pairs.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + map.contains(keys.begin(), keys.end(), result.begin(), {}, {}, launch.get_stream()); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> dynamic_map_contains( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(dynamic_map_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_contains_unique_num_inputs") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("NumInputs", defaults::N_RANGE); + 
+NVBENCH_BENCH_TYPES(dynamic_map_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_contains_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/dynamic_map/erase_bench.cu b/benchmarks/hash_table/dynamic_map/erase_bench.cu new file mode 100644 index 000000000..96f5ec7ec --- /dev/null +++ b/benchmarks/hash_table/dynamic_map/erase_bench.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::dynamic_map::erase` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_erase( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const initial_size = state.get_int64_or_default("InitSize", defaults::INITIAL_SIZE); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform( + keys.begin(), keys.end(), pairs.begin(), [] __device__(auto i) { return pair_type(i, {}); }); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + // dynamic map with erase support + cuco::dynamic_map map{static_cast(initial_size), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + map.insert(pairs.begin(), pairs.end(), {}, {}, launch.get_stream()); + + timer.start(); + map.erase(keys.begin(), keys.end(), {}, {}, launch.get_stream()); + timer.stop(); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> dynamic_map_erase( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(dynamic_map_erase, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_erase_unique_num_inputs") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + 
.add_int64_axis("NumInputs", defaults::N_RANGE); + +NVBENCH_BENCH_TYPES(dynamic_map_erase, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_erase_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/dynamic_map/find_bench.cu b/benchmarks/hash_table/dynamic_map/find_bench.cu new file mode 100644 index 000000000..b06cfab4e --- /dev/null +++ b/benchmarks/hash_table/dynamic_map/find_bench.cu @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::dynamic_map::find` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_find( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const initial_size = state.get_int64_or_default("InitSize", defaults::INITIAL_SIZE); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + cuco::dynamic_map map{ + static_cast(initial_size), cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs.begin(), pairs.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + map.find(keys.begin(), keys.end(), result.begin(), {}, {}, launch.get_stream()); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> dynamic_map_find( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(dynamic_map_find, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_find_unique_num_inputs") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("NumInputs", defaults::N_RANGE); + +NVBENCH_BENCH_TYPES(dynamic_map_find, + 
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_find_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/dynamic_map/insert_bench.cu b/benchmarks/hash_table/dynamic_map/insert_bench.cu new file mode 100644 index 000000000..8e8cc8a84 --- /dev/null +++ b/benchmarks/hash_table/dynamic_map/insert_bench.cu @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::dynamic_map::insert` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> dynamic_map_insert( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const initial_size = state.get_int64_or_default("InitSize", defaults::INITIAL_SIZE); + auto const batch_size = state.get_int64_or_default("BatchSize", defaults::BATCH_SIZE); + + if (num_keys % batch_size) { state.skip("NumInputs must be divisible by BatchSize."); } + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + state.add_element_count(num_keys); + + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + cuco::dynamic_map map{static_cast(initial_size), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + {}, + launch.get_stream()}; + + timer.start(); + for (std::size_t i = 0; i < num_keys; i += batch_size) { + map.insert(pairs.begin() + i, pairs.begin() + i + batch_size, {}, {}, launch.get_stream()); + } + timer.stop(); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> dynamic_map_insert( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(dynamic_map_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_insert_unique_num_inputs") + .set_type_axes_names({"Key", "Value", "Distribution"}) + 
.set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("NumInputs", defaults::N_RANGE); + +NVBENCH_BENCH_TYPES(dynamic_map_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_insert_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); + +NVBENCH_BENCH_TYPES(dynamic_map_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("dynamic_map_insert_gaussian_skew") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Skew", defaults::SKEW_RANGE); diff --git a/benchmarks/hash_table/dynamic_map_bench.cu b/benchmarks/hash_table/dynamic_map_bench.cu deleted file mode 100644 index 90446ea57..000000000 --- a/benchmarks/hash_table/dynamic_map_bench.cu +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include - -#include - -#include - -#include -#include - -enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; - -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) -{ - auto num_keys = std::distance(output_begin, output_end); - - std::random_device rd; - std::mt19937 gen{rd()}; - - switch (Dist) { - case dist_type::UNIQUE: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = i; - } - break; - case dist_type::UNIFORM: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(gen())); - } - break; - case dist_type::GAUSSIAN: - std::normal_distribution<> dg{1e9, 1e7}; - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(dg(gen))); - } - break; - } -} - -static void gen_final_size(benchmark::internal::Benchmark* b) -{ - for (auto size = 10'000'000; size <= 150'000'000; size += 20'000'000) { - b->Args({size}); - } -} - -template -static void BM_dynamic_insert(::benchmark::State& state) -{ - using map_type = cuco::dynamic_map; - - std::size_t num_keys = state.range(0); - std::size_t initial_size = 1 << 27; - - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector> d_pairs(h_pairs); - - std::size_t batch_size = 1E6; - for (auto _ : state) { - map_type map{ - initial_size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - { - cuda_event_timer raii{state}; - for (std::size_t i = 0; i < num_keys; i += batch_size) { - map.insert(d_pairs.begin() + i, d_pairs.begin() + i + batch_size); - } - } - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -template -static void BM_dynamic_search_all(::benchmark::State& state) -{ - using map_type = 
cuco::dynamic_map; - - std::size_t num_keys = state.range(0); - std::size_t initial_size = 1 << 27; - - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector> d_pairs(h_pairs); - thrust::device_vector d_results(num_keys); - - map_type map{ - initial_size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - map.insert(d_pairs.begin(), d_pairs.end()); - - for (auto _ : state) { - cuda_event_timer raii{state}; - map.find(d_keys.begin(), d_keys.end(), d_results.begin()); - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - 
->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_insert, int64_t, int64_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_dynamic_search_all, int64_t, int64_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(gen_final_size) - ->UseManualTime(); diff --git a/benchmarks/hash_table/static_map/contains_bench.cu b/benchmarks/hash_table/static_map/contains_bench.cu new file mode 100644 index 000000000..0b5d482a1 --- /dev/null +++ b/benchmarks/hash_table/static_map/contains_bench.cu @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_map::contains` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_contains( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + cuco::static_map map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs.begin(), pairs.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + map.contains(keys.begin(), keys.end(), result.begin(), {}, {}, launch.get_stream()); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_contains( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(static_map_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_contains_unique_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); 
+ +NVBENCH_BENCH_TYPES(static_map_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_contains_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/static_map/erase_bench.cu b/benchmarks/hash_table/static_map/erase_bench.cu new file mode 100644 index 000000000..c6e56eb07 --- /dev/null +++ b/benchmarks/hash_table/static_map/erase_bench.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_map::erase` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_erase( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform( + keys.begin(), keys.end(), pairs.begin(), [] __device__(auto i) { return pair_type(i, {}); }); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + state.add_element_count(num_keys); + + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + // static map with erase support + cuco::static_map map{ + size, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; + map.insert(pairs.begin(), pairs.end(), {}, {}, launch.get_stream()); + + timer.start(); + map.erase(keys.begin(), keys.end(), {}, {}, launch.get_stream()); + timer.stop(); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_erase( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(static_map_erase, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_erase_unique_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + 
.set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_map_erase, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_erase_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/static_map/find_bench.cu b/benchmarks/hash_table/static_map/find_bench.cu new file mode 100644 index 000000000..276a35e0b --- /dev/null +++ b/benchmarks/hash_table/static_map/find_bench.cu @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_map::find` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_find( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + cuco::static_map map{size, cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs.begin(), pairs.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + map.find(keys.begin(), keys.end(), result.begin(), {}, {}, launch.get_stream()); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_find( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(static_map_find, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_find_unique_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + 
+NVBENCH_BENCH_TYPES(static_map_find, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_find_unique_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); diff --git a/benchmarks/hash_table/static_map/insert_bench.cu b/benchmarks/hash_table/static_map/insert_bench.cu new file mode 100644 index 000000000..ef997bef8 --- /dev/null +++ b/benchmarks/hash_table/static_map/insert_bench.cu @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_map::insert` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); + + state.add_element_count(num_keys); + + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + cuco::static_map map{ + size, cuco::empty_key{-1}, cuco::empty_value{-1}, {}, launch.get_stream()}; + + timer.start(); + map.insert(pairs.begin(), pairs.end(), {}, {}, launch.get_stream()); + timer.stop(); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_insert( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(static_map_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_insert_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); + +NVBENCH_BENCH_TYPES(static_map_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + 
.set_name("static_map_insert_unique_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_map_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_insert_gaussian_skew") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Skew", defaults::SKEW_RANGE); diff --git a/benchmarks/hash_table/static_map_bench.cu b/benchmarks/hash_table/static_map_bench.cu deleted file mode 100644 index e2b15b05e..000000000 --- a/benchmarks/hash_table/static_map_bench.cu +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include - -#include - -#include -#include - -enum class dist_type { UNIQUE, UNIFORM, GAUSSIAN }; - -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) -{ - auto num_keys = std::distance(output_begin, output_end); - - std::random_device rd; - std::mt19937 gen{rd()}; - - switch (Dist) { - case dist_type::UNIQUE: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = i; - } - break; - case dist_type::UNIFORM: - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(gen())); - } - break; - case dist_type::GAUSSIAN: - std::normal_distribution<> dg{1e9, 1e7}; - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = std::abs(static_cast(dg(gen))); - } - break; - } -} - -/** - * @brief Generates input sizes and hash table occupancies - * - */ -static void generate_size_and_occupancy(benchmark::internal::Benchmark* b) -{ - for (auto size = 100'000'000; size <= 100'000'000; size *= 10) { - for (auto occupancy = 10; occupancy <= 90; occupancy += 10) { - b->Args({size, occupancy}); - } - } -} - -template -static void BM_static_map_insert(::benchmark::State& state) -{ - using map_type = cuco::static_map; - - std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; - - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector> d_pairs(h_pairs); - thrust::device_vector d_keys(h_keys); - - for (auto _ : state) { - map_type map{size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cudaEventRecord(start); - map.insert(d_pairs.begin(), d_pairs.end()); - cudaEventRecord(stop); - 
cudaEventSynchronize(stop); - - float ms; - cudaEventElapsedTime(&ms, start, stop); - - state.SetIterationTime(ms / 1000); - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -template -static void BM_static_map_search_all(::benchmark::State& state) -{ - using map_type = cuco::static_map; - - std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; - - map_type map{size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - - std::vector h_keys(num_keys); - std::vector h_values(num_keys); - std::vector> h_pairs(num_keys); - std::vector h_results(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector d_results(num_keys); - thrust::device_vector> d_pairs(h_pairs); - - map.insert(d_pairs.begin(), d_pairs.end()); - - for (auto _ : state) { - map.find(d_keys.begin(), d_keys.end(), d_results.begin()); - // TODO: get rid of sync and rewrite the benchmark with `nvbench` - // once https://github.com/NVIDIA/nvbench/pull/80 is merged - cudaDeviceSynchronize(); - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -template -static void BM_static_map_erase_all(::benchmark::State& state) -{ - using map_type = cuco::static_map; - - std::size_t num_keys = state.range(0); - float occupancy = state.range(1) / float{100}; - std::size_t size = num_keys / occupancy; - - // static map with erase support - map_type map{size, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}, - cuco::sentinel::erased_key{-2}}; - - std::vector h_keys(num_keys); - std::vector h_values(num_keys); - std::vector> h_pairs(num_keys); - 
std::vector h_results(num_keys); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (std::size_t i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector d_results(num_keys); - thrust::device_vector> d_pairs(h_pairs); - - for (auto _ : state) { - state.PauseTiming(); - map.insert(d_pairs.begin(), d_pairs.end()); - state.ResumeTiming(); - - map.erase(d_keys.begin(), d_keys.end()); - } - - state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) * - int64_t(state.range(0))); -} - -BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - 
-BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIFORM) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy) - ->UseManualTime(); - -BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::GAUSSIAN) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); - -BENCHMARK_TEMPLATE(BM_static_map_erase_all, int32_t, int32_t, dist_type::UNIQUE) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_occupancy); diff --git a/benchmarks/hash_table/static_multimap/count_bench.cu b/benchmarks/hash_table/static_multimap/count_bench.cu index 0659fe742..fa71c8d0c 100644 --- a/benchmarks/hash_table/static_multimap/count_bench.cu +++ b/benchmarks/hash_table/static_multimap/count_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,102 +14,88 @@ * limitations under the License. 
*/ -#include +#include +#include #include +#include #include #include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; /** - * @brief A benchmark evaluating multi-value `count` performance: - * - Total number of insertions: 100'000'000 - * - CG size: 8 + * @brief A benchmark evaluating `cuco::static_multimap::count` performance */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_count( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_count( + nvbench::state& state, nvbench::type_list) { - auto const num_keys = state.get_int64("NumInputs"); - auto const occupancy = state.get_float64("Occupancy"); - auto const matching_rate = state.get_float64("MatchingRate"); + using pair_type = cuco::pair; - std::size_t const size = num_keys / occupancy; + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); + std::size_t const size = num_keys / occupancy; - generate_keys(h_keys.begin(), h_keys.end()); + thrust::device_vector keys(num_keys); - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); - generate_probe_keys(matching_rate, h_keys.begin(), h_keys.end()); + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); - thrust::device_vector d_keys(h_keys); - thrust::device_vector> d_pairs(h_pairs); + gen.dropout(keys.begin(), keys.end(), matching_rate); - 
state.add_element_count(num_keys, "NumKeys"); + state.add_element_count(num_keys); cuco::static_multimap map{ - size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - map.insert(d_pairs.begin(), d_pairs.end()); + size, cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs.begin(), pairs.end()); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto count = map.count(d_keys.begin(), d_keys.end(), launch.get_stream()); + auto count = map.count(keys.begin(), keys.end(), launch.get_stream()); }); } -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_static_multimap_count( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_count( + nvbench::state& state, nvbench::type_list) { state.skip("Key should be the same type as Value."); } -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using d_type = - nvbench::enum_type_list; - -using multiplicity = nvbench::enum_type_list<1, 2, 4, 8, 16, 32, 64, 128, 256>; - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_count, - NVBENCH_TYPE_AXES(key_type, - value_type, - nvbench::enum_type_list, - multiplicity)) - .set_name("staic_multimap_count_uniform_multiplicity") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_count, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_count_occupancy") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. 
- .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", nvbench::range(0.1, 0.9, 0.1)) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_count, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_count_matching_rate") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1}); +NVBENCH_BENCH_TYPES(static_multimap_count, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_count_uniform_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_count, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_count_uniform_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_count, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_count_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); diff --git 
a/benchmarks/hash_table/static_multimap/insert_bench.cu b/benchmarks/hash_table/static_multimap/insert_bench.cu index 17f8723df..aa41044bb 100644 --- a/benchmarks/hash_table/static_multimap/insert_bench.cu +++ b/benchmarks/hash_table/static_multimap/insert_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,87 +14,87 @@ * limitations under the License. */ -#include +#include +#include #include +#include #include #include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; /** - * @brief A benchmark evaluating multi-value `insert` performance: - * - Total number of insertions: 100'000'000 - * - CG size: 8 + * @brief A benchmark evaluating `cuco::static_multimap::insert` performance */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_insert( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_insert( + nvbench::state& state, nvbench::type_list) { - auto const num_keys = state.get_int64("NumInputs"); - auto const occupancy = state.get_float64("Occupancy"); + using pair_type = cuco::pair; - std::size_t const size = num_keys / occupancy; + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); + std::size_t const size = num_keys / occupancy; - generate_keys(h_keys.begin(), h_keys.end()); + thrust::device_vector keys(num_keys); - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } + key_generator gen; + 
gen.generate(dist_from_state(state), keys.begin(), keys.end()); - thrust::device_vector> d_pairs(h_pairs); + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); - state.add_element_count(num_keys, "NumKeys"); + state.add_element_count(num_keys); state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { cuco::static_multimap map{ - size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + size, cuco::empty_key{-1}, cuco::empty_value{-1}, launch.get_stream()}; - // Use timers to explicitly mark the target region timer.start(); - map.insert(d_pairs.begin(), d_pairs.end(), launch.get_stream()); + map.insert(pairs.begin(), pairs.end(), launch.get_stream()); timer.stop(); }); } -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_static_multimap_insert( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_insert( + nvbench::state& state, nvbench::type_list) { state.skip("Key should be the same type as Value."); } -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using d_type = - nvbench::enum_type_list; - -using multiplicity = nvbench::enum_type_list<1, 2, 4, 8, 16, 32, 64, 128, 256>; - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_insert, - NVBENCH_TYPE_AXES(key_type, - value_type, - nvbench::enum_type_list, - multiplicity)) - .set_name("staic_multimap_insert_uniform_multiplicity") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
- .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_insert, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_insert_occupancy") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", nvbench::range(0.1, 0.9, 0.1)); +NVBENCH_BENCH_TYPES(static_multimap_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_insert_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_insert_unique_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_insert_gaussian_skew") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Skew", defaults::SKEW_RANGE); diff --git a/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu b/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu deleted file mode 100644 index a4a202161..000000000 --- a/benchmarks/hash_table/static_multimap/optimal_retrieve_bench.cu +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 
2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include - -#include - -/** - * @brief Generates input keys by a given number of repetitions per key. - * - */ -template -static void generate_multikeys(OutputIt output_begin, - OutputIt output_end, - size_t const multiplicity) -{ - auto num_keys = std::distance(output_begin, output_end); - - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = (i % (num_keys / multiplicity)) + 1; - } -} - -/** - * @brief A benchmark evaluating multi-value retrieval performance by varing number of repetitions - * per key: - * - 100'000'000 keys are inserted - * - Map occupancy is fixed at 0.4 - * - Number of repetitions per key: 1, ... 
, 128, 256 - * - */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_retrieve( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) -{ - std::size_t const num_keys = state.get_int64("NumInputs"); - auto const occupancy = state.get_float64("Occupancy"); - std::size_t const size = num_keys / occupancy; - std::size_t const multiplicity = state.get_int64("Multiplicity"); - - state.add_element_count(num_keys, "NumKeys"); - state.add_global_memory_writes(num_keys * 2); - - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); - - generate_multikeys(h_keys.begin(), h_keys.end(), multiplicity); - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector d_keys(h_keys); - thrust::device_vector> d_pairs(h_pairs); - - cuco::static_multimap, - cuco::double_hashing, - cuco::detail::MurmurHash3_32>> - map{size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - map.insert(d_pairs.begin(), d_pairs.end()); - - auto const output_size = map.count_outer(d_keys.begin(), d_keys.end()); - thrust::device_vector> d_results(output_size); - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.data().get(), launch.get_stream()); - }); -} - -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_retrieve( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) -{ - state.skip("Key should be the same type as Value."); -} - -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using cg_size = nvbench::enum_type_list<1, 2, 4, 8, 16, 32>; -using buffer_size = nvbench::enum_type_list<1, 2, 4, 8, 16>; - -NVBENCH_BENCH_TYPES(nvbench_retrieve, - NVBENCH_TYPE_AXES(key_type, value_type, cg_size, nvbench::enum_type_list<2>)) - .set_type_axes_names({"Key", "Value", "CGSize", "BufferSize"}) - 
.set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.4}) - .add_int64_power_of_two_axis("Multiplicity", nvbench::range(0, 8, 1)); - -NVBENCH_BENCH_TYPES( - nvbench_retrieve, - NVBENCH_TYPE_AXES(key_type, value_type, nvbench::enum_type_list<8>, buffer_size)) - .set_type_axes_names({"Key", "Value", "CGSize", "BufferSize"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.4}) - .add_int64_power_of_two_axis("Multiplicity", nvbench::range(0, 8, 1)); diff --git a/benchmarks/hash_table/static_multimap/pair_retrieve_bench.cu b/benchmarks/hash_table/static_multimap/pair_retrieve_bench.cu deleted file mode 100644 index b341fce76..000000000 --- a/benchmarks/hash_table/static_multimap/pair_retrieve_bench.cu +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include - -#include - -#include -#include -#include -#include -#include -#include - -namespace { -// Custom pair equal -template -struct pair_equal { - __device__ bool operator()(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) const - { - return lhs.first == rhs.first; - } -}; -} // anonymous namespace - -/** - * @brief A benchmark evaluating `pair_retrieve` performance: - * - CG size: 8 - */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_pair_retrieve( - nvbench::state& state, nvbench::type_list>) -{ - auto constexpr matching_rate = 0.5; - auto constexpr occupancy = 0.5; - auto constexpr dist = dist_type::UNIFORM; - - auto const num_input = state.get_int64("NumInputs"); - - std::size_t const size = num_input / occupancy; - - std::vector h_keys(num_input); - std::vector> h_pairs(num_input); - - generate_keys(h_keys.begin(), h_keys.end()); - - for (auto i = 0; i < num_input; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } - - thrust::device_vector> d_pairs(h_pairs); - auto const pair_begin = d_pairs.begin(); - - cuco::static_multimap map{ - size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - map.insert(pair_begin, pair_begin + num_input); - - generate_probe_keys(matching_rate, h_keys.begin(), h_keys.end()); - thrust::device_vector d_keys(h_keys); - - thrust::transform( - thrust::device, d_keys.begin(), d_keys.begin() + num_input, pair_begin, [] __device__(Key i) { - return cuco::pair_type{i, i}; - }); - - state.add_element_count(num_input, "NumInputs"); - - auto const output_size = - map.pair_count(pair_begin, pair_begin + num_input, pair_equal{}); - thrust::device_vector> d_results(output_size); - - auto out1_begin = thrust::make_zip_iterator( - thrust::make_tuple(thrust::make_discard_iterator(), thrust::make_discard_iterator())); - auto out2_begin = thrust::make_zip_iterator( - 
thrust::make_tuple(thrust::make_discard_iterator(), thrust::make_discard_iterator())); - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto [out1_end, out2_end] = map.pair_retrieve( - pair_begin, pair_begin + num_input, out1_begin, out2_begin, pair_equal{}); - }); -} - -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_static_multimap_pair_retrieve( - nvbench::state& state, nvbench::type_list>) -{ - state.skip("Key should be the same type as Value."); -} - -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using d_type = - nvbench::enum_type_list; - -using multiplicity = nvbench::enum_type_list<1, 2, 4, 8, 16, 32, 64, 128, 256>; - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_pair_retrieve, - NVBENCH_TYPE_AXES(key_type, value_type, multiplicity)) - .set_name("staic_multimap_pair_retrieve_uniform_multiplicity") - .set_type_axes_names({"Key", "Value", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", - {1'000, - 100'000, - 1'000'000, - 10'000'000, - 100'000'000}); // Total number of key/value pairs: 100'000'000 diff --git a/benchmarks/hash_table/static_multimap/query_bench.cu b/benchmarks/hash_table/static_multimap/query_bench.cu index 91c3ca645..7d6202297 100644 --- a/benchmarks/hash_table/static_multimap/query_bench.cu +++ b/benchmarks/hash_table/static_multimap/query_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,106 +14,89 @@ * limitations under the License. 
*/ -#include +#include +#include #include +#include #include #include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; /** - * @brief A benchmark evaluating multi-value query (`count` + `retrieve`) performance: - * - Total number of insertions: 100'000'000 - * - CG size: 8 + * @brief A benchmark evaluating 'cuco::static_multimap::query' (`count` + `retrieve`) performance */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_query( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_query( + nvbench::state& state, nvbench::type_list) { - auto const num_keys = state.get_int64("NumInputs"); - auto const occupancy = state.get_float64("Occupancy"); - auto const matching_rate = state.get_float64("MatchingRate"); + using pair_type = cuco::pair; - std::size_t const size = num_keys / occupancy; + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); + std::size_t const size = num_keys / occupancy; - generate_keys(h_keys.begin(), h_keys.end()); + thrust::device_vector keys(num_keys); - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); - generate_probe_keys(matching_rate, h_keys.begin(), h_keys.end()); + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); - thrust::device_vector d_keys(h_keys); - thrust::device_vector> d_pairs(h_pairs); + 
gen.dropout(keys.begin(), keys.end(), matching_rate); - state.add_element_count(num_keys, "NumKeys"); + state.add_element_count(num_keys); cuco::static_multimap map{ - size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - map.insert(d_pairs.begin(), d_pairs.end()); - - auto const output_size = map.count_outer(d_keys.begin(), d_keys.end()); - thrust::device_vector> d_results(output_size); + size, cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs.begin(), pairs.end()); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto count = map.count_outer(d_keys.begin(), d_keys.end(), launch.get_stream()); - map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.data().get(), launch.get_stream()); + auto count = map.count_outer(keys.begin(), keys.end(), launch.get_stream()); + map.retrieve_outer(keys.begin(), keys.end(), pairs.begin(), launch.get_stream()); }); } -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_static_multimap_query( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_query( + nvbench::state& state, nvbench::type_list) { state.skip("Key should be the same type as Value."); } -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using d_type = - nvbench::enum_type_list; - -using multiplicity = nvbench::enum_type_list<1, 2, 4, 8, 16, 32, 64, 128, 256>; - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_query, - NVBENCH_TYPE_AXES(key_type, - value_type, - nvbench::enum_type_list, - multiplicity)) - .set_name("staic_multimap_query_uniform_multiplicity") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
- .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_query, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_query_occupancy") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", nvbench::range(0.1, 0.9, 0.1)) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_query, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_query_matching_rate") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
- .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1}); +NVBENCH_BENCH_TYPES(static_multimap_query, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_query_uniform_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_query, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_query_uniform_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_query, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_query_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); diff --git a/benchmarks/hash_table/static_multimap/retrieve_bench.cu b/benchmarks/hash_table/static_multimap/retrieve_bench.cu index d92f3528e..e30fbe547 100644 --- a/benchmarks/hash_table/static_multimap/retrieve_bench.cu +++ b/benchmarks/hash_table/static_multimap/retrieve_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,105 +14,88 @@ * limitations under the License. 
*/ -#include +#include +#include #include +#include #include #include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; /** - * @brief A benchmark evaluating multi-value `retrieve` performance: - * - Total number of insertions: 100'000'000 - * - CG size: 8 + * @brief A benchmark evaluating `cuco::static_multimap::retrieve` performance */ -template -std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> nvbench_static_multimap_retrieve( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_multimap_retrieve( + nvbench::state& state, nvbench::type_list) { - auto const num_keys = state.get_int64("NumInputs"); - auto const occupancy = state.get_float64("Occupancy"); - auto const matching_rate = state.get_float64("MatchingRate"); + using pair_type = cuco::pair; - std::size_t const size = num_keys / occupancy; + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); - std::vector h_keys(num_keys); - std::vector> h_pairs(num_keys); + std::size_t const size = num_keys / occupancy; - generate_keys(h_keys.begin(), h_keys.end()); + thrust::device_vector keys(num_keys); - for (auto i = 0; i < num_keys; ++i) { - Key key = h_keys[i]; - Value val = h_keys[i]; - h_pairs[i].first = key; - h_pairs[i].second = val; - } + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); - generate_probe_keys(matching_rate, h_keys.begin(), h_keys.end()); + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, {}); + }); - thrust::device_vector d_keys(h_keys); - thrust::device_vector> d_pairs(h_pairs); + gen.dropout(keys.begin(), keys.end(), 
matching_rate); - state.add_element_count(num_keys, "NumKeys"); + state.add_element_count(num_keys); cuco::static_multimap map{ - size, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - map.insert(d_pairs.begin(), d_pairs.end()); - - auto const output_size = map.count_outer(d_keys.begin(), d_keys.end()); - thrust::device_vector> d_results(output_size); + size, cuco::empty_key{-1}, cuco::empty_value{-1}}; + map.insert(pairs.begin(), pairs.end()); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - map.retrieve_outer(d_keys.begin(), d_keys.end(), d_results.data().get(), launch.get_stream()); + map.retrieve_outer(keys.begin(), keys.end(), pairs.begin(), launch.get_stream()); }); } -template -std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> nvbench_static_multimap_retrieve( - nvbench::state& state, - nvbench::type_list, nvbench::enum_type>) +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_multimap_retrieve( + nvbench::state& state, nvbench::type_list) { state.skip("Key should be the same type as Value."); } -using key_type = nvbench::type_list; -using value_type = nvbench::type_list; -using d_type = - nvbench::enum_type_list; - -using multiplicity = nvbench::enum_type_list<1, 2, 4, 8, 16, 32, 64, 128, 256>; - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_retrieve, - NVBENCH_TYPE_AXES(key_type, - value_type, - nvbench::enum_type_list, - multiplicity)) - .set_name("staic_multimap_retrieve_uniform_multiplicity") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
- .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_retrieve, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_retrieve_occupancy") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. - .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", nvbench::range(0.1, 0.9, 0.1)) - .add_float64_axis("MatchingRate", {0.5}); - -NVBENCH_BENCH_TYPES(nvbench_static_multimap_retrieve, - NVBENCH_TYPE_AXES(key_type, value_type, d_type, nvbench::enum_type_list<8>)) - .set_name("staic_multimap_retrieve_matching_rate") - .set_type_axes_names({"Key", "Value", "Distribution", "Multiplicity"}) - .set_timeout(100) // Custom timeout: 100 s. Default is 15 s. - .set_max_noise(3) // Custom noise: 3%. By default: 0.5%. 
- .add_int64_axis("NumInputs", {100'000'000}) // Total number of key/value pairs: 100'000'000 - .add_float64_axis("Occupancy", {0.8}) - .add_float64_axis("MatchingRate", {0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1}); +NVBENCH_BENCH_TYPES(static_multimap_retrieve, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_retrieve_uniform_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_retrieve, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_retrieve_uniform_matching_rate") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_multimap_retrieve, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multimap_retrieve_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); diff --git a/benchmarks/hash_table/static_set/contains_bench.cu b/benchmarks/hash_table/static_set/contains_bench.cu new file mode 100644 index 000000000..35362ed9e --- /dev/null +++ b/benchmarks/hash_table/static_set/contains_bench.cu @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::contains` performance + */ +template +void static_set_contains(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::static_set set{size, cuco::empty_key{-1}}; + set.insert(keys.begin(), keys.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + set.contains(keys.begin(), keys.end(), result.begin(), {launch.get_stream()}); + }); +} + +NVBENCH_BENCH_TYPES(static_set_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_contains_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + 
nvbench::type_list)) + .set_name("static_set_contains_unique_matching_rate") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_set_contains, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_constains_unique_capacity") + .set_type_axes_names({"Key", "Distribution"}) + .add_int64_axis("NumInputs", defaults::N_RANGE_CACHE); diff --git a/benchmarks/hash_table/static_set/find_bench.cu b/benchmarks/hash_table/static_set/find_bench.cu new file mode 100644 index 000000000..e0ab9111c --- /dev/null +++ b/benchmarks/hash_table/static_set/find_bench.cu @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::find` performance + */ +template +void static_set_find(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::static_set set{size, cuco::empty_key{-1}}; + set.insert(keys.begin(), keys.end()); + + // TODO: would crash if not passing nullptr, why? + gen.dropout(keys.begin(), keys.end(), matching_rate, nullptr); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + set.find(keys.begin(), keys.end(), result.begin(), {launch.get_stream()}); + }); +} + +NVBENCH_BENCH_TYPES(static_set_find, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_find_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_find, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_find_unique_matching_rate") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_set_find, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + 
.set_name("static_set_find_unique_capacity") + .set_type_axes_names({"Key", "Distribution"}) + .add_int64_axis("NumInputs", defaults::N_RANGE_CACHE); diff --git a/benchmarks/hash_table/static_set/insert_bench.cu b/benchmarks/hash_table/static_set/insert_bench.cu new file mode 100644 index 000000000..48bc37fa4 --- /dev/null +++ b/benchmarks/hash_table/static_set/insert_bench.cu @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::insert` performance + */ +template +void static_set_insert(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + state.add_element_count(num_keys); + + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + cuco::experimental::static_set set{ + size, cuco::empty_key{-1}, {}, {}, {}, {launch.get_stream()}}; + + timer.start(); + set.insert(keys.begin(), keys.end(), {launch.get_stream()}); + timer.stop(); + }); +} + +NVBENCH_BENCH_TYPES(static_set_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_insert_uniform_multiplicity") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_insert_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_set_insert, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_insert_gaussian_skew") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Skew", defaults::SKEW_RANGE); diff --git 
a/benchmarks/hash_table/static_set/retrieve_all_bench.cu b/benchmarks/hash_table/static_set/retrieve_all_bench.cu new file mode 100644 index 000000000..17ea66384 --- /dev/null +++ b/benchmarks/hash_table/static_set/retrieve_all_bench.cu @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::retrieve_all` performance + */ +template +void static_set_retrieve_all(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::static_set set{size, cuco::empty_key{-1}}; + set.insert(keys.begin(), keys.end()); + + thrust::device_vector result(num_keys); + + state.add_element_count(num_keys); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto end = set.retrieve_all(result.begin(), {launch.get_stream()}); + }); +} + +NVBENCH_BENCH_TYPES(static_set_retrieve_all, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + 
.set_name("static_set_retrieve_all_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); diff --git a/benchmarks/hash_table/static_set/size_bench.cu b/benchmarks/hash_table/static_set/size_bench.cu new file mode 100644 index 000000000..fbddc3951 --- /dev/null +++ b/benchmarks/hash_table/static_set/size_bench.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_set::size` performance + */ +template +void static_set_size(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + state.add_element_count(num_keys); + + cuco::experimental::static_set set{size, cuco::empty_key{-1}}; + + set.insert(keys.begin(), keys.end()); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto const size = set.size({launch.get_stream()}); }); +} + +NVBENCH_BENCH_TYPES(static_set_size, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_size_unique_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); diff --git a/benchmarks/key_generator.hpp b/benchmarks/key_generator.hpp deleted file mode 100644 index bd90e6caa..000000000 --- a/benchmarks/key_generator.hpp +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include - -enum class dist_type { GAUSSIAN, GEOMETRIC, UNIFORM }; - -NVBENCH_DECLARE_ENUM_TYPE_STRINGS( - // Enum type: - dist_type, - // Callable to generate input strings: - // Short identifier used for tables, command-line args, etc. - // Used when context is available to figure out the enum type. - [](dist_type d) { - switch (d) { - case dist_type::GAUSSIAN: return "GAUSSIAN"; - case dist_type::GEOMETRIC: return "GEOMETRIC"; - case dist_type::UNIFORM: return "UNIFORM"; - default: return "ERROR"; - } - }, - // Callable to generate descriptions: - // If non-empty, these are used in `--list` to describe values. - // Used when context may not be available to figure out the type from the - // input string. - // Just use `[](auto) { return std::string{}; }` if you don't want these. - [](auto) { return std::string{}; }) - -template -static void generate_keys(OutputIt output_begin, OutputIt output_end) -{ - auto const num_keys = std::distance(output_begin, output_end); - - std::random_device rd; - std::mt19937 gen{rd()}; - - switch (Dist) { - case dist_type::GAUSSIAN: { - auto const mean = static_cast(num_keys / 2); - auto const dev = static_cast(num_keys / 5); - - std::normal_distribution<> distribution{mean, dev}; - - for (auto i = 0; i < num_keys; ++i) { - auto k = distribution(gen); - while (k >= num_keys) { - k = distribution(gen); - } - output_begin[i] = k; - } - break; - } - case dist_type::GEOMETRIC: { - auto const max = std::numeric_limits::max(); - auto const coeff = static_cast(num_keys) / static_cast(max); - // Random sampling in range [0, INT32_MAX] - std::geometric_distribution distribution{1e-9}; - - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = distribution(gen) * coeff; - } - break; - } - case dist_type::UNIFORM: { - std::uniform_int_distribution distribution{1, static_cast(num_keys / 
Multiplicity)}; - - for (auto i = 0; i < num_keys; ++i) { - output_begin[i] = distribution(gen); - } - break; - } - } // switch -} - -template -static void generate_probe_keys(double const matching_rate, - OutputIt output_begin, - OutputIt output_end) -{ - auto const num_keys = std::distance(output_begin, output_end); - auto const max = std::numeric_limits::max(); - - std::random_device rd; - std::mt19937 gen{rd()}; - - std::uniform_real_distribution rate_dist(0.0, 1.0); - std::uniform_int_distribution non_match_dist{static_cast(num_keys), max}; - - for (auto i = 0; i < num_keys; ++i) { - auto const tmp_rate = rate_dist(gen); - - if (tmp_rate > matching_rate) { output_begin[i] = non_match_dist(gen); } - } - - std::random_shuffle(output_begin, output_end); -} diff --git a/benchmarks/reduce_by_key/reduce_by_key.cu b/benchmarks/reduce_by_key/reduce_by_key.cu deleted file mode 100644 index 1de05a42f..000000000 --- a/benchmarks/reduce_by_key/reduce_by_key.cu +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/** - * @brief Generates input sizes and number of unique keys - * - */ -static void generate_size_and_num_unique(benchmark::internal::Benchmark* b) -{ - for (auto num_unique = 64; num_unique <= 1 << 20; num_unique <<= 1) { - for (auto size = 10'000'000; size <= 10'000'000; size *= 10) { - b->Args({size, num_unique}); - } - } -} - -template -void thrust_reduce_by_key(KeyRandomIterator keys_begin, - KeyRandomIterator keys_end, - ValueRandomIterator values_begin) -{ - using Key = typename thrust::iterator_traits::value_type; - using Value = typename thrust::iterator_traits::value_type; - - // Exact size of output is unknown (number of unique keys), but upper bounded - // by the number of keys - auto maximum_output_size = thrust::distance(keys_begin, keys_end); - thrust::device_vector output_keys(maximum_output_size); - thrust::device_vector output_values(maximum_output_size); - - thrust::sort_by_key(thrust::device, keys_begin, keys_end, values_begin); - thrust::reduce_by_key( - thrust::device, keys_begin, keys_end, values_begin, output_keys.begin(), output_values.end()); -} - -template -static void BM_thrust(::benchmark::State& state) -{ - auto const num_unique_keys = state.range(1); - for (auto _ : state) { - state.PauseTiming(); - thrust::device_vector keys(state.range(0)); - auto begin = thrust::make_counting_iterator(0); - thrust::transform( - begin, begin + state.range(0), keys.begin(), [num_unique_keys] __device__(auto i) { - return i % num_unique_keys; - }); - - thrust::device_vector values(state.range(0)); - state.ResumeTiming(); - thrust_reduce_by_key(keys.begin(), keys.end(), values.begin()); - cudaDeviceSynchronize(); - } -} -BENCHMARK_TEMPLATE(BM_thrust, int32_t, int32_t) - ->Unit(benchmark::kMillisecond) - ->Apply(generate_size_and_num_unique); - -BENCHMARK_TEMPLATE(BM_thrust, int64_t, int64_t) - ->Unit(benchmark::kMillisecond) - 
->Apply(generate_size_and_num_unique); - -// TODO: Hash based reduce by key benchmark diff --git a/benchmarks/synchronization.hpp b/benchmarks/synchronization.hpp deleted file mode 100644 index f0d7807be..000000000 --- a/benchmarks/synchronization.hpp +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -// Google Benchmark library -#include - -#include - -#include - -#define BENCH_CUDA_TRY(call) \ - do { \ - auto const status = (call); \ - if (cudaSuccess != status) { throw std::runtime_error("CUDA error detected."); } \ - } while (0) - -#define BENCH_ASSERT_CUDA_SUCCESS(expr) \ - do { \ - cudaError_t const status = (expr); \ - assert(cudaSuccess == status); \ - } while (0) -/** - * @brief This class serves as a wrapper for using `cudaEvent_t` as the user - * defined timer within the framework of google benchmark - * (https://github.com/google/benchmark). - * - * It is built on top of the idea of Resource acquisition is initialization - * (RAII). In the following we show a minimal example of how to use this class. - * - * \code{cpp} - * #include - * - * static void sample_cuda_benchmark(benchmark::State& state) { - * - * for (auto _ : state){ - * cudaStream_t stream = 0; - * - * // Create (Construct) an object of this class. You HAVE to pass in the - * // benchmark::State object you are using. 
It measures the time from its - * // creation to its destruction that is spent on the specified CUDA stream. - * // It also clears the L2 cache by cudaMemset'ing a device buffer that is of - * // the size of the L2 cache (if flush_l2_cache is set to true and there is - * // an L2 cache on the current device). - * cuda_event_timer raii(state, true, stream); // flush_l2_cache = true - * - * // Now perform the operations that is to be benchmarked - * sample_kernel<<<1, 256, 0, stream>>>(); // Possibly launching a CUDA kernel - * - * } - * } - * - * // Register the function as a benchmark. You will need to set the `UseManualTime()` - * // flag in order to use the timer embeded in this class. - * BENCHMARK(sample_cuda_benchmark)->UseManualTime(); - * \endcode - * - * - */ -class cuda_event_timer { - public: - /** - * @brief Constructs a `cuda_event_timer` beginning a manual timing range. - * - * Optionally flushes L2 cache. - * - * @param[in,out] state This is the benchmark::State whose timer we are going - * to update. - * @param[in] flush_l2_cache_ whether or not to flush the L2 cache before - * every iteration. - * @param[in] stream_ The CUDA stream we are measuring time on. 
- */ - cuda_event_timer(benchmark::State& state, bool flush_l2_cache = false, cudaStream_t stream = 0) - : p_state(&state), stream_(stream) - { - // flush all of L2$ - if (flush_l2_cache) { - int current_device = 0; - BENCH_CUDA_TRY(cudaGetDevice(¤t_device)); - - int l2_cache_bytes = 0; - BENCH_CUDA_TRY( - cudaDeviceGetAttribute(&l2_cache_bytes, cudaDevAttrL2CacheSize, current_device)); - - if (l2_cache_bytes > 0) { - const int memset_value = 0; - int* l2_cache_buffer = nullptr; - BENCH_CUDA_TRY(cudaMalloc(&l2_cache_buffer, l2_cache_bytes)); - BENCH_CUDA_TRY(cudaMemsetAsync(l2_cache_buffer, memset_value, l2_cache_bytes, stream_)); - BENCH_CUDA_TRY(cudaFree(l2_cache_buffer)); - } - } - - BENCH_CUDA_TRY(cudaEventCreate(&start_)); - BENCH_CUDA_TRY(cudaEventCreate(&stop_)); - BENCH_CUDA_TRY(cudaEventRecord(start_, stream_)); - } - - cuda_event_timer() = delete; - - /** - * @brief Destroy the `cuda_event_timer` and ending the manual time range. - * - */ - ~cuda_event_timer() - { - BENCH_ASSERT_CUDA_SUCCESS(cudaEventRecord(stop_, stream_)); - BENCH_ASSERT_CUDA_SUCCESS(cudaEventSynchronize(stop_)); - float milliseconds = 0.0f; - BENCH_ASSERT_CUDA_SUCCESS(cudaEventElapsedTime(&milliseconds, start_, stop_)); - p_state->SetIterationTime(milliseconds / (1000.0f)); - BENCH_ASSERT_CUDA_SUCCESS(cudaEventDestroy(start_)); - BENCH_ASSERT_CUDA_SUCCESS(cudaEventDestroy(stop_)); - } - - private: - cudaEvent_t start_; - cudaEvent_t stop_; - cudaStream_t stream_; - benchmark::State* p_state; -}; diff --git a/benchmarks/utils.hpp b/benchmarks/utils.hpp new file mode 100644 index 000000000..a8a84a3b6 --- /dev/null +++ b/benchmarks/utils.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +namespace cuco::benchmark { + +template +auto dist_from_state(nvbench::state const& state) +{ + if constexpr (std::is_same_v) { + return Dist{}; + } else if constexpr (std::is_same_v) { + auto const multiplicity = state.get_int64_or_default("Multiplicity", defaults::MULTIPLICITY); + return Dist{multiplicity}; + } else if constexpr (std::is_same_v) { + auto const skew = state.get_float64_or_default("Skew", defaults::SKEW); + return Dist{skew}; + } else { + CUCO_FAIL("Unexpected distribution type"); + } +} + +} // namespace cuco::benchmark + +NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::unique, "UNIQUE", "distribution::unique"); +NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::uniform, + "UNIFORM", + "distribution::uniform"); +NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::gaussian, + "GAUSSIAN", + "distribution::gaussian"); diff --git a/ci/build.sh b/ci/build.sh new file mode 100755 index 000000000..0baeaa68c --- /dev/null +++ b/ci/build.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -eo pipefail + +# Ensure the script is being executed in its containing directory +cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"; + +# Script defaults +CUDA_COMPILER=nvcc + +# Check if the correct number of arguments has been provided +function usage { + echo "Usage: $0 [OPTIONS] " + echo "The PARALLEL_LEVEL environment variable controls the amount of build parallelism. Default is the number of cores." + echo "Example: PARALLEL_LEVEL=8 $0 g++-8 14 \"70\" " + echo "Example: $0 clang++-8 17 \"70;75;80-virtual\" " + echo "Possible options: " + echo " -nvcc: path/to/nvcc" + echo " -v/--verbose: enable shell echo for debugging" + exit 1 +} + +# Check for extra options +# While there are more than 3 arguments, parse switches/options +while [ "$#" -gt 3 ] +do + case "${1}" in + -h) usage ;; + -help) usage ;; + --help) usage ;; + --verbose) VERBOSE=1; shift ;; + -v) VERBOSE=1; shift ;; + -nvcc) CUDA_COMPILER="${2}"; shift 2;; + *) usage ;; + esac +done + +if [ $VERBOSE ]; then + set -x +fi + +if [ "$#" -ne 3 ]; then + echo "Invalid number of arguments" + usage +fi + +# Begin processing unsets after option parsing +set -u + +# Assign command line arguments to variables +readonly HOST_COMPILER=$(which $1) +readonly CXX_STANDARD=$2 + +# Replace spaces, commas and semicolons with semicolons for CMake list +readonly GPU_ARCHS=$(echo $3 | tr ' ,' ';') + +readonly PARALLEL_LEVEL=${PARALLEL_LEVEL:=$(nproc)} +readonly NVCC_VERSION=$($CUDA_COMPILER --version | grep release | awk '{print $6}' | cut -c2-) + +if [ -z ${DEVCONTAINER_NAME+x} ]; then + 
BUILD_DIR=../build/local +else + BUILD_DIR=../build/${DEVCONTAINER_NAME} +fi + +# The most recent build will always be symlinked to cuCollections/build/latest +mkdir -p $BUILD_DIR +rm -f ../build/latest +ln -sf $BUILD_DIR ../build/latest +export BUILD_DIR +echo $BUILD_DIR + +CMAKE_OPTIONS=" + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_STANDARD=${CXX_STANDARD} \ + -DCMAKE_CUDA_STANDARD=${CXX_STANDARD} \ + -DCMAKE_CXX_COMPILER=${HOST_COMPILER} \ + -DCMAKE_CUDA_COMPILER=${CUDA_COMPILER} \ + -DCMAKE_CUDA_HOST_COMPILER=${HOST_COMPILER} \ + -DCMAKE_CUDA_ARCHITECTURES=${GPU_ARCHS} \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ +" + +echo "========================================" +echo "Begin build" +echo "pwd=$(pwd)" +echo "NVCC_VERSION=$NVCC_VERSION" +echo "HOST_COMPILER=$HOST_COMPILER" +echo "CXX_STANDARD=$CXX_STANDARD" +echo "GPU_ARCHS=$GPU_ARCHS" +echo "PARALLEL_LEVEL=$PARALLEL_LEVEL" +echo "BUILD_DIR=$BUILD_DIR" +echo "========================================" + +function configure(){ + cmake -S .. -B $BUILD_DIR $CMAKE_OPTIONS +} + +function build(){ + source "./sccache_stats.sh" start + cmake --build $BUILD_DIR --parallel $PARALLEL_LEVEL + echo "Build complete" + source "./sccache_stats.sh" end +} + +configure +build \ No newline at end of file diff --git a/ci/checks/style.sh b/ci/checks/style.sh deleted file mode 100755 index fbbe1d120..000000000 --- a/ci/checks/style.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018-2022, NVIDIA CORPORATION. -############################## -# cuCollections Style Tester # -############################## - -# Ignore errors and set path -set +e -PATH=/conda/bin:$PATH -# LC_ALL=C.UTF-8 -# LANG=C.UTF-8 - -# Activate common conda env -. /opt/conda/etc/profile.d/conda.sh -conda activate rapids - -# Run clang-format and check for a consistent code format -CLANG_FORMAT=`pre-commit run clang-format --all-files 2>&1` -CLANG_FORMAT_RETVAL=$? - -# Run doxygen check -DOXYGEN_CHECK=`ci/checks/doxygen.sh` -DOXYGEN_CHECK_RETVAL=$? 
- -echo -e "$DOXYGEN_CHECK" - -RETVALS=( - $CLANG_FORMAT_RETVAL -) -IFS=$'\n' -RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1` - -exit $RETVAL diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh deleted file mode 100644 index 8ae26bcf4..000000000 --- a/ci/gpu/build.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/bin/bash -# Copyright (c) 2021, NVIDIA CORPORATION. -##############################################i### -# cuCollections GPU build and test script for CI # -################################################## -set -e -NUMARGS=$# -ARGS=$* - -# Arg parsing function -function hasArg { - (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") -} - -# Set path and build parallel level -export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH -export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} -export CUDA_REL=${CUDA_VERSION%.*} - -# Set home to the job's workspace -export HOME=$WORKSPACE - -################################################################################ -# SETUP - Check environment -################################################################################ - -gpuci_logger "Check environment" -env - -gpuci_logger "Check GPU usage" -nvidia-smi - -gpuci_logger "Install Dependencies" -. /opt/conda/etc/profile.d/conda.sh -conda create -y -n cuda -c nvidia -c conda-forge "cudatoolkit=${CUDA_VER}" "cmake>=3.18.*" -conda activate cuda - -gpuci_logger "Check versions" -python --version - -gpuci_logger "Check conda environment" -conda info -conda config --show-sources -conda list --show-channel-urls - -################################################################################ -# BUILD - Build from Source -################################################################################ - -gpuci_logger "Build Tests/Examples" -cd ${WORKSPACE} -mkdir -p build -cd build -cmake .. 
-make - -################################################################################ -# TEST - Run Tests -################################################################################ - -if hasArg --skip-tests; then - gpuci_logger "Skipping Tests" -else - gpuci_logger "Check GPU usage" - nvidia-smi - cd ${WORKSPACE}/build/tests - ctest . - - # This block may provide more verbose testing output since each test is ran individually - #cd ${WORKSPACE}/build/tests - #for gt in "$WORKSPACE/build/tests"* ; do - # test_name=$(basename ${gt}) - # echo "Running $test_name" - # ${gt} - #done -fi diff --git a/ci/matrix.yml b/ci/matrix.yml new file mode 100644 index 000000000..5916dd113 --- /dev/null +++ b/ci/matrix.yml @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +cuda_oldest: &cuda_oldest '11.8' +cuda_newest: &cuda_newest '12.2' + +# The GPUs to test on +# Note: This assumes that the appropriate gpu_build_archs are set to include building for the GPUs listed here +gpus: + - 'v100' + +# The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers +devcontainer_version: '23.08' + +# Each environment below will generate a unique build/test job +# See the "compute-matrix" job in the workflow for how this is parsed and used +# cuda: The CUDA Toolkit version +# os: The operating system used +# cpu: The CPU architecture +# compiler: The compiler to use +# name: The compiler name +# version: The compiler version +# exe: The unverionsed compiler binary name +# To use the system's default compiler set "exe: 'c++'" or "name: 'cc'" +# gpu_build_archs: The GPU architectures to build for (comma-separated list) +# std: The C++ standards to build for +# This field is unique as it will generate an independent build/test job for each value + +# Configurations that will run for every PR +pull_request: + nvcc: + # There is currently only one CUDA 11.8 image available which comes with the system's default C++ compiler. For ubuntu22.04, we know that the default CC is gcc11.3 + - {cuda: *cuda_oldest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'gcc', version: '11', exe: 'c++'}, gpu_build_archs: '60', std: [17], jobs: ['build', 'test']} + - {cuda: *cuda_newest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'gcc', version: '12', exe: 'g++'}, gpu_build_archs: '70', std: [17], jobs: ['build', 'test']} \ No newline at end of file diff --git a/ci/checks/doxygen.sh b/ci/pre-commit/doxygen.sh similarity index 59% rename from ci/checks/doxygen.sh rename to ci/pre-commit/doxygen.sh index b9a243cd1..8f387c6ea 100755 --- a/ci/checks/doxygen.sh +++ b/ci/pre-commit/doxygen.sh @@ -1,8 +1,18 @@ #!/bin/bash -# Copyright (c) 2022, NVIDIA CORPORATION. 
-######################################## -# cuCollections doxygen warnings check # -######################################## +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # skip if doxygen is not installed if ! [ -x "$(command -v doxygen)" ]; then @@ -16,9 +26,9 @@ function version { echo "$@" | awk -F. '{ printf("%d%03d%03d%03d\n", $1,$2,$3,$4 # Doxygen supported version 1.8.20 to 1.9.1 DOXYGEN_VERSION=$(doxygen --version) if [ $(version "$DOXYGEN_VERSION") -lt $(version "1.8.20") ] || [ $(version $DOXYGEN_VERSION) -gt $(version "1.9.1") ]; then - echo -e "Warning: Unsupported Doxygen version $DOXYGEN_VERSION" - echo -e "Expecting Doxygen version from 1.8.20 to 1.9.1" - exit 0 + echo -e "Warning: Unsupported Doxygen version $DOXYGEN_VERSION" + echo -e "Expecting Doxygen version from 1.8.20 to 1.9.1" + exit 0 fi # Run doxygen, ignore missing tag files error diff --git a/ci/sccache_hit_rate.sh b/ci/sccache_hit_rate.sh new file mode 100755 index 000000000..8b6d2d3f5 --- /dev/null +++ b/ci/sccache_hit_rate.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +# Ensure two arguments are provided +if [ $# -ne 2 ]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +# Print the contents of the before file +echo "=== Contents of $1 ===" >&2 +cat $1 >&2 +echo "=== End of $1 ===" >&2 + +# Print the contents of the after file +echo "=== Contents of $2 ===" >&2 +cat $2 >&2 +echo "=== End of $2 ===" >&2 + +# Extract compile requests and cache hits from the before and after files +requests_before=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$1") +hits_before=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$1") +requests_after=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$2") +hits_after=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$2") + +# Calculate the differences to find out how many new requests and hits +requests_diff=$((requests_after - requests_before)) +hits_diff=$((hits_after - hits_before)) + +echo "New Compile Requests: $requests_diff" >&2 +echo "New Hits: $hits_diff" >&2 + +# Calculate and print the hit rate +if [ $requests_diff -eq 0 ]; then + echo "No new compile requests, hit rate is not applicable" +else + hit_rate=$(awk -v hits=$hits_diff -v requests=$requests_diff 'BEGIN {printf "%.2f", hits/requests * 100}') + echo "sccache hit rate: $hit_rate%" >&2 + echo "$hit_rate" +fi \ No newline at end of file diff --git a/ci/sccache_stats.sh b/ci/sccache_stats.sh new file mode 100755 index 000000000..a834347cb --- /dev/null +++ b/ci/sccache_stats.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script prints the sccache hit rate between two calls to sccache --show-stats. +# It should be sourced in your script before and after the operations you want to profile, +# with the 'start' or 'end' argument respectively. + +mode=$1 + +if [[ "$mode" != "start" && "$mode" != "end" ]]; then + echo "Invalid mode: $mode" + echo "Usage: $0 {start|end}" + exit 1 +fi + +case $mode in + start) + export SCCACHE_START_HITS=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}') + export SCCACHE_START_MISSES=$(sccache --show-stats | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}') + ;; + end) + if [[ -z ${SCCACHE_START_HITS+x} || -z ${SCCACHE_START_MISSES+x} ]]; then + echo "Error: start stats not collected. Did you call this script with 'start' before your operations?" 
+ exit 1 + fi + + final_hits=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}') + final_misses=$(sccache --show-stats | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}') + hits=$((final_hits - SCCACHE_START_HITS)) + misses=$((final_misses - SCCACHE_START_MISSES)) + total=$((hits + misses)) + + prefix="" + if [ ${GITHUB_ACTIONS:-false} = "true" ]; then + prefix="::notice::" + fi + + if (( total > 0 )); then + hit_rate=$(awk -v hits="$hits" -v total="$total" 'BEGIN { printf "%.2f", (hits / total) * 100 }') + echo ${prefix}"sccache hits: $hits | misses: $misses | hit rate: $hit_rate%" + else + echo ${prefix}"sccache stats: N/A No new compilation requests" + fi + unset SCCACHE_START_HITS + unset SCCACHE_START_MISSES + ;; +esac \ No newline at end of file diff --git a/ci/test.sh b/ci/test.sh new file mode 100755 index 000000000..cfcce2acd --- /dev/null +++ b/ci/test.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Ensure the script is being executed in its containing directory +cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"; + +source ./build.sh "$@" + +ctest --test-dir ${BUILD_DIR}/tests --output-on-failure --timeout 60 + +echo "Test complete" \ No newline at end of file diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 0a83a3cb1..91e1417aa 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. #============================================================================= -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) ################################################################################################### # - compiler function ----------------------------------------------------------------------------- @@ -33,7 +33,11 @@ endfunction(ConfigureExample) ### Example sources ############################################################################### ################################################################################################### +ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu") +ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu") +ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu") ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu") 
ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_view_example.cu") ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") +ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu") ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu") diff --git a/examples/static_map/count_by_key_example.cu b/examples/static_map/count_by_key_example.cu new file mode 100644 index 000000000..4c8cfdb11 --- /dev/null +++ b/examples/static_map/count_by_key_example.cu @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include + +/** + * @file count_by_key_example.cu + * @brief Demonstrates usage of the device side APIs for individual operations like insert/find in + * the context of a count-by-key operation, i.e. for a histogram over keys. + * + * Individual operations like a single insert or find can be performed in device code via the + * static_map "device_view" types. + * + * @note This example is for demonstration purposes only. It is not intended to show the most + * performant way to do the example algorithm. 
+ * + */ + +/** + * @brief Inserts keys and counts how often they occur in the input sequence. + * + * @tparam BlockSize CUDA block size + * @tparam Map Type of the map returned from static_map::get_device_mutable_view + * @tparam KeyIter Input iterator whose value_type convertible to Map::key_type + * @tparam UniqueIter Output iterator whose value_type is convertible to uint64_t + * + * @param[in] map_view View of the map into which inserts will be performed + * @param[in] key_begin The beginning of the range of keys to insert + * @param[in] num_keys The total number of keys and values + * @param[out] num_unique_keys The total number of distinct keys inserted + */ +template +__global__ void count_by_key(Map map_view, + KeyIter keys, + uint64_t num_keys, + UniqueIter num_unique_keys) +{ + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + int64_t const loop_stride = gridDim.x * BlockSize; + int64_t idx = BlockSize * blockIdx.x + threadIdx.x; + + uint64_t thread_unique_keys = 0; + while (idx < num_keys) { + // insert key into the map with a count of 1 + auto [slot, is_new_key] = map_view.insert_and_find({keys[idx], 1}); + if (is_new_key) { + // first occurrence of the key + thread_unique_keys++; + } else { + // key is already in the map -> increment count + slot->second.fetch_add(1, cuda::memory_order_relaxed); + } + idx += loop_stride; + } + + // compute number of successfully inserted new keys for each block + // and atomically add to the grand total + uint64_t block_unique_keys = BlockReduce(temp_storage).Sum(thread_unique_keys); + if (threadIdx.x == 0) { + cuda::atomic_ref grid_unique_keys( + *thrust::raw_pointer_cast(num_unique_keys)); + grid_unique_keys.fetch_add(block_unique_keys, cuda::memory_order_relaxed); + } +} + +int main(void) +{ + // Note that if (sizeof(Key)+sizeof(Count))>8 then the minimum required CUDA architecture is sm_70 + using Key = uint32_t; + using Count = uint32_t; + + // Empty slots are 
represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + Key constexpr empty_key_sentinel = static_cast(-1); + Count constexpr empty_value_sentinel = static_cast(-1); + + // Number of keys to be inserted + auto constexpr num_keys = 50'000; + // How often each distinct key occurs in the example input + auto constexpr key_duplicates = 5; + static_assert((num_keys % key_duplicates) == 0, + "For this example, num_keys must be divisible by key_duplicates in order to pass " + "the unit test."); + + thrust::device_vector insert_keys(num_keys); + // Create a sequence of keys. Eeach distinct key has key_duplicates many matches. + thrust::transform( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(insert_keys.size()), + insert_keys.begin(), + [] __device__(auto i) { return static_cast(i % (num_keys / key_duplicates)); }); + + // Allocate storage for count of number of unique keys + thrust::device_vector num_unique_keys(1); + + // Compute capacity based on a 50% load factor + auto constexpr load_factor = 0.5; + + // If the number of unique keys is known in advance, we can use it to calculate the map capacity + std::size_t const capacity = std::ceil((num_keys / key_duplicates) / load_factor); + // If we can't give an estimated upper bound on the number of unique keys + // we conservatively assume each key in the input is distinct + // std::size_t const capacity = std::ceil(num_keys / load_factor); + + // Constructs a map with "capacity" slots. 
+ cuco::static_map map{ + capacity, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; + + // Get a non-owning, mutable view of the map that allows inserts to pass by value into the kernel + auto device_insert_view = map.get_device_mutable_view(); + + auto constexpr block_size = 256; + auto const grid_size = (num_keys + block_size - 1) / block_size; + count_by_key<<>>( + device_insert_view, insert_keys.begin(), num_keys, num_unique_keys.data()); + + // Retrieve contents of all the non-empty slots in the map + thrust::device_vector result_keys(num_unique_keys[0]); + thrust::device_vector result_counts(num_unique_keys[0]); + map.retrieve_all(result_keys.begin(), result_counts.begin()); + + // Check if the number of result keys is correct + auto num_keys_check = num_unique_keys[0] == (num_keys / key_duplicates); + + // Iterate over all result counts and verify that they are correct + auto counts_check = thrust::all_of( + result_counts.begin(), result_counts.end(), [] __host__ __device__(Count const count) { + return count == key_duplicates; + }); + + if (num_keys_check and counts_check) { std::cout << "Success!\n"; } + + return 0; +} diff --git a/examples/static_map/custom_type_example.cu b/examples/static_map/custom_type_example.cu index efc04e0c8..e150a858e 100644 --- a/examples/static_map/custom_type_example.cu +++ b/examples/static_map/custom_type_example.cu @@ -93,9 +93,7 @@ int main(void) // Construct a map with 100,000 slots using the given empty key/value sentinels. Note the // capacity is chosen knowing we will insert 80,000 keys, for an load factor of 80%. 
cuco::static_map map{ - 100'000, - cuco::sentinel::empty_key{empty_key_sentinel}, - cuco::sentinel::empty_value{empty_value_sentinel}}; + 100'000, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; // Inserts 80,000 pairs into the map by using the custom hasher and custom equality callable map.insert(pairs_begin, pairs_begin + num_pairs, custom_hash{}, custom_key_equals{}); diff --git a/examples/static_map/device_view_example.cu b/examples/static_map/device_view_example.cu index a65e12162..f3414e3ff 100644 --- a/examples/static_map/device_view_example.cu +++ b/examples/static_map/device_view_example.cu @@ -135,9 +135,8 @@ int main(void) std::size_t const capacity = std::ceil(num_keys / load_factor); // Constructs a map with "capacity" slots using -1 and -1 as the empty key/value sentinels. - cuco::static_map map{capacity, - cuco::sentinel::empty_key{empty_key_sentinel}, - cuco::sentinel::empty_value{empty_value_sentinel}}; + cuco::static_map map{ + capacity, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; // Get a non-owning, mutable view of the map that allows inserts to pass by value into the kernel auto device_insert_view = map.get_device_mutable_view(); diff --git a/examples/static_map/host_bulk_example.cu b/examples/static_map/host_bulk_example.cu index d682442fb..746857511 100644 --- a/examples/static_map/host_bulk_example.cu +++ b/examples/static_map/host_bulk_example.cu @@ -54,9 +54,8 @@ int main(void) std::size_t const capacity = std::ceil(num_keys / load_factor); // Constructs a map with "capacity" slots using -1 and -1 as the empty key/value sentinels. - cuco::static_map map{capacity, - cuco::sentinel::empty_key{empty_key_sentinel}, - cuco::sentinel::empty_value{empty_value_sentinel}}; + cuco::static_map map{ + capacity, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; // Create a sequence of keys and values {{0,0}, {1,1}, ... 
{i,i}} thrust::device_vector insert_keys(num_keys); diff --git a/examples/static_multimap/host_bulk_example.cu b/examples/static_multimap/host_bulk_example.cu index 149abd112..d1fe5589a 100644 --- a/examples/static_multimap/host_bulk_example.cu +++ b/examples/static_multimap/host_bulk_example.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,9 +38,7 @@ int main(void) // sentinels. Note the capacity is chosen knowing we will insert 50,000 keys, // for an load factor of 50%. cuco::static_multimap map{ - N * 2, - cuco::sentinel::empty_key{empty_key_sentinel}, - cuco::sentinel::empty_value{empty_value_sentinel}}; + N * 2, cuco::empty_key{empty_key_sentinel}, cuco::empty_value{empty_value_sentinel}}; thrust::device_vector> pairs(N); @@ -62,13 +60,12 @@ int main(void) // The `_outer` suffix indicates that the occurrence of a non-match is 1. 
auto const output_size = map.count_outer(keys_to_find.begin(), keys_to_find.end()); - thrust::device_vector> d_results(output_size); + thrust::device_vector> d_results(output_size); // Finds all keys {0, 1, 2, ...} and stores associated key/value pairs into `d_results` // If a key `keys_to_find[i]` doesn't exist, `d_results[i].second == empty_value_sentinel` - auto output_end = - map.retrieve_outer(keys_to_find.begin(), keys_to_find.end(), d_results.data().get()); - auto retrieve_size = output_end - d_results.data().get(); + auto output_end = map.retrieve_outer(keys_to_find.begin(), keys_to_find.end(), d_results.begin()); + auto retrieve_size = output_end - d_results.begin(); // The total number of outer matches should be `N + N / 2` assert(not(output_size == retrieve_size == N + N / 2)); diff --git a/examples/static_set/device_ref_example.cu b/examples/static_set/device_ref_example.cu new file mode 100644 index 000000000..52e41cf45 --- /dev/null +++ b/examples/static_set/device_ref_example.cu @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include + +#include +#include + +/** + * @file device_reference_example.cu + * @brief Demonstrates usage of the static_set device-side APIs. + * + * static_set provides a non-owning reference which can be used to interact with + * the container from within device code. 
+ */ + +// insert a set of keys into a hash set using one cooperative group for each task +template +__global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::size_t n) +{ + namespace cg = cooperative_groups; + + constexpr auto cg_size = SetRef::cg_size; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + + int64_t const loop_stride = gridDim.x * blockDim.x / cg_size; + int64_t idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; + + while (idx < n) { + set.insert(tile, *(keys + idx)); + idx += loop_stride; + } +} + +template +__global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, OutputIterator found) +{ + int64_t const loop_stride = gridDim.x * blockDim.x; + int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; + + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + + while (idx < n) { + found[idx] = set.contains(tile, *(keys + idx)); + idx += loop_stride; + } +} + +int main(void) +{ + using Key = int; + + // Empty slots are represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + Key constexpr empty_key_sentinel = -1; + + // Number of keys to be inserted + std::size_t constexpr num_keys = 50'000; + + // Compute capacity based on a 50% load factor + auto constexpr load_factor = 0.5; + std::size_t const capacity = std::ceil(num_keys / load_factor); + + using set_type = cuco::experimental::static_set; + + // Constructs a hash set with at least "capacity" slots using -1 as the empty key sentinel. + set_type set{capacity, cuco::empty_key{empty_key_sentinel}}; + + // Create a sequence of keys {0, 1, 2, .., i} + thrust::device_vector keys(num_keys); + thrust::sequence(keys.begin(), keys.end(), 0); + + // Insert the first half of the keys into the set + set.insert(keys.begin(), keys.begin() + num_keys / 2); + + // Insert the second half of keys using a custom CUDA kernel. 
+ custom_cooperative_insert<<<128, 128>>>( + set.ref(cuco::experimental::insert), keys.begin() + num_keys / 2, num_keys / 2); + + // Storage for result + thrust::device_vector found(num_keys); + + // Check if all keys are now contained in the set. Note that we pass a reference that already has + // the `contains` operator. + // In general, using two or more reference objects to the same container but with + // a different set of operators concurrently is undefined behavior. + // This does not apply here since the two kernels do not overlap. + custom_contains<<<128, 128>>>( + set.ref(cuco::experimental::contains), keys.begin(), num_keys, found.begin()); + + // Verify that all keys have been found + bool const all_keys_found = thrust::all_of(found.begin(), found.end(), thrust::identity()); + + if (all_keys_found) { std::cout << "Success! Found all keys.\n"; } + + return 0; +} diff --git a/examples/static_set/device_subsets_example.cu b/examples/static_set/device_subsets_example.cu new file mode 100644 index 000000000..827342f95 --- /dev/null +++ b/examples/static_set/device_subsets_example.cu @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +/** + * @file device_subsets_example.cu + * @brief Demonstrates how to use one bulk set storage to create multiple subsets and perform + * individual operations via device-side ref APIs. + * + * To optimize memory usage, especially when dealing with expensive data allocation and multiple + * hashsets, a practical solution involves employing a single bulk storage for generating subsets. + * This eliminates the need for separate memory allocation and deallocation for each container. This + * can be achieved by using the lightweight non-owning ref type. + * + * @note This example is for demonstration purposes only. It is not intended to show the most + * performant way to do the example algorithm. + */ + +auto constexpr cg_size = 8; ///< A CUDA Cooperative Group of 8 threads to handle each subset +auto constexpr window_size = 1; ///< Number of concurrent slots handled by each thread +auto constexpr N = 10; ///< Number of elements to insert and query + +using key_type = int; ///< Key type +using probing_scheme_type = cuco::experimental::linear_probing< + cg_size, + cuco::default_hash_function>; ///< Type controls CG granularity and probing scheme + ///< (linear probing v.s. double hashing) +/// Type of bulk allocation storage +using storage_type = cuco::experimental::aow_storage; +/// Lightweight non-owning storage ref type +using storage_ref_type = typename storage_type::ref_type; +using ref_type = cuco::experimental::static_set_ref, + probing_scheme_type, + storage_ref_type>; ///< Set ref type + +/// Sample data to insert and query +__device__ constexpr std::array data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19}; +/// Empty slots are represented by reserved "sentinel" values. These values should be selected such +/// that they never occur in your input data. 
+key_type constexpr empty_key_sentinel = -1; + +/** + * @brief Inserts sample data into subsets by using cooperative group + * + * Each Cooperative Group creates its own subset and inserts `N` sample data. + * + * @param set_refs Pointer to the array of subset objects + */ +__global__ void insert(ref_type* set_refs) +{ + namespace cg = cooperative_groups; + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + // Get subset (or CG) index + auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; + + auto raw_set_ref = *(set_refs + idx); + auto insert_set_ref = std::move(raw_set_ref).with(cuco::experimental::insert); + + // Insert `N` elemtns into the set with CG insert + for (int i = 0; i < N; i++) { + insert_set_ref.insert(tile, data[i]); + } +} + +/** + * @brief All inserted data can be found + * + * Each Cooperative Group reconstructs its own subset ref based on the storage parameters and + * verifies all inserted data can be found. + * + * @param set_refs Pointer to the array of subset objects + */ +__global__ void find(ref_type* set_refs) +{ + namespace cg = cooperative_groups; + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; + + auto raw_set_ref = *(set_refs + idx); + auto find_set_ref = std::move(raw_set_ref).with(cuco::experimental::find); + + // Result denoting if any of the inserted data is not found + __shared__ int result; + if (threadIdx.x == 0) { result = 0; } + __syncthreads(); + + for (int i = 0; i < N; i++) { + // Query the set with inserted data + auto const found = find_set_ref.find(tile, data[i]); + // Record if the inserted data has been found + atomicOr(&result, *found != data[i]); + } + __syncthreads(); + + if (threadIdx.x == 0) { + // If the result is still 0, all inserted data are found. + if (result == 0) { printf("Success! 
Found all inserted elements.\n"); } + } +} + +int main() +{ + // Number of subsets to be created + auto constexpr num = 16; + // Each subset may have a different requested size + auto constexpr subset_sizes = + std::array{20, 20, 20, 20, 30, 30, 30, 30, 40, 40, 40, 40, 50, 50, 50, 50}; + + auto valid_sizes = std::vector(); + valid_sizes.reserve(num); + + for (size_t i = 0; i < num; ++i) { + valid_sizes.emplace_back( + static_cast(cuco::experimental::make_window_extent(subset_sizes[i]))); + } + + std::vector offsets(num + 1, 0); + + // prefix sum to compute offsets and total number of windows + std::size_t current_sum = 0; + for (std::size_t i = 0; i < valid_sizes.size(); ++i) { + current_sum += valid_sizes[i]; + offsets[i + 1] = current_sum; + } + + // total number of windows is located at the back of the offsets array + auto const total_num_windows = offsets.back(); + + // Create a single bulk storage used by all subsets + auto set_storage = storage_type{total_num_windows}; + // Initializes the storage with the given sentinel + set_storage.initialize(empty_key_sentinel); + + std::vector set_refs; + + // create subsets + for (std::size_t i = 0; i < num; ++i) { + storage_ref_type storage_ref{valid_sizes[i], set_storage.data() + offsets[i]}; + set_refs.emplace_back( + ref_type{cuco::empty_key{empty_key_sentinel}, {}, {}, storage_ref}); + } + + thrust::device_vector d_set_refs(set_refs); + + // Insert sample data + insert<<<1, 128>>>(d_set_refs.data().get()); + // Find all inserted data + find<<<1, 128>>>(d_set_refs.data().get()); + + return 0; +} diff --git a/examples/static_set/host_bulk_example.cu b/examples/static_set/host_bulk_example.cu new file mode 100644 index 000000000..3b8c4deb4 --- /dev/null +++ b/examples/static_set/host_bulk_example.cu @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include +#include + +/** + * @file host_bulk_example.cu + * @brief Demonstrates usage of the static_set "bulk" host APIs. + * + * The bulk APIs are only invocable from the host and are used for doing operations like `insert` or + * `contains` on a set of keys. + * + */ +int main(void) +{ + using Key = int; + + // Empty slots are represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + Key constexpr empty_key_sentinel = -1; + + // Number of keys to be inserted + std::size_t constexpr num_keys = 50'000; + + // Compute capacity based on a 50% load factor + auto constexpr load_factor = 0.5; + std::size_t const capacity = std::ceil(num_keys / load_factor); + + // Constructs a set with at least `capacity` slots using -1 as the empty keys sentinel. + cuco::experimental::static_set set{capacity, cuco::empty_key{empty_key_sentinel}}; + + // Create a sequence of keys {0, 1, 2, .., i} + thrust::device_vector keys(num_keys); + thrust::sequence(keys.begin(), keys.end(), 0); + + // Inserts all keys into the hash set + set.insert(keys.begin(), keys.end()); + + // Storage for result + thrust::device_vector found(num_keys); + + // Check if all keys are contained in the set + set.contains(keys.begin(), keys.end(), found.begin()); + + // Verify that all keys have been found + bool const all_keys_found = thrust::all_of(found.begin(), found.end(), thrust::identity()); + + if (all_keys_found) { std::cout << "Success! 
Found all keys.\n"; } + + return 0; +} diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh new file mode 100644 index 000000000..479246fac --- /dev/null +++ b/include/cuco/aow_storage.cuh @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace cuco { +namespace experimental { + +/// Window type alias +template +using window = detail::window; + +/// forward declaration +template +class aow_storage_ref; + +/** + * @brief Array of Window open addressing storage class. 
+ * + * @tparam T Slot type + * @tparam WindowSize Number of slots in each window + * @tparam Extent Type of extent denoting number of windows + * @tparam Allocator Type of allocator used for device storage (de)allocation + */ +template , + typename Allocator = cuco::cuda_allocator>> +class aow_storage : public detail::aow_storage_base { + public: + using base_type = detail::aow_storage_base; ///< AoW base class type + + using base_type::window_size; ///< Number of elements processed per window + + using extent_type = typename base_type::extent_type; ///< Storage extent type + using size_type = typename base_type::size_type; ///< Storage size type + using value_type = typename base_type::value_type; ///< Slot type + using window_type = typename base_type::window_type; ///< Slot window type + + using base_type::capacity; + using base_type::num_windows; + + /// Type of the allocator to (de)allocate windows + using allocator_type = typename std::allocator_traits::rebind_alloc; + using window_deleter_type = + detail::custom_deleter; ///< Type of window deleter + using ref_type = aow_storage_ref; ///< Storage ref type + + /** + * @brief Constructor of AoW storage. + * + * @note The input `size` should be exclusively determined by the return value of + * `make_window_extent` since it depends on the requested low-bound value, the probing scheme, and + * the storage. + * + * @param size Number of windows to (de)allocate + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept; + + aow_storage(aow_storage&&) = default; ///< Move constructor + /** + * @brief Replaces the contents of the storage with another storage. 
+ * + * @return Reference of the current storage object + */ + aow_storage& operator=(aow_storage&&) = default; + ~aow_storage() = default; ///< Destructor + + aow_storage(aow_storage const&) = delete; + aow_storage& operator=(aow_storage const&) = delete; + + /** + * @brief Gets windows array. + * + * @return Pointer to the first window + */ + [[nodiscard]] constexpr window_type* data() const noexcept; + + /** + * @brief Gets the storage allocator. + * + * @return The storage allocator + */ + [[nodiscard]] constexpr allocator_type allocator() const noexcept; + + /** + * @brief Gets window storage reference. + * + * @return Reference of window storage + */ + [[nodiscard]] constexpr ref_type ref() const noexcept; + + /** + * @brief Initializes each slot in the AoW storage to contain `key`. + * + * @param key Key to which all keys in `slots` are initialized + * @param stream Stream used for executing the kernel + */ + void initialize(value_type key, cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Asynchronously initializes each slot in the AoW storage to contain `key`. + * + * @param key Key to which all keys in `slots` are initialized + * @param stream Stream used for executing the kernel + */ + void initialize_async(value_type key, cuda_stream_ref stream = {}) noexcept; + + private: + allocator_type allocator_; ///< Allocator used to (de)allocate windows + window_deleter_type window_deleter_; ///< Custom windows deleter + std::unique_ptr windows_; ///< Pointer to AoW storage +}; + +/** + * @brief Non-owning AoW storage reference type. 
+ * + * @tparam T Storage element type + * @tparam WindowSize Number of slots in each window + * @tparam Extent Type of extent denoting storage capacity + */ +template > +class aow_storage_ref : public detail::aow_storage_base { + public: + using base_type = detail::aow_storage_base; ///< AoW base class type + + using base_type::window_size; ///< Number of elements processed per window + + using extent_type = typename base_type::extent_type; ///< Storage extent type + using size_type = typename base_type::size_type; ///< Storage size type + using value_type = typename base_type::value_type; ///< Slot type + using window_type = typename base_type::window_type; ///< Slot window type + + using base_type::capacity; + using base_type::num_windows; + + /** + * @brief Constructor of AoS storage ref. + * + * @param size Number of windows + * @param windows Pointer to the windows array + */ + __host__ __device__ explicit constexpr aow_storage_ref(Extent size, + window_type* windows) noexcept; + + /** + * @brief Custom un-incrementable input iterator for the convenience of `find` operations. + * + * @note This iterator is for read only and NOT incrementable. + */ + struct iterator; + using const_iterator = iterator const; ///< Const forward iterator type + + /** + * @brief Returns an iterator to one past the last slot. + * + * This is provided for convenience for those familiar with checking + * an iterator returned from `find()` against the `end()` iterator. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __device__ constexpr iterator end() noexcept; + + /** + * @brief Returns a const_iterator to one past the last slot. + * + * This is provided for convenience for those familiar with checking + * an iterator returned from `find()` against the `end()` iterator. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __device__ constexpr const_iterator end() const noexcept; + + /** + * @brief Gets windows array. 
+ * + * @return Pointer to the first window + */ + [[nodiscard]] __device__ constexpr window_type* data() noexcept; + + /** + * @brief Gets windows array. + * + * @return Pointer to the first window + */ + [[nodiscard]] __device__ constexpr window_type* data() const noexcept; + + /** + * @brief Returns an array of slots (or a window) for a given index. + * + * @param index Index of the window + * @return An array of slots + */ + [[nodiscard]] __device__ constexpr window_type operator[](size_type index) const noexcept; + + private: + window_type* windows_; ///< Pointer to the windows array +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/cuda_stream_ref.hpp b/include/cuco/cuda_stream_ref.hpp new file mode 100644 index 000000000..bf0a5dea9 --- /dev/null +++ b/include/cuco/cuda_stream_ref.hpp @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { + +/** + * @brief Strongly-typed non-owning wrapper for CUDA streams with default constructor. + * + * This wrapper is simply a "view": it does not own the lifetime of the stream it wraps. 
+ */ +class cuda_stream_ref { + public: + constexpr cuda_stream_ref() = default; ///< Default constructor + constexpr cuda_stream_ref(cuda_stream_ref const&) = default; ///< Copy constructor + constexpr cuda_stream_ref(cuda_stream_ref&&) = default; ///< Move constructor + + /** + * @brief Copy-assignment operator. + * + * @return Copy of this stream reference. + */ + constexpr cuda_stream_ref& operator=(cuda_stream_ref const&) = default; + + /** + * @brief Move-assignment operator. + * + * @return New location of this stream reference. + */ + constexpr cuda_stream_ref& operator=(cuda_stream_ref&&) = default; ///< Move-assignment operator + + ~cuda_stream_ref() = default; + + constexpr cuda_stream_ref(int) = delete; //< Prevent cast from literal 0 + constexpr cuda_stream_ref(std::nullptr_t) = delete; //< Prevent cast from nullptr + + /** + * @brief Implicit conversion from `cudaStream_t`. + * + * @param stream The CUDA stream to reference. + */ + constexpr cuda_stream_ref(cudaStream_t stream) noexcept : stream_{stream} {} + + /** + * @brief Get the wrapped stream. + * + * @return The wrapped stream. + */ + [[nodiscard]] constexpr cudaStream_t value() const noexcept { return stream_; } + + /** + * @brief Implicit conversion to `cudaStream_t`. + * + * @return The underlying `cudaStream_t`. + */ + constexpr operator cudaStream_t() const noexcept { return value(); } + + /** + * @brief Return true if the wrapped stream is the CUDA per-thread default stream. + * + * @return True if the wrapped stream is the per-thread default stream; else false. + */ + [[nodiscard]] inline bool is_per_thread_default() const noexcept; + + /** + * @brief Return true if the wrapped stream is explicitly the CUDA legacy default stream. + * + * @return True if the wrapped stream is the default stream; else false. + */ + [[nodiscard]] inline bool is_default() const noexcept; + + /** + * @brief Synchronize the viewed CUDA stream. + * + * Calls `cudaStreamSynchronize()`. 
+ * + * @throw cuco::cuda_error if stream synchronization fails + */ + void synchronize() const; + + private: + cudaStream_t stream_{}; +}; + +/** + * @brief Static `cuda_stream_ref` of the default stream (stream 0), for convenience + */ +static constexpr cuda_stream_ref cuda_stream_default{}; + +/** + * @brief Static `cuda_stream_ref` of cudaStreamLegacy, for convenience + */ +static const cuda_stream_ref cuda_stream_legacy{cudaStreamLegacy}; + +/** + * @brief Static `cuda_stream_ref` of cudaStreamPerThread, for convenience + */ +static const cuda_stream_ref cuda_stream_per_thread{cudaStreamPerThread}; + +// /** +// * @brief Equality comparison operator for streams +// * +// * @param lhs The first stream view to compare +// * @param rhs The second stream view to compare +// * @return true if equal, false if unequal +// */ +// inline bool operator==(cuda_stream_ref lhs, cuda_stream_ref rhs) +// { +// return lhs.value() == rhs.value(); +// } + +// /** +// * @brief Inequality comparison operator for streams +// * +// * @param lhs The first stream view to compare +// * @param rhs The second stream view to compare +// * @return true if unequal, false if equal +// */ +// inline bool operator!=(cuda_stream_ref lhs, cuda_stream_ref rhs) { return not(lhs == rhs); } + +} // namespace experimental +} // namespace cuco + +#include \ No newline at end of file diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config index 40eb75aa2..07dec5e50 100644 --- a/include/cuco/detail/__config +++ b/include/cuco/detail/__config @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,23 @@ * limitations under the License. 
*/ - #pragma once +#pragma once - #include +#include + +#if !defined(__CUDACC_VER_MAJOR__) || !defined(__CUDACC_VER_MINOR__) +#error "NVCC version not found" +#elif __CUDACC_VER_MAJOR__ < 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ < 5) +#error "NVCC version 11.5 or later is required" +#endif + +#if !defined(__CUDACC_RELAXED_CONSTEXPR__) +#error "Support for relaxed constexpr is required" +#endif + +#if !defined(__CUDACC_EXTENDED_LAMBDA__) +#error "Support for extended device lambdas is required" +#endif // WAR for libcudacxx/296 #define CUCO_CUDA_MINIMUM_ARCH _NV_FIRST_ARG(__CUDA_ARCH_LIST__) @@ -25,10 +39,14 @@ #define CUCO_HAS_CUDA_BARRIER #endif -#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11100) +#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11010) #define CUCO_HAS_CG_MEMCPY_ASYNC #endif #if (CUCO_CUDA_MINIMUM_ARCH >= 700) #define CUCO_HAS_INDEPENDENT_THREADS #endif + +#if defined(__SIZEOF_INT128__) +#define CUCO_HAS_INT128 +#endif \ No newline at end of file diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh index 3038943a0..a8a5a69d1 100644 --- a/include/cuco/detail/bitwise_compare.cuh +++ b/include/cuco/detail/bitwise_compare.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,10 @@ #pragma once +#include + +#include + #include #include @@ -58,6 +62,16 @@ struct bitwise_compare_impl<8> { } }; +/** + * @brief Gives value to use as alignment for a type that is at least the + * size of type, or 16, whichever is smaller. 
+ */
+template 
+constexpr std::size_t alignment()
+{
+  return std::min(std::size_t{16}, cuda::std::bit_ceil(sizeof(T)));
+}
+
 /**
  * @brief Performs a bitwise equality comparison between the two specified objects
  *
@@ -73,8 +87,11 @@ __host__ __device__ constexpr bool bitwise_compare(T const& lhs, T const& rhs)
     cuco::is_bitwise_comparable_v,
     "Bitwise compared objects must have unique object representations or be explicitly declared as "
     "safe for bitwise comparison via specialization of cuco::is_bitwise_comparable_v.");
-  return detail::bitwise_compare_impl::compare(reinterpret_cast(&lhs),
-                                               reinterpret_cast(&rhs));
+
+  alignas(detail::alignment()) T lhs_copy{lhs};
+  alignas(detail::alignment()) T rhs_copy{rhs};
+  return detail::bitwise_compare_impl::compare(reinterpret_cast(&lhs_copy),
+                                               reinterpret_cast(&rhs_copy));
 }
 
 }  // namespace detail
diff --git a/include/cuco/detail/common_functors.cuh b/include/cuco/detail/common_functors.cuh
new file mode 100644
index 000000000..12fe14e0a
--- /dev/null
+++ b/include/cuco/detail/common_functors.cuh
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace cuco {
+namespace experimental {
+namespace detail {
+
+/**
+ * @brief Device functor returning the content of the slot indexed by `idx`.
+ * + * @tparam StorageRef Storage ref type + */ +template +struct get_slot { + StorageRef storage_; ///< Storage ref + + /** + * @brief Constructs `get_slot` functor with the given storage ref. + * + * @param s Input storage ref + */ + explicit constexpr get_slot(StorageRef s) noexcept : storage_{s} {} + + /** + * @brief Accesses the slot content with the given index. + * + * @param idx The slot index + * @return The slot content + */ + __device__ constexpr auto operator()(typename StorageRef::size_type idx) const noexcept + { + auto const window_idx = idx / StorageRef::window_size; + auto const intra_idx = idx % StorageRef::window_size; + return storage_[window_idx][intra_idx]; + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/common_kernels.cuh b/include/cuco/detail/common_kernels.cuh new file mode 100644 index 000000000..759041bad --- /dev/null +++ b/include/cuco/detail/common_kernels.cuh @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Inserts all elements in the range `[first, first + n)` and returns the number of + * successful insertions if `pred` of the corresponding stencil returns true. 
+ * + * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element + * is inserted. + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` + * and argument type is convertible from `std::iterator_traits::value_type` + * @tparam AtomicT Atomic counter type + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` + * @param num_successes Number of successful inserted elements + * @param ref Non-owning container device ref used to access the slot storage + */ +template +__global__ void insert_if_n(InputIterator first, + cuco::detail::index_type n, + StencilIt stencil, + Predicate pred, + AtomicT* num_successes, + Ref ref) +{ + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + typename Ref::size_type thread_num_successes = 0; + + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + while (idx < n) { + if (pred(*(stencil + idx))) { + typename Ref::value_type const insert_element{*(first + idx)}; + if constexpr (CGSize == 1) { + if (ref.insert(insert_element)) { thread_num_successes++; }; + } else { + auto const tile = + 
cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + if (ref.insert(tile, insert_element) && tile.thread_rank() == 0) { thread_num_successes++; } + } + } + idx += loop_stride; + } + + // compute number of successfully inserted elements for each block + // and atomically add to the grand total + auto const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } +} + +/** + * @brief Inserts all elements in the range `[first, first + n)` if `pred` of the corresponding + * stencil returns true. + * + * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element + * is inserted. + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` + * and argument type is convertible from `std::iterator_traits::value_type` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` + * @param ref Non-owning container device ref used to access the slot storage + */ +template +__global__ void insert_if_n( + InputIterator first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref) +{ + auto const loop_stride = 
cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + while (idx < n) { + if (pred(*(stencil + idx))) { + typename Ref::value_type const insert_element{*(first + idx)}; + if constexpr (CGSize == 1) { + ref.insert(insert_element); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + ref.insert(tile, insert_element); + } + } + idx += loop_stride; + } +} + +/** + * @brief Indicates whether the keys in the range `[first, first + n)` are contained in the data + * structure if `pred` of the corresponding stencil returns true. + * + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present in the container. If `pred( *(stencil + i) )` is + * false, stores false to `(output_begin + i)`. + * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize The size of the thread block + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` + * and argument type is convertible from `std::iterator_traits::value_type` + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param n Number of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + n)` + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param ref Non-owning container device ref used to access the slot storage + */ +template +__global__ void contains_if_n(InputIt first, + cuco::detail::index_type n, + StencilIt stencil, + 
Predicate pred, + OutputIt output_begin, + Ref ref) +{ + namespace cg = cooperative_groups; + + auto const block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + __shared__ bool output_buffer[BlockSize / CGSize]; + + while (idx - thread_idx < n) { // the whole thread block falls into the same iteration + if constexpr (CGSize == 1) { + if (idx < n) { + auto const key = *(first + idx); + /* + * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased + * sector stores from L2 to global memory. By writing results to shared memory and then + * synchronizing before writing back to global, we no longer rely on L1, preventing the + * increase in sector stores from L2 to global and improving performance. + */ + output_buffer[thread_idx] = pred(*(stencil + idx)) ? ref.contains(key) : false; + } + block.sync(); + if (idx < n) { *(output_begin + idx) = output_buffer[thread_idx]; } + } else { + auto const tile = cg::tiled_partition(cg::this_thread_block()); + if (idx < n) { + auto const key = *(first + idx); + auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false; + if (tile.thread_rank() == 0) { *(output_begin + idx) = found; } + } + } + idx += loop_stride; + } +} + +/** + * @brief Calculates the number of filled slots for the given window storage. 
+ * + * @tparam BlockSize Number of threads in each block + * @tparam StorageRef Type of non-owning ref allowing access to storage + * @tparam Predicate Type of predicate indicating if the given slot is filled + * @tparam AtomicT Atomic counter type + * + * @param storage Non-owning device ref used to access the slot storage + * @param is_filled Predicate indicating if the given slot is filled + * @param count Number of filled slots + */ +template +__global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count) +{ + using size_type = typename StorageRef::size_type; + + auto const loop_stride = cuco::detail::grid_stride(); + auto idx = cuco::detail::global_thread_id(); + + size_type thread_count = 0; + auto const n = storage.num_windows(); + + while (idx < n) { + auto const window = storage[idx]; +#pragma unroll + for (auto const& it : window) { + thread_count += static_cast(is_filled(it)); + } + idx += loop_stride; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + auto const block_count = BlockReduce(temp_storage).Sum(thread_count); + if (threadIdx.x == 0) { count->fetch_add(block_count, cuda::std::memory_order_relaxed); } +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/cuda_stream_ref.inl b/include/cuco/detail/cuda_stream_ref.inl new file mode 100644 index 000000000..64aa078aa --- /dev/null +++ b/include/cuco/detail/cuda_stream_ref.inl @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace cuco { +namespace experimental { + +[[nodiscard]] inline bool cuda_stream_ref::is_per_thread_default() const noexcept +{ +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + return value() == cuda_stream_per_thread || value() == nullptr; +#else + return value() == cuda_stream_per_thread; +#endif +} + +[[nodiscard]] inline bool cuda_stream_ref::is_default() const noexcept +{ +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + return value() == cuda_stream_legacy; +#else + return value() == cuda_stream_legacy || value() == nullptr; +#endif +} + +inline void cuda_stream_ref::synchronize() const +{ + CUCO_CUDA_TRY(cudaStreamSynchronize(this->stream_)); +} + +} // namespace experimental +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/dynamic_map.inl b/include/cuco/detail/dynamic_map.inl index 0c1d2e377..7b5145190 100644 --- a/include/cuco/detail/dynamic_map.inl +++ b/include/cuco/detail/dynamic_map.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,13 +17,14 @@ namespace cuco { template -dynamic_map::dynamic_map( - std::size_t initial_capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - Allocator const& alloc) +dynamic_map::dynamic_map(std::size_t initial_capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + Allocator const& alloc, + cudaStream_t stream) : empty_key_sentinel_(empty_key_sentinel.value), empty_value_sentinel_(empty_value_sentinel.value), + erased_key_sentinel_(empty_key_sentinel.value), size_(0), capacity_(initial_capacity), min_insert_size_(1E4), @@ -32,23 +33,49 @@ dynamic_map::dynamic_map( { submaps_.push_back(std::make_unique>( initial_capacity, - sentinel::empty_key{empty_key_sentinel}, - sentinel::empty_value{empty_value_sentinel}, - alloc)); + empty_key{empty_key_sentinel}, + empty_value{empty_value_sentinel}, + alloc, + stream)); submap_views_.push_back(submaps_[0]->get_device_view()); submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); - - CUCO_CUDA_TRY(cudaMallocManaged(&num_successes_, sizeof(atomic_ctr_type))); -} // namespace cuco + submap_num_successes_.push_back(submaps_[0]->num_successes_); +} template -dynamic_map::~dynamic_map() +dynamic_map::dynamic_map(std::size_t initial_capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel, + Allocator const& alloc, + cudaStream_t stream) + : empty_key_sentinel_(empty_key_sentinel.value), + empty_value_sentinel_(empty_value_sentinel.value), + erased_key_sentinel_(erased_key_sentinel.value), + size_(0), + capacity_(initial_capacity), + min_insert_size_(1E4), + max_load_factor_(0.60), + alloc_{alloc} { - CUCO_ASSERT_CUDA_SUCCESS(cudaFree(num_successes_)); + CUCO_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_, + "The empty key sentinel and erased key sentinel cannot be the same value.", + std::runtime_error); + + submaps_.push_back(std::make_unique>( + initial_capacity, + 
empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}, + erased_key{erased_key_sentinel_}, + alloc, + stream)); + submap_views_.push_back(submaps_[0]->get_device_view()); + submap_mutable_views_.push_back(submaps_[0]->get_device_mutable_view()); + submap_num_successes_.push_back(submaps_[0]->num_successes_); } template -void dynamic_map::reserve(std::size_t n) +void dynamic_map::reserve(std::size_t n, cudaStream_t stream) { int64_t num_elements_remaining = n; uint32_t submap_idx = 0; @@ -62,14 +89,25 @@ void dynamic_map::reserve(std::size_t n) // if the submap does not exist yet, create it else { submap_capacity = capacity_; - submaps_.push_back(std::make_unique>( - submap_capacity, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}, - alloc_)); + if (erased_key_sentinel_ != empty_key_sentinel_) { + submaps_.push_back(std::make_unique>( + submap_capacity, + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}, + erased_key{erased_key_sentinel_}, + alloc_, + stream)); + } else { + submaps_.push_back(std::make_unique>( + submap_capacity, + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}, + alloc_, + stream)); + } + submap_num_successes_.push_back(submaps_[submap_idx]->num_successes_); submap_views_.push_back(submaps_[submap_idx]->get_device_view()); submap_mutable_views_.push_back(submaps_[submap_idx]->get_device_mutable_view()); - capacity_ *= 2; } @@ -80,13 +118,20 @@ void dynamic_map::reserve(std::size_t n) template template -void dynamic_map::insert(InputIt first, - InputIt last, - Hash hash, - KeyEqual key_equal) +void dynamic_map::insert( + InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { + // TODO: memset an atomic variable is unsafe + static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type), + "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t)."); + + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto 
 constexpr tile_size = 4;
+
   std::size_t num_to_insert = std::distance(first, last);
-  reserve(size_ + num_to_insert);
+
+  reserve(size_ + num_to_insert, stream);
 
   uint32_t submap_idx = 0;
   while (num_to_insert > 0) {
@@ -95,30 +140,30 @@
     // If we are tying to insert some of the remaining keys into this submap, we can insert
     // only if we meet the minimum insert size.
     if (capacity_remaining >= min_insert_size_) {
-      *num_successes_ = 0;
-      int device_id;
-      CUCO_CUDA_TRY(cudaGetDevice(&device_id));
-      CUCO_CUDA_TRY(cudaMemPrefetchAsync(num_successes_, sizeof(atomic_ctr_type), device_id));
-
-      auto n = std::min(capacity_remaining, num_to_insert);
-      auto const block_size = 128;
-      auto const stride = 1;
-      auto const tile_size = 4;
-      auto const grid_size = (tile_size * n + stride * block_size - 1) / (stride * block_size);
-
-      detail::insert>
-        <<>>(first,
-                                                        first + n,
-                                                        submap_views_.data().get(),
-                                                        submap_mutable_views_.data().get(),
-                                                        num_successes_,
-                                                        submap_idx,
-                                                        submaps_.size(),
-                                                        hash,
-                                                        key_equal);
-      CUCO_CUDA_TRY(cudaDeviceSynchronize());
-
-      std::size_t h_num_successes = num_successes_->load(cuda::std::memory_order_relaxed);
+      CUCO_CUDA_TRY(
+        cudaMemsetAsync(submap_num_successes_[submap_idx], 0, sizeof(atomic_ctr_type), stream));
+
+      auto const n = std::min(capacity_remaining, num_to_insert);
+      auto const grid_size = (tile_size * n + stride * block_size - 1) / (stride * block_size);
+
+      detail::insert>
+        <<>>(first,
+                                                first + n,
+                                                submap_views_.data().get(),
+                                                submap_mutable_views_.data().get(),
+                                                submap_num_successes_.data().get(),
+                                                submap_idx,
+                                                submaps_.size(),
+                                                hash,
+                                                key_equal);
+
+      std::size_t h_num_successes;
+      CUCO_CUDA_TRY(cudaMemcpyAsync(&h_num_successes,
+                                    submap_num_successes_[submap_idx],
+                                    sizeof(atomic_ctr_type),
+                                    cudaMemcpyDeviceToHost,
+                                    stream));
+      CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
       submaps_[submap_idx]->size_ += h_num_successes;
       size_ += h_num_successes;
       first += n;
@@ -128,34 +173,88 @@
   }
 }
 
+template 
+template +void dynamic_map::erase( + InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) +{ + // TODO: memset an atomic variable is unsafe + static_assert(sizeof(std::size_t) == sizeof(atomic_ctr_type), + "sizeof(atomic_ctr_type) must be equal to sizeof(std:size_t)."); + + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; + + auto const num_keys = std::distance(first, last); + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + + // zero out submap success counters + for (uint32_t i = 0; i < submaps_.size(); ++i) { + CUCO_CUDA_TRY(cudaMemsetAsync(submap_num_successes_[i], 0, sizeof(atomic_ctr_type), stream)); + } + + auto const temp_storage_size = submaps_.size() * sizeof(unsigned long long); + + detail::erase + <<>>(first, + first + num_keys, + submap_mutable_views_.data().get(), + submap_num_successes_.data().get(), + submaps_.size(), + hash, + key_equal); + + for (uint32_t i = 0; i < submaps_.size(); ++i) { + std::size_t h_submap_num_successes; + CUCO_CUDA_TRY(cudaMemcpyAsync(&h_submap_num_successes, + submap_num_successes_[i], + sizeof(atomic_ctr_type), + cudaMemcpyDeviceToHost, + stream)); + submaps_[i]->size_ -= h_submap_num_successes; + size_ -= h_submap_num_successes; + } +} + template template -void dynamic_map::find( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) +void dynamic_map::find(InputIt first, + InputIt last, + OutputIt output_begin, + Hash hash, + KeyEqual key_equal, + cudaStream_t stream) { - auto num_keys = std::distance(first, last); - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; - detail::find<<>>( + auto const num_keys = std::distance(first, last); + auto 
const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + + detail::find<<>>( first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal); CUCO_CUDA_TRY(cudaDeviceSynchronize()); } template template -void dynamic_map::contains( - InputIt first, InputIt last, OutputIt output_begin, Hash hash, KeyEqual key_equal) +void dynamic_map::contains(InputIt first, + InputIt last, + OutputIt output_begin, + Hash hash, + KeyEqual key_equal, + cudaStream_t stream) { - auto num_keys = std::distance(first, last); - auto const block_size = 128; - auto const stride = 1; - auto const tile_size = 4; - auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); + auto constexpr block_size = 128; + auto constexpr stride = 1; + auto constexpr tile_size = 4; + + auto const num_keys = std::distance(first, last); + auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size); - detail::contains<<>>( + detail::contains<<>>( first, last, output_begin, submap_views_.data().get(), submaps_.size(), hash, key_equal); CUCO_CUDA_TRY(cudaDeviceSynchronize()); } diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index f261b49aa..566576e1e 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -41,6 +41,7 @@ namespace cg = cooperative_groups; * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs * @param last End of the sequence of key/value pairs * @param submap_views Array of `static_map::device_view` objects used to @@ -71,7 +72,7 @@ __global__ void insert(InputIt first, Hash hash, KeyEqual key_equal) { - typedef cub::BlockReduce BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename 
BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; @@ -97,8 +98,10 @@ __global__ void insert(InputIt first, tid += gridDim.x * blockDim.x; } - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { *num_successes += block_num_successes; } + std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + num_successes->fetch_add(block_num_successes, cuda::std::memory_order_relaxed); + } } /** @@ -122,13 +125,14 @@ __global__ void insert(InputIt first, * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs * @param last End of the sequence of key/value pairs * @param submap_views Array of `static_map::device_view` objects used to * perform `contains` operations on each underlying `static_map` * @param submap_mutable_views Array of `static_map::device_mutable_view` objects * used to perform an `insert` into the target `static_map` submap - * @param num_successes The number of successfully inserted key/value pairs + * @param submap_num_successes The number of successfully inserted key/value pairs for each submap * @param insert_idx The index of the submap we are inserting into * @param num_submaps The total number of submaps in the map * @param hash The unary function to apply to hash each key @@ -147,13 +151,13 @@ __global__ void insert(InputIt first, InputIt last, viewT* submap_views, mutableViewT* submap_mutable_views, - atomicT* num_successes, + atomicT** submap_num_successes, uint32_t insert_idx, uint32_t num_submaps, Hash hash, KeyEqual key_equal) { - typedef cub::BlockReduce BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; @@ -182,8 +186,154 @@ __global__ void 
insert(InputIt first, it += (gridDim.x * blockDim.x) / tile_size; } - std::size_t block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); - if (threadIdx.x == 0) { *num_successes += block_num_successes; } + std::size_t const block_num_successes = BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + submap_num_successes[insert_idx]->fetch_add(block_num_successes, + cuda::std::memory_order_relaxed); + } +} + +/** + * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. + * + * If the key `*(first + i)` exists in the map, its slot is erased and made available for future + insertions. + * Else, no effect. + * + * @tparam block_size The size of the thread block + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam mutableViewT Type of device view allowing modification of hash map storage + * @tparam atomicT Type of atomic storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to + * perform `erase` operations on each underlying `static_map` + * @param num_successes The number of successfully erased key/value pairs + * @param submap_num_successes The number of successfully erased key/value pairs + * in each submap + * @param num_submaps The number of submaps in the map + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void erase(InputIt first, + InputIt last, + mutableViewT* submap_mutable_views, + atomicT** submap_num_successes, + uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) +{ + extern __shared__ unsigned long long submap_block_num_successes[]; + + auto tid = block_size * 
blockIdx.x + threadIdx.x; + auto it = first + tid; + + for (auto i = threadIdx.x; i < num_submaps; i += block_size) { + submap_block_num_successes[i] = 0; + } + __syncthreads(); + + while (it < last) { + for (auto i = 0; i < num_submaps; ++i) { + if (submap_mutable_views[i].erase(*it, hash, key_equal)) { + atomicAdd(&submap_block_num_successes[i], 1); + break; + } + } + it += gridDim.x * blockDim.x; + } + __syncthreads(); + + for (auto i = 0; i < num_submaps; ++i) { + if (threadIdx.x == 0) { + submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), + cuda::std::memory_order_relaxed); + } + } +} + +/** + * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. + * + * If the key `*(first + i)` exists in the map, its slot is erased and made available for future + * insertions. + * Else, no effect. + * + * @tparam block_size The size of the thread block + * @tparam tile_size The number of threads in the Cooperative Groups used to perform erase + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam mutableViewT Type of device view allowing modification of hash map storage + * @tparam atomicT Type of atomic storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param submap_mutable_views Array of `static_map::mutable_device_view` objects used to + * perform `erase` operations on each underlying `static_map` + * @param num_successes The number of successfully erased key/value pairs + * @param submap_num_successes The number of successfully erased key/value pairs + * in each submap + * @param num_submaps The number of submaps in the map + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ +template +__global__ void 
erase(InputIt first, + InputIt last, + mutableViewT* submap_mutable_views, + atomicT** submap_num_successes, + uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) +{ + extern __shared__ unsigned long long submap_block_num_successes[]; + + auto block = cg::this_thread_block(); + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = block_size * block.group_index().x + block.thread_rank(); + auto it = first + tid / tile_size; + + for (auto i = threadIdx.x; i < num_submaps; i += block_size) { + submap_block_num_successes[i] = 0; + } + block.sync(); + + while (it < last) { + auto erased = false; + int i = 0; + for (i = 0; i < num_submaps; ++i) { + erased = submap_mutable_views[i].erase(tile, *it, hash, key_equal); + if (erased) { break; } + } + if (erased && tile.thread_rank() == 0) { atomicAdd(&submap_block_num_successes[i], 1); } + it += (gridDim.x * blockDim.x) / tile_size; + } + block.sync(); + + for (auto i = 0; i < num_submaps; ++i) { + if (threadIdx.x == 0) { + submap_num_successes[i]->fetch_add(static_cast(submap_block_num_successes[i]), + cuda::std::memory_order_relaxed); + } + } } /** @@ -191,6 +341,7 @@ __global__ void insert(InputIt first, * * If the key `*(first + i)` exists in the map, copies its associated value to `(output_begin + i)`. * Else, copies the empty value sentinel. 
+ * * @tparam block_size The number of threads in the thread block * @tparam Value The mapped value type for the map * @tparam InputIt Device accessible input iterator whose `value_type` is @@ -200,6 +351,7 @@ __global__ void insert(InputIt first, * @tparam viewT Type of `static_map` device view * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of values retrieved for each key @@ -273,6 +425,7 @@ __global__ void find(InputIt first, * @tparam viewT Type of `static_map` device view * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of values retrieved for each key @@ -345,6 +498,7 @@ __global__ void find(InputIt first, * @tparam viewT Type of `static_map` device view * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of booleans for the presence of each key @@ -411,6 +565,7 @@ __global__ void contains(InputIt first, * @tparam viewT Type of `static_map` device view * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of booleans for the presence of each key diff --git a/include/cuco/detail/equal_wrapper.cuh b/include/cuco/detail/equal_wrapper.cuh new file mode 100644 index 000000000..d2ded4a33 --- /dev/null +++ b/include/cuco/detail/equal_wrapper.cuh @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Enum of equality comparison results. + */ +enum class equal_result : int32_t { UNEQUAL = 0, EMPTY = 1, EQUAL = 2 }; + +/** + * @brief Key equality wrapper. + * + * User-provided equality binary callable cannot be used to compare against sentinel value. + * + * @tparam T Right-hand side Element type + * @tparam Equal Type of user-provided equality binary callable + */ +template +struct equal_wrapper { + T empty_sentinel_; ///< Sentinel value + Equal equal_; ///< Custom equality callable + + /** + * @brief Equality wrapper ctor. + * + * @param sentinel Sentinel value + * @param equal Equality binary callable + */ + __host__ __device__ constexpr equal_wrapper(T sentinel, Equal const& equal) noexcept + : empty_sentinel_{sentinel}, equal_{equal} + { + } + + /** + * @brief Equality check with the given equality callable. + * + * @tparam U Right-hand side Element type + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * + * @return `EQUAL` if `lhs` and `rhs` are equivalent. `UNEQUAL` otherwise. + */ + template + __device__ constexpr equal_result equal_to(T const& lhs, U const& rhs) const noexcept + { + return equal_(lhs, rhs) ? 
equal_result::EQUAL : equal_result::UNEQUAL; + } + + /** + * @brief Order-sensitive equality operator. + * + * @note This function always compares the left-hand side element against `empty_sentinel_` value + * first then perform a equality check with the given `equal_` callable, i.e., `equal_(lhs, rhs)`. + * @note Container (like set or map) keys MUST be always on the left-hand side. + * + * @tparam U Right-hand side Element type + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * + * @return Three way equality comparison result + */ + template + __device__ constexpr equal_result operator()(T const& lhs, U const& rhs) const noexcept + { + return cuco::detail::bitwise_compare(lhs, empty_sentinel_) ? equal_result::EMPTY + : this->equal_to(lhs, rhs); + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/error.hpp b/include/cuco/detail/error.hpp index 45f78a2e0..1d1ff6135 100644 --- a/include/cuco/detail/error.hpp +++ b/include/cuco/detail/error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,31 +16,9 @@ #pragma once -#include - -#include -#include +#include -namespace cuco { -/** - * @brief Exception thrown when a CUDA error is encountered. - * - */ -struct cuda_error : public std::runtime_error { - /** - * @brief Constructs a `cuda_error` object with the given `message`. - * - * @param message The error char array used to construct `cuda_error` - */ - cuda_error(const char* message) : std::runtime_error(message) {} - /** - * @brief Constructs a `cuda_error` object with the given `message` string. 
- * - * @param message The `std::string` used to construct `cuda_error` - */ - cuda_error(std::string const& message) : cuda_error{message.c_str()} {} -}; -} // namespace cuco +#include #define STRINGIFY_DETAIL(x) #x #define CUCO_STRINGIFY(x) STRINGIFY_DETAIL(x) @@ -58,7 +36,7 @@ struct cuda_error : public std::runtime_error { * Example: * ```c++ * - * // Throws `rmm::cuda_error` if `cudaMalloc` fails + * // Throws `cuco::cuda_error` if `cudaMalloc` fails * CUCO_CUDA_TRY(cudaMalloc(&p, 100)); * * // Throws `std::runtime_error` if `cudaMalloc` fails @@ -93,21 +71,72 @@ struct cuda_error : public std::runtime_error { } while (0) /** - * @brief Macro for checking runtime conditions that throws an exception when + * @brief Macro for checking (pre-)conditions that throws an exception when * a condition is violated. * + * Defaults to throwing `cuco::logic_error`, but a custom exception may also be + * specified. + * * Example usage: + * ``` + * // throws cuco::logic_error + * CUCO_EXPECTS(p != nullptr, "Unexpected null pointer"); * - * @code - * CUCO_RUNTIME_EXPECTS(key == value, "Key value mismatch"); - * @endcode + * // throws std::runtime_error + * CUCO_EXPECTS(p != nullptr, "Unexpected nullptr", std::runtime_error); + * ``` + * @param ... This macro accepts either two or three arguments: + * - The first argument must be an expression that evaluates to true or + * false, and is the condition being checked. + * - The second argument is a string literal used to construct the `what` of + * the exception. + * - When given, the third argument is the exception to be thrown. When not + * specified, defaults to `cuco::logic_error`. + * @throw `_exception_type` if the condition evaluates to 0 (false). + */ +#define CUCO_EXPECTS(...) \ + GET_CUCO_EXPECTS_MACRO(__VA_ARGS__, CUCO_EXPECTS_3, CUCO_EXPECTS_2) \ + (__VA_ARGS__) + +#define GET_CUCO_EXPECTS_MACRO(_1, _2, _3, NAME, ...) 
NAME + +#define CUCO_EXPECTS_3(_condition, _reason, _exception_type) \ + do { \ + static_assert(std::is_base_of_v); \ + (_condition) ? static_cast(0) \ + : throw _exception_type /*NOLINT(bugprone-macro-parentheses)*/ \ + {"CUCO failure at: " __FILE__ ":" CUCO_STRINGIFY(__LINE__) ": " _reason}; \ + } while (0) + +#define CUCO_EXPECTS_2(_condition, _reason) CUCO_EXPECTS_3(_condition, _reason, cuco::logic_error) + +/** + * @brief Indicates that an erroneous code path has been taken. + * + * Example usage: + * ```c++ + * // Throws `cuco::logic_error` + * CUCO_FAIL("Unsupported code path"); * - * @param[in] cond Expression that evaluates to true or false - * @param[in] reason String literal description of the reason that cond is - * expected to be true - * @throw std::runtime_error if the condition evaluates to false. + * // Throws `std::runtime_error` + * CUCO_FAIL("Unsupported code path", std::runtime_error); + * ``` + * + * @param ... This macro accepts either one or two arguments: + * - The first argument is a string literal used to construct the `what` of + * the exception. + * - When given, the second argument is the exception to be thrown. When not + * specified, defaults to `cuco::logic_error`. + * @throw `_exception_type` if the condition evaluates to 0 (false). */ -#define CUCO_RUNTIME_EXPECTS(cond, reason) \ - (!!(cond)) ? static_cast(0) \ - : throw std::runtime_error("cuco failure at: " __FILE__ \ - ":" CUCO_STRINGIFY(__LINE__) ": " reason) +#define CUCO_FAIL(...) \ + GET_CUCO_FAIL_MACRO(__VA_ARGS__, CUCO_FAIL_2, CUCO_FAIL_1) \ + (__VA_ARGS__) + +#define GET_CUCO_FAIL_MACRO(_1, _2, NAME, ...) 
NAME + +#define CUCO_FAIL_2(_what, _exception_type) \ + /*NOLINTNEXTLINE(bugprone-macro-parentheses)*/ \ + throw _exception_type { "CUCO failure at: " __FILE__ ":" CUCO_STRINGIFY(__LINE__) ": " _what } + +#define CUCO_FAIL_1(_what) CUCO_FAIL_2(_what, cuco::logic_error) diff --git a/include/cuco/detail/extent/extent.inl b/include/cuco/detail/extent/extent.inl new file mode 100644 index 000000000..a7cd83dcd --- /dev/null +++ b/include/cuco/detail/extent/extent.inl @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include // TODO move to detail/extent/ +#include +#include +#include + +#include + +namespace cuco { +namespace experimental { + +template +struct window_extent { + using value_type = SizeType; ///< Extent value type + + __host__ __device__ constexpr value_type value() const noexcept { return N; } + __host__ __device__ explicit constexpr operator value_type() const noexcept { return value(); } + + private: + __host__ __device__ explicit constexpr window_extent() noexcept {} + __host__ __device__ explicit constexpr window_extent(SizeType) noexcept {} + + template + friend auto constexpr make_window_extent(extent ext); +}; + +template +struct window_extent : cuco::utility::fast_int { + using value_type = + typename cuco::utility::fast_int::fast_int::value_type; ///< Extent value type + + private: + using cuco::utility::fast_int::fast_int; + + template + friend auto constexpr make_window_extent(extent ext); +}; + +template +[[nodiscard]] auto constexpr make_window_extent(extent ext) +{ + return make_window_extent(ext); +} + +template +[[nodiscard]] auto constexpr make_window_extent(SizeType size) +{ + return make_window_extent(extent{size}); +} + +template +[[nodiscard]] auto constexpr make_window_extent(extent ext) +{ + auto constexpr max_prime = cuco::detail::primes.back(); + auto constexpr max_value = + (static_cast(std::numeric_limits::max()) < max_prime) + ? 
std::numeric_limits::max() + : static_cast(max_prime); + auto const size = cuco::detail::int_div_ceil( + std::max(static_cast(ext), static_cast(1)), CGSize * WindowSize); + if (size > max_value) { CUCO_FAIL("Invalid input extent"); } + + if constexpr (N == dynamic_extent) { + return window_extent{static_cast( + *cuco::detail::lower_bound( + cuco::detail::primes.begin(), cuco::detail::primes.end(), static_cast(size)) * + CGSize)}; + } + if constexpr (N != dynamic_extent) { + return window_extent( + *cuco::detail::lower_bound(cuco::detail::primes.begin(), + cuco::detail::primes.end(), + static_cast(size)) * + CGSize)>{}; + } +} + +template +[[nodiscard]] auto constexpr make_window_extent(SizeType size) +{ + return make_window_extent(extent{size}); +} + +namespace detail { + +template +struct is_window_extent : std::false_type { +}; + +template +struct is_window_extent> : std::true_type { +}; + +template +inline constexpr bool is_window_extent_v = is_window_extent::value; + +} // namespace detail + +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/hash_functions.cuh b/include/cuco/detail/hash_functions.cuh deleted file mode 100644 index 7be6cab20..000000000 --- a/include/cuco/detail/hash_functions.cuh +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -namespace cuco { - -using hash_value_type = uint32_t; - -namespace detail { - -/** - * @brief A `MurmurHash3_32` hash function to hash the given argument on host and device. - * - * MurmurHash3_32 implementation from - * https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp - * ----------------------------------------------------------------------------- - * MurmurHash3 was written by Austin Appleby, and is placed in the public domain. The author - * hereby disclaims copyright to this source code. - * - * Note - The x86 and x64 versions do _not_ produce the same results, as the algorithms are - * optimized for their respective platforms. You can still compile and run any of them on any - * platform, but your performance with the non-native version will be less than optimal. - * - * @tparam Key The type of the values to hash - */ -template -struct MurmurHash3_32 { - using argument_type = Key; ///< The type of the values taken as argument - using result_type = uint32_t; ///< The type of the hash values produced - - /// Default constructor - __host__ __device__ constexpr MurmurHash3_32() : MurmurHash3_32{0} {} - - /** - * @brief Constructs a MurmurHash3_32 hash function with the given `seed`. - * - * @param seed A custom number to randomize the resulting hash value - */ - __host__ __device__ constexpr MurmurHash3_32(uint32_t seed) : m_seed(seed) {} - - /** - * @brief Returns a hash value for its argument, as a value of type `result_type`. 
- * - * @param key The input argument to hash - * @return A resulting hash value for `key` - */ - constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept - { - constexpr int len = sizeof(argument_type); - const uint8_t* const data = (const uint8_t*)&key; - constexpr int nblocks = len / 4; - - uint32_t h1 = m_seed; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; - //---------- - // body - const uint32_t* const blocks = (const uint32_t*)(data + nblocks * 4); - for (int i = -nblocks; i; i++) { - uint32_t k1 = blocks[i]; // getblock32(blocks,i); - k1 *= c1; - k1 = rotl32(k1, 15); - k1 *= c2; - h1 ^= k1; - h1 = rotl32(h1, 13); - h1 = h1 * 5 + 0xe6546b64; - } - //---------- - // tail - const uint8_t* tail = (const uint8_t*)(data + nblocks * 4); - uint32_t k1 = 0; - switch (len & 3) { - case 3: k1 ^= tail[2] << 16; - case 2: k1 ^= tail[1] << 8; - case 1: - k1 ^= tail[0]; - k1 *= c1; - k1 = rotl32(k1, 15); - k1 *= c2; - h1 ^= k1; - }; - //---------- - // finalization - h1 ^= len; - h1 = fmix32(h1); - return h1; - } - - private: - constexpr __host__ __device__ uint32_t rotl32(uint32_t x, int8_t r) const noexcept - { - return (x << r) | (x >> (32 - r)); - } - - constexpr __host__ __device__ uint32_t fmix32(uint32_t h) const noexcept - { - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - return h; - } - uint32_t m_seed; -}; - -} // namespace detail -} // namespace cuco diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh new file mode 100644 index 000000000..a12143523 --- /dev/null +++ b/include/cuco/detail/hash_functions/murmurhash3.cuh @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace cuco::detail { + +/** + * @brief The 32bit integer finalizer hash function of `MurmurHash3`. + * + * @throw Key type must be 4 bytes in size + * + * @tparam Key The type of the values to hash + */ +template +struct MurmurHash3_fmix32 { + static_assert(sizeof(Key) == 4, "Key type must be 4 bytes in size."); + + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint32_t; ///< The type of the hash values produced + + /** + * @brief Constructs a MurmurHash3_fmix32 hash function with the given `seed`. + * + * @param seed A custom number to randomize the resulting hash value + */ + __host__ __device__ constexpr MurmurHash3_fmix32(std::uint32_t seed = 0) : seed_{seed} {} + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @param key The input argument to hash + * @return A resulting hash value for `key` + */ + constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept + { + std::uint32_t h = static_cast(key) ^ seed_; + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + return h; + } + + private: + std::uint32_t seed_; +}; + +/** + * @brief The 64bit integer finalizer hash function of `MurmurHash3`. 
+ * + * @throw Key type must be 8 bytes in size + * + * @tparam Key The type of the values to hash + */ +template +struct MurmurHash3_fmix64 { + static_assert(sizeof(Key) == 8, "Key type must be 8 bytes in size."); + + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint64_t; ///< The type of the hash values produced + + /** + * @brief Constructs a MurmurHash3_fmix64 hash function with the given `seed`. + * + * @param seed A custom number to randomize the resulting hash value + */ + __host__ __device__ constexpr MurmurHash3_fmix64(std::uint64_t seed = 0) : seed_{seed} {} + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @param key The input argument to hash + * @return A resulting hash value for `key` + */ + constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept + { + std::uint64_t h = static_cast(key) ^ seed_; + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + h ^= h >> 33; + return h; + } + + private: + std::uint64_t seed_; +}; + +/** + * @brief A `MurmurHash3_32` hash function to hash the given argument on host and device. + * + * MurmurHash3_32 implementation from + * https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp + * ----------------------------------------------------------------------------- + * MurmurHash3 was written by Austin Appleby, and is placed in the public domain. The author + * hereby disclaims copyright to this source code. + * + * Note - The x86 and x64 versions do _not_ produce the same results, as the algorithms are + * optimized for their respective platforms. You can still compile and run any of them on any + * platform, but your performance with the non-native version will be less than optimal. 
+ * + * @tparam Key The type of the values to hash + */ +template +struct MurmurHash3_32 { + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint32_t; ///< The type of the hash values produced + + /** + * @brief Constructs a MurmurHash3_32 hash function with the given `seed`. + * + * @param seed A custom number to randomize the resulting hash value + */ + __host__ __device__ constexpr MurmurHash3_32(std::uint32_t seed = 0) : fmix32_{0}, seed_{seed} {} + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @param key The input argument to hash + * @return The resulting hash value for `key` + */ + constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept + { + return compute_hash(reinterpret_cast(&key), + cuco::experimental::extent{}); + } + + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @tparam Extent The extent type + * + * @param bytes The input argument to hash + * @param size The extent of the data in bytes + * @return The resulting hash value + */ + template + constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + Extent size) const noexcept + { + auto const nblocks = size / 4; + + std::uint32_t h1 = seed_; + constexpr std::uint32_t c1 = 0xcc9e2d51; + constexpr std::uint32_t c2 = 0x1b873593; + //---------- + // body + for (std::remove_const_t i = 0; size >= 4 && i < nblocks; i++) { + std::uint32_t k1 = load_chunk(bytes, i); + k1 *= c1; + k1 = rotl32(k1, 15); + k1 *= c2; + h1 ^= k1; + h1 = rotl32(h1, 13); + h1 = h1 * 5 + 0xe6546b64; + } + //---------- + // tail + std::uint32_t k1 = 0; + switch (size & 3) { + case 3: k1 ^= std::to_integer(bytes[nblocks * 4 + 2]) << 16; [[fallthrough]]; + case 2: k1 ^= std::to_integer(bytes[nblocks * 4 + 1]) << 8; [[fallthrough]]; + case 1: + k1 ^= std::to_integer(bytes[nblocks * 4 + 0]); + k1 *= c1; + k1 = rotl32(k1, 15); + k1 *= c2; 
+ h1 ^= k1; + }; + //---------- + // finalization + h1 ^= size; + h1 = fmix32_(h1); + return h1; + } + + private: + constexpr __host__ __device__ std::uint32_t rotl32(std::uint32_t x, std::int8_t r) const noexcept + { + return (x << r) | (x >> (32 - r)); + } + + MurmurHash3_fmix32 fmix32_; + std::uint32_t seed_; +}; +} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/hash_functions/utils.cuh b/include/cuco/detail/hash_functions/utils.cuh new file mode 100644 index 000000000..37e279ba7 --- /dev/null +++ b/include/cuco/detail/hash_functions/utils.cuh @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace cuco::detail { + +template +constexpr __host__ __device__ T load_chunk(U const* const data, Extent index) noexcept +{ + auto const bytes = reinterpret_cast(data); + T chunk; + memcpy(&chunk, bytes + index * sizeof(T), sizeof(T)); + return chunk; +} + +}; // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh new file mode 100644 index 000000000..a36f74bca --- /dev/null +++ b/include/cuco/detail/hash_functions/xxhash.cuh @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace cuco::detail { + +/** + * @brief A `XXHash_32` hash function to hash the given argument on host and device. + * + * XXHash_32 implementation from + * https://github.com/Cyan4973/xxHash + * ----------------------------------------------------------------------------- + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (C) 2012-2021 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
/**
 * @brief A `XXHash_32` hash function to hash the given argument on host and device.
 *
 * XXHash_32 implementation from https://github.com/Cyan4973/xxHash
 * (BSD 2-Clause licensed; see the license notice earlier in this file).
 *
 * NOTE(review): the extracted text had every `<...>` span stripped; the template
 * parameter lists and template arguments below were reconstructed — confirm
 * against the upstream cuco sources before merging.
 *
 * @tparam Key The type of the values to hash
 */
template <typename Key>
struct XXHash_32 {
 private:
  // XXH32 mixing primes, as specified by the xxHash algorithm
  static constexpr std::uint32_t prime1 = 0x9e3779b1u;
  static constexpr std::uint32_t prime2 = 0x85ebca77u;
  static constexpr std::uint32_t prime3 = 0xc2b2ae3du;
  static constexpr std::uint32_t prime4 = 0x27d4eb2fu;
  static constexpr std::uint32_t prime5 = 0x165667b1u;

 public:
  using argument_type = Key;            ///< The type of the values taken as argument
  using result_type   = std::uint32_t;  ///< The type of the hash values produced

  /**
   * @brief Constructs a XXH32 hash function with the given `seed`.
   *
   * @param seed A custom number to randomize the resulting hash value
   */
  __host__ __device__ constexpr XXHash_32(std::uint32_t seed = 0) : seed_{seed} {}

  /**
   * @brief Returns a hash value for its argument, as a value of type `result_type`.
   *
   * @param key The input argument to hash
   * @return The resulting hash value for `key`
   */
  constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
  {
    return compute_hash(reinterpret_cast<std::byte const*>(&key),
                        cuco::experimental::extent<std::size_t, sizeof(Key)>{});
  }

  /**
   * @brief Returns a hash value for a byte sequence, as a value of type `result_type`.
   *
   * @tparam Extent The extent type
   *
   * @param bytes The input argument to hash
   * @param size The extent of the data in bytes
   * @return The resulting hash value
   */
  template <typename Extent>
  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
                                                         Extent size) const noexcept
  {
    std::size_t offset = 0;
    std::uint32_t h32;

    // data can be processed in 16-byte chunks
    if (size >= 16) {
      auto const limit = size - 16;
      std::uint32_t v1 = seed_ + prime1 + prime2;
      std::uint32_t v2 = seed_ + prime2;
      std::uint32_t v3 = seed_;
      std::uint32_t v4 = seed_ - prime1;

      do {
        // pipeline 4*4byte computations
        auto const pipeline_offset = offset / 4;
        v1 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 0) * prime2;
        v1 = rotl(v1, 13);
        v1 *= prime1;
        v2 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 1) * prime2;
        v2 = rotl(v2, 13);
        v2 *= prime1;
        v3 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 2) * prime2;
        v3 = rotl(v3, 13);
        v3 *= prime1;
        v4 += load_chunk<std::uint32_t>(bytes, pipeline_offset + 3) * prime2;
        v4 = rotl(v4, 13);
        v4 *= prime1;
        offset += 16;
      } while (offset <= limit);

      h32 = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18);
    } else {
      h32 = seed_ + prime5;
    }

    h32 += size;

    // remaining data can be processed in 4-byte chunks
    if ((size % 16) >= 4) {
      for (; offset <= size - 4; offset += 4) {
        h32 += load_chunk<std::uint32_t>(bytes, offset / 4) * prime3;
        h32 = rotl(h32, 17) * prime4;
      }
    }

    // the following loop is only needed if the size of the key is not a multiple of the block size
    if (size % 4) {
      while (offset < size) {
        h32 += (std::to_integer<std::uint32_t>(bytes[offset]) & 255) * prime5;
        h32 = rotl(h32, 11) * prime1;
        ++offset;
      }
    }

    return finalize(h32);
  }

 private:
  // 32-bit left rotate
  constexpr __host__ __device__ std::uint32_t rotl(std::uint32_t h, std::int8_t r) const noexcept
  {
    return ((h << r) | (h >> (32 - r)));
  }

  // avalanche helper: final bit-mixing so every input bit affects every output bit
  constexpr __host__ __device__ std::uint32_t finalize(std::uint32_t h) const noexcept
  {
    h ^= h >> 15;
    h *= prime2;
    h ^= h >> 13;
    h *= prime3;
    h ^= h >> 16;
    return h;
  }

  std::uint32_t seed_;  ///< Seed mixed into the hash state
};
/**
 * @brief A `XXHash_64` hash function to hash the given argument on host and device.
 *
 * XXHash_64 implementation from https://github.com/Cyan4973/xxHash
 * (BSD 2-Clause licensed; see the license notice earlier in this file).
 *
 * You can contact the author at:
 * - xxHash homepage: https://www.xxhash.com
 * - xxHash source repository: https://github.com/Cyan4973/xxHash
 *
 * NOTE(review): the extracted text had every `<...>` span stripped; the template
 * parameter lists and template arguments below were reconstructed — confirm
 * against the upstream cuco sources before merging.
 *
 * @tparam Key The type of the values to hash
 */
template <typename Key>
struct XXHash_64 {
 private:
  // XXH64 mixing primes, as specified by the xxHash algorithm
  static constexpr std::uint64_t prime1 = 11400714785074694791ull;
  static constexpr std::uint64_t prime2 = 14029467366897019727ull;
  static constexpr std::uint64_t prime3 = 1609587929392839161ull;
  static constexpr std::uint64_t prime4 = 9650029242287828579ull;
  static constexpr std::uint64_t prime5 = 2870177450012600261ull;

 public:
  using argument_type = Key;            ///< The type of the values taken as argument
  using result_type   = std::uint64_t;  ///< The type of the hash values produced

  /**
   * @brief Constructs a XXH64 hash function with the given `seed`.
   *
   * @param seed A custom number to randomize the resulting hash value
   */
  __host__ __device__ constexpr XXHash_64(std::uint64_t seed = 0) : seed_{seed} {}

  /**
   * @brief Returns a hash value for its argument, as a value of type `result_type`.
   *
   * @param key The input argument to hash
   * @return The resulting hash value for `key`
   */
  constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
  {
    return compute_hash(reinterpret_cast<std::byte const*>(&key),
                        cuco::experimental::extent<std::size_t, sizeof(Key)>{});
  }

  /**
   * @brief Returns a hash value for a byte sequence, as a value of type `result_type`.
   *
   * @tparam Extent The extent type
   *
   * @param bytes The input argument to hash
   * @param size The extent of the data in bytes
   * @return The resulting hash value
   */
  template <typename Extent>
  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
                                                         Extent size) const noexcept
  {
    std::size_t offset = 0;
    std::uint64_t h64;

    // data can be processed in 32-byte chunks
    if (size >= 32) {
      auto const limit = size - 32;
      std::uint64_t v1 = seed_ + prime1 + prime2;
      std::uint64_t v2 = seed_ + prime2;
      std::uint64_t v3 = seed_;
      std::uint64_t v4 = seed_ - prime1;

      do {
        // pipeline 4*8byte computations
        auto const pipeline_offset = offset / 8;
        v1 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 0) * prime2;
        v1 = rotl(v1, 31);
        v1 *= prime1;
        v2 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 1) * prime2;
        v2 = rotl(v2, 31);
        v2 *= prime1;
        v3 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 2) * prime2;
        v3 = rotl(v3, 31);
        v3 *= prime1;
        v4 += load_chunk<std::uint64_t>(bytes, pipeline_offset + 3) * prime2;
        v4 = rotl(v4, 31);
        v4 *= prime1;
        offset += 32;
      } while (offset <= limit);

      h64 = rotl(v1, 1) + rotl(v2, 7) + rotl(v3, 12) + rotl(v4, 18);

      // merge the four accumulators into the hash state
      v1 *= prime2;
      v1 = rotl(v1, 31);
      v1 *= prime1;
      h64 ^= v1;
      h64 = h64 * prime1 + prime4;

      v2 *= prime2;
      v2 = rotl(v2, 31);
      v2 *= prime1;
      h64 ^= v2;
      h64 = h64 * prime1 + prime4;

      v3 *= prime2;
      v3 = rotl(v3, 31);
      v3 *= prime1;
      h64 ^= v3;
      h64 = h64 * prime1 + prime4;

      v4 *= prime2;
      v4 = rotl(v4, 31);
      v4 *= prime1;
      h64 ^= v4;
      h64 = h64 * prime1 + prime4;
    } else {
      h64 = seed_ + prime5;
    }

    h64 += size;

    // remaining data can be processed in 8-byte chunks
    if ((size % 32) >= 8) {
      for (; offset <= size - 8; offset += 8) {
        std::uint64_t k1 = load_chunk<std::uint64_t>(bytes, offset / 8) * prime2;
        k1 = rotl(k1, 31) * prime1;
        h64 ^= k1;
        h64 = rotl(h64, 27) * prime1 + prime4;
      }
    }

    // remaining data can be processed in 4-byte chunks
    if ((size % 8) >= 4) {
      for (; offset <= size - 4; offset += 4) {
        h64 ^= (load_chunk<std::uint32_t>(bytes, offset / 4) & 0xffffffffull) * prime1;
        h64 = rotl(h64, 23) * prime2 + prime3;
      }
    }

    // the following loop is only needed if the size of the key is not a multiple of a previous
    // block size
    if (size % 4) {
      while (offset < size) {
        h64 ^= (std::to_integer<std::uint64_t>(bytes[offset]) & 0xff) * prime5;
        h64 = rotl(h64, 11) * prime1;
        ++offset;
      }
    }
    return finalize(h64);
  }

 private:
  // 64-bit left rotate
  constexpr __host__ __device__ std::uint64_t rotl(std::uint64_t h, std::int8_t r) const noexcept
  {
    return ((h << r) | (h >> (64 - r)));
  }

  // avalanche helper: final bit-mixing so every input bit affects every output bit
  constexpr __host__ __device__ std::uint64_t finalize(std::uint64_t h) const noexcept
  {
    h ^= h >> 33;
    h *= prime2;
    h ^= h >> 29;
    h *= prime3;
    h ^= h >> 32;
    return h;
  }

  std::uint64_t seed_;  ///< Seed mixed into the hash state
};

}  // namespace cuco::detail
/**
 * @brief An open addressing impl class.
 *
 * @note This class should NOT be used directly.
 *
 * NOTE(review): the extracted text had every `<...>` span stripped (template
 * parameter lists, template arguments, and kernel launch configurations
 * `<<<...>>>`). All such spans below were reconstructed and MUST be confirmed
 * against the upstream cuco sources; spots flagged TODO are the least certain.
 *
 * @throw If the size of the given key type is larger than 8 bytes
 * @throw If the size of the given slot type is larger than 16 bytes
 * @throw If the given key type doesn't have unique object representations, i.e.,
 * `cuco::bitwise_comparable_v<Key> == false`
 * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base`
 *
 * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v<Key>`
 * @tparam Value Type used for storage values.
 * @tparam Extent Data structure size type
 * @tparam Scope The scope in which operations will be performed by individual threads.
 * @tparam KeyEqual Binary callable type used to compare two keys for equality
 * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for choices)
 * @tparam Allocator Type of allocator used for device storage
 * @tparam Storage Slot window storage type
 */
template <class Key,
          class Value,
          class Extent,
          cuda::thread_scope Scope,
          class KeyEqual,
          class ProbingScheme,
          class Allocator,
          class Storage>
class open_addressing_impl {
  static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes.");

  static_assert(sizeof(Value) <= 16, "Container does not support slot types larger than 16 bytes.");

  static_assert(
    cuco::is_bitwise_comparable_v<Key>,
    "Key type must have unique object representations or have been explicitly declared as safe for "
    "bitwise comparison via specialization of cuco::is_bitwise_comparable_v<Key>.");

  static_assert(
    std::is_base_of_v<cuco::experimental::detail::probing_scheme_base<ProbingScheme::cg_size>,
                      ProbingScheme>,
    "ProbingScheme must inherit from cuco::detail::probing_scheme_base");

 public:
  static constexpr auto cg_size      = ProbingScheme::cg_size;  ///< CG size used for probing
  static constexpr auto window_size  = Storage::window_size;    ///< Window size used for probing
  static constexpr auto thread_scope = Scope;                   ///< CUDA thread scope

  using key_type   = Key;    ///< Key type
  using value_type = Value;  ///< The storage value type, NOT payload type
  /// Extent type -- TODO confirm the exact `make_window_extent` template arguments upstream
  using extent_type =
    decltype(make_window_extent<cg_size, window_size>(std::declval<Extent>()));
  using size_type = typename extent_type::value_type;  ///< Size type
  using key_equal = KeyEqual;                          ///< Key equality comparator type
  /// Storage type -- TODO confirm the `detail::storage` template argument order upstream
  using storage_type   = detail::storage<Storage, value_type, extent_type, Allocator>;
  using allocator_type = typename storage_type::allocator_type;  ///< Allocator type

  using storage_ref_type = typename storage_type::ref_type;  ///< Non-owning window storage ref type
  using probing_scheme_type = ProbingScheme;                 ///< Probe scheme type

  /**
   * @brief Constructs a statically-sized open addressing data structure with the specified initial
   * capacity, sentinel values and CUDA stream.
   *
   * @note The actual capacity depends on the given `capacity`, the probing scheme, CG size, and the
   * window size and it is computed via the `make_window_extent` factory. Insert operations will not
   * automatically grow the container. Attempting to insert more unique keys than the capacity of
   * the container results in undefined behavior.
   * @note Any `*_sentinel`s are reserved and behavior is undefined when attempting to insert
   * this sentinel value.
   * @note If a non-default CUDA stream is provided, the caller is responsible for synchronizing the
   * stream before the object is first used.
   *
   * @param capacity The requested lower-bound size
   * @param empty_key_sentinel The reserved key value for empty slots
   * @param empty_slot_sentinel The reserved slot value for empty slots
   * @param pred Key equality binary predicate
   * @param probing_scheme Probing scheme
   * @param alloc Allocator used for allocating device storage
   * @param stream CUDA stream used to initialize the data structure
   */
  constexpr open_addressing_impl(Extent capacity,
                                 key_type empty_key_sentinel,
                                 value_type empty_slot_sentinel,
                                 KeyEqual const& pred,
                                 ProbingScheme const& probing_scheme,
                                 Allocator const& alloc,
                                 cuda_stream_ref stream) noexcept
    : empty_key_sentinel_{empty_key_sentinel},
      empty_slot_sentinel_{empty_slot_sentinel},
      predicate_{pred},
      probing_scheme_{probing_scheme},
      storage_{make_window_extent<cg_size, window_size>(capacity), alloc}
  {
    this->clear_async(stream);
  }

  /**
   * @brief Erases all elements from the container. After this call, `size()` returns zero.
   * Invalidates any references, pointers, or iterators referring to contained elements.
   *
   * @param stream CUDA stream this operation is executed in
   */
  void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); }

  /**
   * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
   * zero. Invalidates any references, pointers, or iterators referring to contained elements.
   *
   * @param stream CUDA stream this operation is executed in
   */
  void clear_async(cuda_stream_ref stream) noexcept
  {
    storage_.initialize_async(empty_slot_sentinel_, stream);
  }

  /**
   * @brief Inserts all keys in the range `[first, last)` and returns the number of successful
   * insertions.
   *
   * @note This function synchronizes the given stream. For asynchronous execution use
   * `insert_async`.
   *
   * @tparam InputIt Device accessible random access input iterator whose `value_type` is
   * convertible to the container's `value_type`
   * @tparam Ref Type of non-owning device container ref allowing access to storage
   *
   * @param first Beginning of the sequence of keys
   * @param last End of the sequence of keys
   * @param container_ref Non-owning device container ref used to access the slot storage
   * @param stream CUDA stream used for insert
   *
   * @return Number of successfully inserted keys
   */
  template <typename InputIt, typename Ref>
  size_type insert(InputIt first, InputIt last, Ref container_ref, cuda_stream_ref stream)
  {
    auto const num_keys = cuco::detail::distance(first, last);
    if (num_keys == 0) { return 0; }

    // device-side counter accumulating the number of successful insertions
    auto counter =
      detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
    counter.reset(stream);

    auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

    // insert unconditionally by passing an always-true stencil
    auto const always_true = thrust::constant_iterator<bool>{true};
    detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
        first, num_keys, always_true, thrust::identity{}, counter.data(), container_ref);

    return counter.load_to_host(stream);
  }

  /**
   * @brief Asynchronously inserts all keys in the range `[first, last)`.
   *
   * @tparam InputIt Device accessible random access input iterator whose `value_type` is
   * convertible to the container's `value_type`
   * @tparam Ref Type of non-owning device container ref allowing access to storage
   *
   * @param first Beginning of the sequence of keys
   * @param last End of the sequence of keys
   * @param container_ref Non-owning device container ref used to access the slot storage
   * @param stream CUDA stream used for insert
   */
  template <typename InputIt, typename Ref>
  void insert_async(InputIt first, InputIt last, Ref container_ref, cuda_stream_ref stream) noexcept
  {
    auto const num_keys = cuco::detail::distance(first, last);
    if (num_keys == 0) { return; }

    auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

    auto const always_true = thrust::constant_iterator<bool>{true};
    detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
        first, num_keys, always_true, thrust::identity{}, container_ref);
  }

  /**
   * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns
   * true.
   *
   * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true.
   * @note This function synchronizes the given stream and returns the number of successful
   * insertions. For asynchronous execution use `insert_if_async`.
   *
   * @tparam InputIt Device accessible random access iterator whose `value_type` is
   * convertible to the container's `value_type`
   * @tparam StencilIt Device accessible random access iterator whose value_type is
   * convertible to Predicate's argument type
   * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool`
   * @tparam Ref Type of non-owning device container ref allowing access to storage
   *
   * @param first Beginning of the sequence of key/value pairs
   * @param last End of the sequence of key/value pairs
   * @param stencil Beginning of the stencil sequence
   * @param pred Predicate to test on every element in the range `[stencil, stencil +
   * std::distance(first, last))`
   * @param container_ref Non-owning device container ref used to access the slot storage
   * @param stream CUDA stream used for the operation
   *
   * @return Number of successfully inserted keys
   */
  template <typename InputIt, typename StencilIt, typename Predicate, typename Ref>
  size_type insert_if(InputIt first,
                      InputIt last,
                      StencilIt stencil,
                      Predicate pred,
                      Ref container_ref,
                      cuda_stream_ref stream)
  {
    auto const num_keys = cuco::detail::distance(first, last);
    if (num_keys == 0) { return 0; }

    auto counter =
      detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
    counter.reset(stream);

    auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

    detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
        first, num_keys, stencil, pred, counter.data(), container_ref);

    return counter.load_to_host(stream);
  }

  /**
   * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding
   * stencil returns true.
   *
   * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true.
   *
   * @tparam InputIt Device accessible random access iterator whose `value_type` is
   * convertible to the container's `value_type`
   * @tparam StencilIt Device accessible random access iterator whose value_type is
   * convertible to Predicate's argument type
   * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool`
   * @tparam Ref Type of non-owning device container ref allowing access to storage
   *
   * @param first Beginning of the sequence of key/value pairs
   * @param last End of the sequence of key/value pairs
   * @param stencil Beginning of the stencil sequence
   * @param pred Predicate to test on every element in the range `[stencil, stencil +
   * std::distance(first, last))`
   * @param container_ref Non-owning device container ref used to access the slot storage
   * @param stream CUDA stream used for the operation
   */
  template <typename InputIt, typename StencilIt, typename Predicate, typename Ref>
  void insert_if_async(InputIt first,
                       InputIt last,
                       StencilIt stencil,
                       Predicate pred,
                       Ref container_ref,
                       cuda_stream_ref stream) noexcept
  {
    auto const num_keys = cuco::detail::distance(first, last);
    if (num_keys == 0) { return; }

    auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

    detail::insert_if_n<cg_size, cuco::detail::default_block_size()>
      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
        first, num_keys, stencil, pred, container_ref);
  }

  /**
   * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in
   * the container.
   *
   * @tparam InputIt Device accessible input iterator
   * @tparam OutputIt Device accessible output iterator assignable from `bool`
   * @tparam Ref Type of non-owning device container ref allowing access to storage
   *
   * @param first Beginning of the sequence of keys
   * @param last End of the sequence of keys
   * @param output_begin Beginning of the sequence of booleans for the presence of each key
   * @param container_ref Non-owning device container ref used to access the slot storage
   * @param stream Stream used for executing the kernels
   */
  template <typename InputIt, typename OutputIt, typename Ref>
  void contains_async(InputIt first,
                      InputIt last,
                      OutputIt output_begin,
                      Ref container_ref,
                      cuda_stream_ref stream) const noexcept
  {
    auto const num_keys = cuco::detail::distance(first, last);
    if (num_keys == 0) { return; }

    auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

    auto const always_true = thrust::constant_iterator<bool>{true};
    detail::contains_if_n<cg_size, cuco::detail::default_block_size()>
      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
        first, num_keys, always_true, thrust::identity{}, output_begin, container_ref);
  }

  /**
   * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in
   * the container if `pred` of the corresponding stencil returns true.
   *
   * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)`
   * indicating if the key `*(first + i)` is present in the container. If `pred( *(stencil + i) )`
   * is false, stores false to `(output_begin + i)`.
   *
   * @tparam InputIt Device accessible input iterator
   * @tparam StencilIt Device accessible random access iterator whose value_type is
   * convertible to Predicate's argument type
   * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool`
   * @tparam OutputIt Device accessible output iterator assignable from `bool`
   * @tparam Ref Type of non-owning device container ref allowing access to storage
   *
   * @param first Beginning of the sequence of keys
   * @param last End of the sequence of keys
   * @param stencil Beginning of the stencil sequence
   * @param pred Predicate to test on every element in the range `[stencil, stencil +
   * std::distance(first, last))`
   * @param output_begin Beginning of the sequence of booleans for the presence of each key
   * @param container_ref Non-owning device container ref used to access the slot storage
   * @param stream Stream used for executing the kernels
   */
  template <typename InputIt,
            typename StencilIt,
            typename Predicate,
            typename OutputIt,
            typename Ref>
  void contains_if_async(InputIt first,
                         InputIt last,
                         StencilIt stencil,
                         Predicate pred,
                         OutputIt output_begin,
                         Ref container_ref,
                         cuda_stream_ref stream) const noexcept
  {
    auto const num_keys = cuco::detail::distance(first, last);
    if (num_keys == 0) { return; }

    auto const grid_size = cuco::detail::grid_size(num_keys, cg_size);

    detail::contains_if_n<cg_size, cuco::detail::default_block_size()>
      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
        first, num_keys, stencil, pred, output_begin, container_ref);
  }

  /**
   * @brief Retrieves all keys contained in the container.
   *
   * @note This API synchronizes the given stream.
   * @note The order in which keys are returned is implementation defined and not guaranteed to be
   * consistent between subsequent calls to `retrieve_all`.
   * @note Behavior is undefined if the range beginning at `output_begin` is smaller than the return
   * value of `size()`.
   *
   * @tparam InputIt Device accessible container slot iterator
   * @tparam OutputIt Device accessible random access output iterator whose `value_type` is
   * convertible from the container's `value_type`
   * @tparam Predicate Type of predicate indicating if the given slot is filled
   *
   * @param begin Beginning of the container slot iterator
   * @param output_begin Beginning output iterator for keys
   * @param is_filled Predicate indicating if the given slot is filled
   * @param stream CUDA stream used for this operation
   *
   * @return Iterator indicating the end of the output
   */
  template <typename InputIt, typename OutputIt, typename Predicate>
  [[nodiscard]] OutputIt retrieve_all(InputIt begin,
                                      OutputIt output_begin,
                                      Predicate const& is_filled,
                                      cuda_stream_ref stream) const
  {
    std::size_t temp_storage_bytes = 0;
    using temp_allocator_type =
      typename std::allocator_traits<allocator_type>::template rebind_alloc<char>;
    auto temp_allocator = temp_allocator_type{this->allocator()};
    auto d_num_out      = reinterpret_cast<size_type*>(
      std::allocator_traits<temp_allocator_type>::allocate(temp_allocator, sizeof(size_type)));
    // first call queries the required temporary storage size (null storage pointer)
    CUCO_CUDA_TRY(cub::DeviceSelect::If(nullptr,
                                        temp_storage_bytes,
                                        begin,
                                        output_begin,
                                        d_num_out,
                                        this->capacity(),
                                        is_filled,
                                        stream));

    // Allocate temporary storage
    auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes);

    CUCO_CUDA_TRY(cub::DeviceSelect::If(d_temp_storage,
                                        temp_storage_bytes,
                                        begin,
                                        output_begin,
                                        d_num_out,
                                        this->capacity(),
                                        is_filled,
                                        stream));

    size_type h_num_out;
    CUCO_CUDA_TRY(
      cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
    // synchronize so h_num_out is valid before it is read on the host below
    stream.synchronize();
    std::allocator_traits<temp_allocator_type>::deallocate(
      temp_allocator, reinterpret_cast<char*>(d_num_out), sizeof(size_type));
    temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);

    return output_begin + h_num_out;
  }

  /**
   * @brief Gets the number of elements in the container.
   *
   * @note This function synchronizes the given stream.
   *
   * @tparam Predicate Type of predicate indicating if the given slot is filled
   *
   * @param is_filled Predicate indicating if the given slot is filled
   * @param stream CUDA stream used to get the number of inserted elements
   *
   * @return The number of elements in the container
   */
  template <typename Predicate>
  [[nodiscard]] size_type size(Predicate const& is_filled, cuda_stream_ref stream) const noexcept
  {
    auto counter =
      detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
    counter.reset(stream);

    auto const grid_size = cuco::detail::grid_size(storage_.num_windows());

    // TODO: custom kernel to be replaced by cub::DeviceReduce::Sum when cub version is bumped to
    // v2.1.0
    detail::size<cuco::detail::default_block_size()>
      <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
        storage_.ref(), is_filled, counter.data());

    return counter.load_to_host(stream);
  }

  /**
   * @brief Gets the maximum number of elements the container can hold.
   *
   * @return The maximum number of elements the container can hold
   */
  [[nodiscard]] constexpr auto capacity() const noexcept { return storage_.capacity(); }

  /**
   * @brief Gets the sentinel value used to represent an empty key slot.
   *
   * @return The sentinel value used to represent an empty key slot
   */
  [[nodiscard]] constexpr key_type empty_key_sentinel() const noexcept
  {
    return empty_key_sentinel_;
  }

  /**
   * @brief Gets the key comparator.
   *
   * @return The comparator used to compare keys
   */
  [[nodiscard]] constexpr key_equal key_eq() const noexcept { return predicate_; }

  /**
   * @brief Gets the probing scheme.
   *
   * @return The probing scheme used for the container
   */
  [[nodiscard]] constexpr probing_scheme_type const& probing_scheme() const noexcept
  {
    return probing_scheme_;
  }

  /**
   * @brief Gets the container allocator.
   *
   * @return The container allocator
   */
  [[nodiscard]] constexpr allocator_type allocator() const noexcept { return storage_.allocator(); }

  /**
   * @brief Gets the non-owning storage ref.
   *
   * @return The non-owning storage ref of the container
   */
  [[nodiscard]] constexpr storage_ref_type storage_ref() const noexcept { return storage_.ref(); }

 protected:
  key_type empty_key_sentinel_;         ///< Key value that represents an empty slot
  value_type empty_slot_sentinel_;      ///< Slot value that represents an empty slot
  key_equal predicate_;                 ///< Key equality binary predicate
  probing_scheme_type probing_scheme_;  ///< Probing scheme
  storage_type storage_;                ///< Slot window storage
};

}  // namespace detail
}  // namespace experimental
}  // namespace cuco

// --- include/cuco/detail/open_addressing_ref_impl.cuh (next file in the patch;
// license header, includes, and the truncated open_addressing_ref_impl class
// are outside the scope of this reconstruction) --------------------------------

/// Three-way insert result enum
enum class insert_result : int32_t { CONTINUE = 0, SUCCESS = 1, DUPLICATE = 2 };
+ */ +struct window_probing_results { + detail::equal_result state_; ///< Equal result + int32_t intra_window_index_; ///< Intra-window index + + /** + * @brief Constructs window_probing_results. + * + * @param state The three way equality result + * @param index Intra-window index + */ + __device__ explicit constexpr window_probing_results(detail::equal_result state, + int32_t index) noexcept + : state_{state}, intra_window_index_{index} + { + } +}; + +/** + * @brief Common device non-owning "ref" implementation class. + * + * @note This class should NOT be used directly. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for options) + * @tparam StorageRef Storage ref type + */ +template +class open_addressing_ref_impl { + static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + + static_assert( + cuco::is_bitwise_comparable_v, + "Key type must have unique object representations or have been explicitly declared as safe for " + "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); + + static_assert( + std::is_base_of_v, + ProbingScheme>, + "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); + + // TODO: how to re-enable this check? 
+ // static_assert(is_window_extent_v, + // "Extent is not a valid cuco::window_extent"); + + public: + using key_type = Key; ///< Key type + using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using storage_ref_type = StorageRef; ///< Type of storage ref + using window_type = typename storage_ref_type::window_type; ///< Window type + using value_type = typename storage_ref_type::value_type; ///< Storage element type + using extent_type = typename storage_ref_type::extent_type; ///< Extent type + using size_type = typename storage_ref_type::size_type; ///< Probing scheme size type + using iterator = typename storage_ref_type::iterator; ///< Slot iterator type + using const_iterator = typename storage_ref_type::const_iterator; ///< Const slot iterator type + + static constexpr auto cg_size = probing_scheme_type::cg_size; ///< Cooperative group size + static constexpr auto window_size = + storage_ref_type::window_size; ///< Number of elements handled per window + + /** + * @brief Constructs open_addressing_ref_impl. + * + * @param empty_slot_sentinel Sentinel indicating an empty slot + * @param probing_scheme Probing scheme + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr open_addressing_ref_impl( + value_type empty_slot_sentinel, + probing_scheme_type const& probing_scheme, + storage_ref_type storage_ref) noexcept + : empty_slot_sentinel_{empty_slot_sentinel}, + probing_scheme_{probing_scheme}, + storage_ref_{storage_ref} + { + } + + /** + * @brief Gets the maximum number of elements the container can hold. + * + * @return The maximum number of elements the container can hold + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept + { + return storage_ref_.capacity(); + } + + /** + * @brief Returns a const_iterator to one past the last slot. 
+ * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr const_iterator end() const noexcept + { + return storage_ref_.end(); + } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept { return storage_ref_.end(); } + + /** + * @brief Inserts an element. + * + * @tparam HasPayload Boolean indicating it's a set or map implementation + * @tparam Predicate Predicate type + * + * @param value The element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return True if the given element is successfully inserted + */ + template + __device__ bool insert(value_type const& value, Predicate const& predicate) noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + + auto const key = [&]() { + if constexpr (HasPayload) { + return value.first; + } else { + return value; + } + }(); + auto probing_iter = probing_scheme_(key, storage_ref_.window_extent()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + for (auto& slot_content : window_slots) { + auto const eq_res = predicate(slot_content, key); + + // If the key is already in the container, return false + if (eq_res == detail::equal_result::EQUAL) { return false; } + if (eq_res == detail::equal_result::EMPTY) { + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + switch (attempt_insert( + (storage_ref_.data() + *probing_iter)->data() + intra_window_index, value, predicate)) { + case insert_result::CONTINUE: continue; + case insert_result::SUCCESS: return true; + case insert_result::DUPLICATE: return false; + } + } + } + ++probing_iter; + } + } + + /** + * @brief Inserts an element. 
+ * + * @tparam HasPayload Boolean indicating it's a set or map implementation + * @tparam Predicate Predicate type + * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return True if the given element is successfully inserted + */ + template + __device__ bool insert(cooperative_groups::thread_block_tile const& group, + value_type const& value, + Predicate const& predicate) noexcept + { + auto const key = [&]() { + if constexpr (HasPayload) { + return value.first; + } else { + return value; + } + }(); + auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (predicate(window_slots[i], key)) { + case detail::equal_result::EMPTY: + return window_probing_results{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: + return window_probing_results{detail::equal_result::EQUAL, i}; + default: continue; + } + } + // returns dummy index `-1` for UNEQUAL + return window_probing_results{detail::equal_result::UNEQUAL, -1}; + }(); + + // If the key is already in the container, return false + if (group.any(state == detail::equal_result::EQUAL)) { return false; } + + auto const group_contains_empty = group.ballot(state == detail::equal_result::EMPTY); + + if (group_contains_empty) { + auto const src_lane = __ffs(group_contains_empty) - 1; + auto const status = + (group.thread_rank() == src_lane) + ? 
attempt_insert( + (storage_ref_.data() + *probing_iter)->data() + intra_window_index, + value, + predicate) + : insert_result::CONTINUE; + + switch (group.shfl(status, src_lane)) { + case insert_result::SUCCESS: return true; + case insert_result::DUPLICATE: return false; + default: continue; + } + } else { + ++probing_iter; + } + } + } + + /** + * @brief Inserts the given element into the container. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. + * + * @tparam HasPayload Boolean indicating it's a set or map implementation + * @tparam Predicate Predicate type + * + * @param value The element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. + */ + template + __device__ thrust::pair insert_and_find(value_type const& value, + Predicate const& predicate) noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + + auto const key = [&]() { + if constexpr (HasPayload) { + return value.first; + } else { + return value; + } + }(); + auto probing_iter = probing_scheme_(key, storage_ref_.window_extent()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + for (auto i = 0; i < window_size; ++i) { + auto const eq_res = predicate(window_slots[i], key); + auto* window_ptr = (storage_ref_.data() + *probing_iter)->data(); + + // If the key is already in the container, return false + if (eq_res == detail::equal_result::EQUAL) { return {iterator{&window_ptr[i]}, false}; } + if (eq_res == detail::equal_result::EMPTY) { + switch ([&]() { + if constexpr (sizeof(value_type) <= 8) { + return packed_cas(window_ptr + i, value, predicate); + } else { + return 
cas_dependent_write(window_ptr + i, value, predicate); + } + }()) { + case insert_result::SUCCESS: { + return {iterator{&window_ptr[i]}, true}; + } + case insert_result::DUPLICATE: { + return {iterator{&window_ptr[i]}, false}; + } + default: continue; + } + } + } + ++probing_iter; + }; + } + + /** + * @brief Inserts the given element into the container. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. + * + * @tparam HasPayload Boolean indicating it's a set or map implementation + * @tparam Predicate Predicate type + * + * @param group The Cooperative Group used to perform group insert_and_find + * @param value The element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. 
+ */ + template + __device__ thrust::pair insert_and_find( + cooperative_groups::thread_block_tile const& group, + value_type const& value, + Predicate const& predicate) noexcept + { + auto const key = [&]() { + if constexpr (HasPayload) { + return value.first; + } else { + return value; + } + }(); + auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (predicate(window_slots[i], key)) { + case detail::equal_result::EMPTY: + return window_probing_results{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: + return window_probing_results{detail::equal_result::EQUAL, i}; + default: continue; + } + } + // returns dummy index `-1` for UNEQUAL + return window_probing_results{detail::equal_result::UNEQUAL, -1}; + }(); + + auto* slot_ptr = (storage_ref_.data() + *probing_iter)->data() + intra_window_index; + + // If the key is already in the container, return false + auto const group_finds_equal = group.ballot(state == detail::equal_result::EQUAL); + if (group_finds_equal) { + auto const src_lane = __ffs(group_finds_equal) - 1; + auto const res = group.shfl(reinterpret_cast(slot_ptr), src_lane); + return {iterator{reinterpret_cast(res)}, false}; + } + + auto const group_contains_empty = group.ballot(state == detail::equal_result::EMPTY); + if (group_contains_empty) { + auto const src_lane = __ffs(group_contains_empty) - 1; + auto const res = group.shfl(reinterpret_cast(slot_ptr), src_lane); + auto const status = [&]() { + if (group.thread_rank() != src_lane) { return insert_result::CONTINUE; } + if constexpr (sizeof(value_type) <= 8) { + return packed_cas(slot_ptr, value, predicate); + } else { + return cas_dependent_write(slot_ptr, value, predicate); + } + }(); + + switch (group.shfl(status, src_lane)) { + case insert_result::SUCCESS: { + return 
{iterator{reinterpret_cast(res)}, true}; + } + case insert_result::DUPLICATE: { + return {iterator{reinterpret_cast(res)}, false}; + } + default: continue; + } + } else { + ++probing_iter; + } + } + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns + * false. + * + * @tparam ProbeKey Probe key type + * @tparam Predicate Predicate type + * + * @param key The key to search for + * @param predicate Predicate used to compare slot content against `key` + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains(ProbeKey const& key, + Predicate const& predicate) const noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + auto probing_iter = probing_scheme_(key, storage_ref_.window_extent()); + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const window_slots = storage_ref_[*probing_iter]; + + for (auto& slot_content : window_slots) { + switch (predicate(slot_content, key)) { + case detail::equal_result::UNEQUAL: continue; + case detail::equal_result::EMPTY: return false; + case detail::equal_result::EQUAL: return true; + } + } + ++probing_iter; + } + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns + * false. 
+ * + * @tparam ProbeKey Probe key type + * @tparam Predicate Predicate type + * + * @param group The Cooperative Group used to perform group contains + * @param key The key to search for + * @param predicate Predicate used to compare slot content against `key` + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains( + cooperative_groups::thread_block_tile const& group, + ProbeKey const& key, + Predicate const& predicate) const noexcept + { + auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + auto const state = [&]() { + for (auto& slot : window_slots) { + switch (predicate(slot, key)) { + case detail::equal_result::EMPTY: return detail::equal_result::EMPTY; + case detail::equal_result::EQUAL: return detail::equal_result::EQUAL; + default: continue; + } + } + return detail::equal_result::UNEQUAL; + }(); + + if (group.any(state == detail::equal_result::EQUAL)) { return true; } + if (group.any(state == detail::equal_result::EMPTY)) { return false; } + + ++probing_iter; + } + } + + /** + * @brief Finds an element in the container with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. 
+ * + * @tparam ProbeKey Probe key type + * @tparam Predicate Predicate type + * + * @param key The key to search for + * @param predicate Predicate used to compare slot content against `key` + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator find(ProbeKey const& key, + Predicate const& predicate) const noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + auto probing_iter = probing_scheme_(key, storage_ref_.window_extent()); + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const window_slots = storage_ref_[*probing_iter]; + + for (auto i = 0; i < window_size; ++i) { + switch (predicate(window_slots[i], key)) { + case detail::equal_result::EMPTY: { + return this->end(); + } + case detail::equal_result::EQUAL: { + return const_iterator{&(*(storage_ref_.data() + *probing_iter))[i]}; + } + default: continue; + } + } + ++probing_iter; + } + } + + /** + * @brief Finds an element in the container with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. 
+ * + * @tparam ProbeKey Probe key type + * @tparam Predicate Predicate type + * + * @param group The Cooperative Group used to perform this operation + * @param key The key to search for + * @param predicate Predicate used to compare slot content against `key` + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator + find(cooperative_groups::thread_block_tile const& group, + ProbeKey const& key, + Predicate const& predicate) const noexcept + { + auto probing_iter = probing_scheme_(group, key, storage_ref_.window_extent()); + + while (true) { + auto const window_slots = storage_ref_[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (predicate(window_slots[i], key)) { + case detail::equal_result::EMPTY: + return window_probing_results{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: + return window_probing_results{detail::equal_result::EQUAL, i}; + default: continue; + } + } + // returns dummy index `-1` for UNEQUAL + return window_probing_results{detail::equal_result::UNEQUAL, -1}; + }(); + + // Find a match for the probe key, thus return an iterator to the entry + auto const group_finds_match = group.ballot(state == detail::equal_result::EQUAL); + if (group_finds_match) { + auto const src_lane = __ffs(group_finds_match) - 1; + auto const res = group.shfl( + reinterpret_cast(&(*(storage_ref_.data() + *probing_iter))[intra_window_index]), + src_lane); + return const_iterator{reinterpret_cast(res)}; + } + + // Find an empty slot, meaning that the probe key isn't present in the container + if (group.any(state == detail::equal_result::EMPTY)) { return this->end(); } + + ++probing_iter; + } + } + + /** + * @brief Compares the content of the address `address` (old value) with the `expected` value and, + * only if they are the same, sets the content of `address` to `desired`. 
+ * + * @tparam T Address content type + * + * @param address The target address + * @param expected The value expected to be found at the target address + * @param desired The value to store at the target address if it is as expected + * + * @return The old value located at address `address` + */ + template + __device__ constexpr auto compare_and_swap(T* address, T expected, T desired) + { + // temporary workaround due to performance regression + // https://github.com/NVIDIA/libcudacxx/issues/366 + if constexpr (sizeof(T) == sizeof(unsigned int)) { + auto* const slot_ptr = reinterpret_cast(address); + auto const* const expected_ptr = reinterpret_cast(&expected); + auto const* const desired_ptr = reinterpret_cast(&desired); + if constexpr (Scope == cuda::thread_scope_system) { + return atomicCAS_system(slot_ptr, *expected_ptr, *desired_ptr); + } else if constexpr (Scope == cuda::thread_scope_device) { + return atomicCAS(slot_ptr, *expected_ptr, *desired_ptr); + } else if constexpr (Scope == cuda::thread_scope_block) { + return atomicCAS_block(slot_ptr, *expected_ptr, *desired_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } else if constexpr (sizeof(T) == sizeof(unsigned long long int)) { + auto* const slot_ptr = reinterpret_cast(address); + auto const* const expected_ptr = reinterpret_cast(&expected); + auto const* const desired_ptr = reinterpret_cast(&desired); + if constexpr (Scope == cuda::thread_scope_system) { + return atomicCAS_system(slot_ptr, *expected_ptr, *desired_ptr); + } else if constexpr (Scope == cuda::thread_scope_device) { + return atomicCAS(slot_ptr, *expected_ptr, *desired_ptr); + } else if constexpr (Scope == cuda::thread_scope_block) { + return atomicCAS_block(slot_ptr, *expected_ptr, *desired_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + } + + /** + * @brief Atomically stores `value` at the given `address`. 
+ * + * @tparam T Address content type + * + * @param address The target address + * @param value The value to store + */ + template + __device__ constexpr void atomic_store(T* address, T value) + { + if constexpr (sizeof(T) == sizeof(unsigned int)) { + auto* const slot_ptr = reinterpret_cast(address); + auto const* const value_ptr = reinterpret_cast(&value); + if constexpr (Scope == cuda::thread_scope_system) { + atomicExch_system(slot_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_device) { + atomicExch(slot_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_block) { + atomicExch_block(slot_ptr, *value_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } else if constexpr (sizeof(T) == sizeof(unsigned long long int)) { + auto* const slot_ptr = reinterpret_cast(address); + auto const* const value_ptr = reinterpret_cast(&value); + if constexpr (Scope == cuda::thread_scope_system) { + atomicExch_system(slot_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_device) { + atomicExch(slot_ptr, *value_ptr); + } else if constexpr (Scope == cuda::thread_scope_block) { + atomicExch_block(slot_ptr, *value_ptr); + } else { + static_assert(cuco::dependent_false, "Unsupported thread scope"); + } + } + } + + /** + * @brief Gets the sentinel used to represent an empty slot. + * + * @return The sentinel value used to represent an empty slot + */ + [[nodiscard]] __device__ constexpr value_type empty_slot_sentinel() const noexcept + { + return empty_slot_sentinel_; + } + + /** + * @brief Gets the probing scheme. + * + * @return The probing scheme used for the container + */ + [[nodiscard]] __device__ constexpr probing_scheme_type const& probing_scheme() const noexcept + { + return probing_scheme_; + } + + /** + * @brief Gets the non-owning storage ref. 
+ * + * @return The non-owning storage ref of the container + */ + [[nodiscard]] __device__ constexpr storage_ref_type storage_ref() const noexcept + { + return storage_ref_; + } + + private: + /** + * @brief Inserts the specified element with one single CAS operation. + * + * @tparam HasPayload Boolean indicating it's a set or map implementation + * @tparam Predicate Predicate type + * + * @param slot Pointer to the slot in memory + * @param value Element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + template + [[nodiscard]] __device__ constexpr insert_result packed_cas(value_type* slot, + value_type const& value, + Predicate const& predicate) noexcept + { + auto old = compare_and_swap(slot, this->empty_slot_sentinel_, value); + auto* old_ptr = reinterpret_cast(&old); + auto const inserted = [&]() { + if constexpr (HasPayload) { + // If it's a map implementation, compare keys only + return cuco::detail::bitwise_compare(old_ptr->first, this->empty_slot_sentinel_.first); + } else { + // If it's a set implementation, compare the whole slot content + return cuco::detail::bitwise_compare(*old_ptr, this->empty_slot_sentinel_); + } + }(); + if (inserted) { + return insert_result::SUCCESS; + } else { + // Shouldn't use `predicate` operator directly since it includes a redundant bitwise compare + auto const res = [&]() { + if constexpr (HasPayload) { + // If it's a map implementation, compare keys only + return predicate.equal_to(old_ptr->first, value.first); + } else { + // If it's a set implementation, compare the whole slot content + return predicate.equal_to(*old_ptr, value); + } + }(); + return res == detail::equal_result::EQUAL ? insert_result::DUPLICATE + : insert_result::CONTINUE; + } + } + + /** + * @brief Inserts the specified element with two back-to-back CAS operations. 
+ * + * @tparam Predicate Predicate type + * + * @param slot Pointer to the slot in memory + * @param value Element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + template + [[nodiscard]] __device__ constexpr insert_result back_to_back_cas( + value_type* slot, value_type const& value, Predicate const& predicate) noexcept + { + auto const expected_key = this->empty_slot_sentinel_.first; + auto const expected_payload = this->empty_slot_sentinel_.second; + + auto old_key = compare_and_swap(&slot->first, expected_key, value.first); + auto old_payload = compare_and_swap(&slot->second, expected_payload, value.second); + + using mapped_type = decltype(expected_payload); + + auto* old_key_ptr = reinterpret_cast(&old_key); + auto* old_payload_ptr = reinterpret_cast(&old_payload); + + // if key success + if (cuco::detail::bitwise_compare(*old_key_ptr, expected_key)) { + while (not cuco::detail::bitwise_compare(*old_payload_ptr, expected_payload)) { + old_payload = compare_and_swap(&slot->second, expected_payload, value.second); + } + return insert_result::SUCCESS; + } else if (cuco::detail::bitwise_compare(*old_payload_ptr, expected_payload)) { + atomic_store(&slot->second, expected_payload); + } + + // Our key was already present in the slot, so our key is a duplicate + // Shouldn't use `predicate` operator directly since it includes a redundant bitwise compare + if (predicate.equal_to(*old_key_ptr, value.first) == detail::equal_result::EQUAL) { + return insert_result::DUPLICATE; + } + + return insert_result::CONTINUE; + } + + /** + * @brief Inserts the specified element with CAS-dependent write operations. 
+ * + * @tparam Predicate Predicate type + * + * @param slot Pointer to the slot in memory + * @param value Element to insert + * @param predicate Predicate used to compare slot content against `key` + * + * @return Result of this operation, i.e., success/continue/duplicate + */ + template + [[nodiscard]] __device__ constexpr insert_result cas_dependent_write( + value_type* slot, value_type const& value, Predicate const& predicate) noexcept + { + auto const expected_key = this->empty_slot_sentinel_.first; + + auto old_key = compare_and_swap(&slot->first, expected_key, value.first); + + auto* old_key_ptr = reinterpret_cast(&old_key); + + // if key success + if (cuco::detail::bitwise_compare(*old_key_ptr, expected_key)) { + atomic_store(&slot->second, value.second); + return insert_result::SUCCESS; + } + + // Our key was already present in the slot, so our key is a duplicate + // Shouldn't use `predicate` operator directly since it includes a redundant bitwise compare + if (predicate.equal_to(*old_key_ptr, value.first) == detail::equal_result::EQUAL) { + return insert_result::DUPLICATE; + } + + return insert_result::CONTINUE; + } + + /** + * @brief Attempts to insert an element into a slot. + * + * @note Dispatches the correct implementation depending on the container + * type and presence of other operator mixins. 
+ *
+ * @tparam HasPayload Boolean indicating it's a set or map implementation
+ * @tparam Predicate Predicate type
+ *
+ * @param slot Pointer to the slot in memory
+ * @param value Element to insert
+ * @param predicate Predicate used to compare slot content against `key`
+ *
+ * @return Result of this operation, i.e., success/continue/duplicate
+ */
+ template
+ [[nodiscard]] __device__ insert_result attempt_insert(value_type* slot,
+ value_type const& value,
+ Predicate const& predicate) noexcept
+ {
+ if constexpr (sizeof(value_type) <= 8) {
+ return packed_cas(slot, value, predicate);
+ } else {
+#if (__CUDA_ARCH__ < 700)
+ return cas_dependent_write(slot, value, predicate);
+#else
+ return back_to_back_cas(slot, value, predicate);
+#endif
+ }
+ }
+
+ value_type empty_slot_sentinel_; ///< Sentinel value indicating an empty slot
+ probing_scheme_type probing_scheme_; ///< Probing scheme
+ storage_ref_type storage_ref_; ///< Slot storage ref
+};
+
+} // namespace detail
+} // namespace experimental
+} // namespace cuco
diff --git a/include/cuco/detail/operator.inl b/include/cuco/detail/operator.inl
new file mode 100644
index 000000000..fdd5884e8
--- /dev/null
+++ b/include/cuco/detail/operator.inl
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief CRTP mixin which augments a given `Reference` with an `Operator`. + * + * @throw If the operator is not defined in `include/cuco/operator.hpp` + * + * @tparam Operator Operator type, i.e., `cuco::op::*_tag` + * @tparam Reference The reference type. + * + * @note This primary template should never be instantiated. + */ +template +class operator_impl { + static_assert(cuco::dependent_false, + "Operator type is not supported by reference type."); +}; + +/** + * @brief Checks if the given `Operator` is contained in a list of `Operators`. + * + * @tparam Operator Operator type, i.e., `cuco::op::*_tag` + * @tparam Operators List of operators to search in + * + * @return `true` if `Operator` is contained in `Operators`, `false` otherwise. + */ +template +static constexpr bool has_operator() +{ + return ((std::is_same_v) || ...); +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/pair.cuh b/include/cuco/detail/pair.cuh deleted file mode 100644 index 7ea39889c..000000000 --- a/include/cuco/detail/pair.cuh +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include - -#include -#include -#include - -namespace cuco { -namespace detail { - -/** - * @brief Rounds `v` to the nearest power of 2 greater than or equal to `v`. - * - * @param v - * @return The nearest power of 2 greater than or equal to `v`. - */ -constexpr std::size_t next_pow2(std::size_t v) noexcept -{ - --v; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return ++v; -} - -/** - * @brief Gives value to use as alignment for a pair type that is at least the - * size of the sum of the size of the first type and second type, or 16, - * whichever is smaller. - */ -template -constexpr std::size_t pair_alignment() -{ - return std::min(std::size_t{16}, next_pow2(sizeof(First) + sizeof(Second))); -} - -template -struct is_std_pair_like : std::false_type { -}; - -template -struct is_std_pair_like< - T, - std::void_t(std::declval())), decltype(std::get<1>(std::declval()))>> - : std::conditional_t::value == 2, std::true_type, std::false_type> { -}; - -template -struct is_thrust_pair_like_impl : std::false_type { -}; - -template -struct is_thrust_pair_like_impl(std::declval())), - decltype(thrust::get<1>(std::declval()))>> - : std::conditional_t::value == 2, std::true_type, std::false_type> { -}; - -template -struct is_thrust_pair_like - : is_thrust_pair_like_impl< - std::remove_reference_t()))>> { -}; - -/** - * @brief Denotes the equivalent packed type based on the size of the object. - * - * @tparam N The size of the object - */ -template -struct packed { - using type = void; ///< `void` type by default -}; -/** - * @brief Denotes the packed type when the size of the object is 8. - */ -template <> -struct packed { - using type = uint64_t; ///< Packed type as `uint64_t` if the size of the object is 8 -}; -/** - * @brief Denotes the packed type when the size of the object is 4. 
- */ -template <> -struct packed { - using type = uint32_t; ///< Packed type as `uint32_t` if the size of the object is 4 -}; -template -using packed_t = typename packed::type; - -/** - * @brief Indicates if a pair type can be packed. - * - * When the size of the key,value pair being inserted into the hash table is - * equal in size to a type where atomicCAS is natively supported, it is more - * efficient to "pack" the pair and insert it with a single atomicCAS. - * - * Pair types whose key and value have the same object representation may be - * packed. Also, the `pair_type` must not contain any padding bits otherwise - * accessing the packed value would be undefined. - * - * @tparam pair_type The pair type that will be packed - * - * @return true If the pair type can be packed - * @return false If the pair type cannot be packed - */ -template -constexpr bool is_packable() -{ - return not std::is_void>::value and - std::has_unique_object_representations_v; -} - -/** - * @brief Allows viewing a pair in a packed representation. - * - * Used as an optimization for inserting when a pair can be inserted with a - * single atomicCAS - */ -template -union pair_converter { - using packed_type = packed_t; ///< The packed pair type - packed_type packed; ///< The pair in the packed representation - pair_type pair; ///< The pair in the pair representation - - /** - * @brief Constructs a pair converter by copying from `p` - * - * @tparam T Type that is convertible to `pair_type` - * - * @param p The pair to copy from - */ - template - __device__ pair_converter(T&& p) : pair{p} - { - } - - /** - * @brief Constructs a pair converter by copying from `p` - * - * @param p The packed data to copy from - */ - __device__ pair_converter(packed_type p) : packed{p} {} -}; - -} // namespace detail - -/** - * @brief Custom pair type - * - * This is necessary because `thrust::pair` is under aligned. 
- * - * @tparam First Type of the first value in the pair - * @tparam Second Type of the second value in the pair - */ -template -struct alignas(detail::pair_alignment()) pair { - using first_type = First; ///< Type of the first value in the pair - using second_type = Second; ///< Type of the second value in the pair - - pair() = default; - ~pair() = default; - pair(pair const&) = default; ///< Copy constructor - pair(pair&&) = default; ///< Move constructor - - /** - * @brief Replaces the contents of the pair with another pair. - * - * @return Reference of the current pair object - */ - pair& operator=(pair const&) = default; - - /** - * @brief Replaces the contents of the pair with another pair. - * - * @return Reference of the current pair object - */ - pair& operator=(pair&&) = default; - - /** - * @brief Constructs a pair from objects `f` and `s`. - * - * @param f The object to copy into `first` - * @param s The object to copy into `second` - */ - __host__ __device__ constexpr pair(First const& f, Second const& s) : first{f}, second{s} {} - - /** - * @brief Constructs a pair by copying from the given pair `p`. - * - * @tparam F Type of the first value of `p` - * @tparam S Type of the second value of `p` - * - * @param p The pair to copy from - */ - template - __host__ __device__ constexpr pair(pair const& p) : first{p.first}, second{p.second} - { - } - - /** - * @brief Constructs a pair from the given std::pair-like `p`. - * - * @tparam T Type of the pair to copy from - * - * @param p The input pair to copy from - */ - template ::value>* = nullptr> - __host__ __device__ constexpr pair(T const& p) - : pair{std::get<0>(thrust::raw_reference_cast(p)), std::get<1>(thrust::raw_reference_cast(p))} - { - } - - /** - * @brief Constructs a pair from the given thrust::pair-like `p`. 
- * - * @tparam T Type of the pair to copy from - * - * @param p The input pair to copy from - */ - template ::value>* = nullptr> - __host__ __device__ constexpr pair(T const& p) - : pair{thrust::get<0>(thrust::raw_reference_cast(p)), - thrust::get<1>(thrust::raw_reference_cast(p))} - { - } - - First first; ///< The first value in the pair - Second second; ///< The second value in the pair -}; - -template -using pair_type = cuco::pair; - -/** - * @brief Creates a pair of type `pair_type` - * - * @tparam F - * @tparam S - * - * @param f - * @param s - * @return pair_type with first element `f` and second element `s`. - */ -template -__host__ __device__ pair_type make_pair(F&& f, S&& s) noexcept -{ - return pair_type{std::forward(f), std::forward(s)}; -} - -} // namespace cuco diff --git a/include/cuco/detail/pair.inl b/include/cuco/detail/pair.inl new file mode 100644 index 000000000..56d16e4fb --- /dev/null +++ b/include/cuco/detail/pair.inl @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace cuco { + +template +__host__ __device__ constexpr pair::pair(First const& f, Second const& s) + : first{f}, second{s} +{ +} + +template +template +__host__ __device__ constexpr pair::pair(pair const& p) + : first{p.first}, second{p.second} +{ +} + +template +__host__ __device__ constexpr pair, std::decay_t> make_pair(F&& f, + S&& s) noexcept +{ + return pair, std::decay_t>(std::forward(f), std::forward(s)); +} + +template +__host__ __device__ constexpr bool operator==(cuco::pair const& lhs, + cuco::pair const& rhs) noexcept +{ + return lhs.first == rhs.first and lhs.second == rhs.second; +} + +} // namespace cuco diff --git a/include/cuco/detail/prime.hpp b/include/cuco/detail/prime.hpp index 93ddde1a0..c788fa245 100644 --- a/include/cuco/detail/prime.hpp +++ b/include/cuco/detail/prime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,12 +16,18 @@ #pragma once -#include +#include + +#include +#include +#include +#include namespace cuco { namespace detail { -constexpr std::array primes = { +// TODO use CTAD instead of explicitly specifying the array size once we drop support for nvcc <11.5 +inline constexpr std::array primes = { 2, 3, 5, 7, 13, 19, 29, 37, 43, 53, 59, 67, 73, 79, 89, 97, 103, 109, 127, 137, 149, @@ -20129,43 +20135,6 @@ constexpr std::array primes = { 17176447243, 17176578343, 17176709449, 17176840529, 17176971601, 17177102693, 17177233783, 17177364857, 17177495953, 17177627053, 17177758133}; -/** - * @brief Indicates whether the input `num` is a prime number. 
- * - * @param num - * @return A boolean indicating whether the input `num` is a prime number - */ -constexpr bool is_prime(std::size_t num) noexcept -{ - bool flag = true; - // 0 and 1 are not prime numbers - if (num == 0lu || num == 1lu) { - flag = false; - } else { - for (auto i = 2lu; i <= num / 2lu; ++i) { - if (num % i == 0) { - flag = false; - break; - } - } - } - return flag; -} - -/** - * @brief Computes the smallest prime number greater than or equal to `num`. - * - * @param num - * @return The smallest prime number greater than or equal to `num` - */ -constexpr std::size_t compute_prime(std::size_t num) noexcept -{ - while (not is_prime(num)) { - num++; - } - return num; -} - /** * @brief Calculates the valid capacity based on `cg_size` , `vector_width` * and the initial `capacity`. @@ -20177,15 +20146,15 @@ constexpr std::size_t compute_prime(std::size_t num) noexcept * @param capacity The initially requested capacity * @return A valid capacity no smaller than the requested `capacity` */ -template -constexpr std::size_t get_valid_capacity(std::size_t capacity) noexcept +template +constexpr T get_valid_capacity(T capacity) noexcept { auto const stride = [&]() { if constexpr (uses_vector_load) { return cg_size * vector_width; } if constexpr (not uses_vector_load) { return cg_size; } }(); - auto const c = SDIV(capacity, stride); + auto const c = int_div_ceil(capacity, stride); auto const min_prime = std::lower_bound(primes.begin(), primes.end(), c); return *min_prime * stride; } diff --git a/include/cuco/detail/probe_sequence_impl.cuh b/include/cuco/detail/probe_sequence_impl.cuh index 688b2f28f..c108840b2 100644 --- a/include/cuco/detail/probe_sequence_impl.cuh +++ b/include/cuco/detail/probe_sequence_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,8 @@ #pragma once -#include -#include +#include +#include #include @@ -72,13 +72,13 @@ template class probe_sequence_impl_base { protected: - using value_type = cuco::pair_type; ///< Type of key/value pairs + using value_type = cuco::pair; ///< Type of key/value pairs using key_type = Key; ///< Key type using mapped_type = Value; ///< Type of mapped values using atomic_key_type = cuda::atomic; ///< Type of atomic keys using atomic_mapped_type = cuda::atomic; ///< Type of atomic mapped values /// Pair type of atomic key and atomic mapped value - using pair_atomic_type = cuco::pair_type; + using pair_atomic_type = cuco::pair; /// Type of the forward iterator to `pair_atomic_type` using iterator = pair_atomic_type*; /// Type of the forward iterator to `const pair_atomic_type` diff --git a/include/cuco/detail/probing_scheme_base.cuh b/include/cuco/detail/probing_scheme_base.cuh new file mode 100644 index 000000000..03f712155 --- /dev/null +++ b/include/cuco/detail/probing_scheme_base.cuh @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Base class of public probing scheme. + * + * This class should not be used directly. 
+ * + * @tparam CGSize Size of CUDA Cooperative Groups + */ +template +class probing_scheme_base { + public: + /** + * @brief The size of the CUDA cooperative thread group. + */ + static constexpr int32_t cg_size = CGSize; +}; +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme_impl.inl new file mode 100644 index 000000000..3090d026e --- /dev/null +++ b/include/cuco/detail/probing_scheme_impl.inl @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Probing iterator class. 
+ * + * @tparam Extent Type of Extent + */ +template +class probing_iterator { + public: + using extent_type = Extent; ///< Extent type + using size_type = typename extent_type::value_type; ///< Size type + + /** + * @brief Constructs an probing iterator + * + * @param start Iteration starting point + * @param step_size Double hashing step size + * @param upper_bound Upper bound of the iteration + */ + __host__ __device__ constexpr probing_iterator(size_type start, + size_type step_size, + extent_type upper_bound) noexcept + : curr_index_{start}, step_size_{step_size}, upper_bound_{upper_bound} + { + // TODO: revise this API when introducing quadratic probing into cuco + } + + /** + * @brief Dereference operator + * + * @return Current slot index + */ + __host__ __device__ constexpr auto operator*() const noexcept { return curr_index_; } + + /** + * @brief Prefix increment operator + * + * @return Current iterator + */ + __host__ __device__ constexpr auto operator++() noexcept + { + // TODO: step_size_ can be a build time constant (e.g. linear probing) + // Worth passing another extent type? 
+ curr_index_ = (curr_index_ + step_size_) % upper_bound_; + return *this; + } + + /** + * @brief Postfix increment operator + * + * @return Old iterator before increment + */ + __host__ __device__ constexpr auto operator++(int32_t) noexcept + { + auto temp = *this; + ++(*this); + return temp; + } + + private: + size_type curr_index_; + size_type step_size_; + extent_type upper_bound_; +}; +} // namespace detail + +template +__host__ __device__ constexpr linear_probing::linear_probing(Hash const& hash) + : hash_{hash} +{ +} + +template +template +__host__ __device__ constexpr auto linear_probing::operator()( + ProbeKey const& probe_key, Extent upper_bound) const noexcept +{ + using size_type = typename Extent::value_type; + return detail::probing_iterator{ + cuco::detail::sanitize_hash(hash_(probe_key)) % upper_bound, + 1, // step size is 1 + upper_bound}; +} + +template +template +__host__ __device__ constexpr auto linear_probing::operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept +{ + using size_type = typename Extent::value_type; + return detail::probing_iterator{ + cuco::detail::sanitize_hash(hash_(probe_key) + g.thread_rank()) % upper_bound, + cg_size, + upper_bound}; +} + +template +__host__ __device__ constexpr double_hashing::double_hashing( + Hash1 const& hash1, Hash2 const& hash2) + : hash1_{hash1}, hash2_{hash2} +{ +} + +template +template +__host__ __device__ constexpr auto double_hashing::operator()( + ProbeKey const& probe_key, Extent upper_bound) const noexcept +{ + using size_type = typename Extent::value_type; + return detail::probing_iterator{ + cuco::detail::sanitize_hash(hash1_(probe_key)) % upper_bound, + max(size_type{1}, + cuco::detail::sanitize_hash(hash2_(probe_key)) % + upper_bound), // step size in range [1, prime - 1] + upper_bound}; +} + +template +template +__host__ __device__ constexpr auto double_hashing::operator()( + cooperative_groups::thread_block_tile 
const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept +{ + using size_type = typename Extent::value_type; + return detail::probing_iterator{ + cuco::detail::sanitize_hash(hash1_(probe_key) + g.thread_rank()) % upper_bound, + static_cast((cuco::detail::sanitize_hash(hash2_(probe_key)) % + (upper_bound.value() / cg_size - 1) + + 1) * + cg_size), + upper_bound}; // TODO use fast_int operator +} +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index 93059729f..f6f8a9464 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -27,12 +28,11 @@ namespace cuco { template -static_map::static_map( - std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - Allocator const& alloc, - cudaStream_t stream) +static_map::static_map(std::size_t capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + Allocator const& alloc, + cudaStream_t stream) : capacity_{std::max(capacity, std::size_t{1})}, // to avoid dereferencing a nullptr (Issue #72) empty_key_sentinel_{empty_key_sentinel.value}, empty_value_sentinel_{empty_value_sentinel.value}, @@ -52,13 +52,12 @@ static_map::static_map( } template -static_map::static_map( - std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel, - Allocator const& alloc, - cudaStream_t stream) +static_map::static_map(std::size_t capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel, + Allocator const& alloc, + cudaStream_t stream) : capacity_{std::max(capacity, std::size_t{1})}, // to avoid dereferencing a nullptr (Issue #72) empty_key_sentinel_{empty_key_sentinel.value}, 
empty_value_sentinel_{empty_value_sentinel.value}, @@ -66,8 +65,9 @@ static_map::static_map( slot_allocator_{alloc}, counter_allocator_{alloc} { - CUCO_RUNTIME_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_, - "The empty key sentinel and erased key sentinel cannot be the same value."); + CUCO_EXPECTS(empty_key_sentinel_ != erased_key_sentinel_, + "The empty key sentinel and erased key sentinel cannot be the same value.", + std::runtime_error); slots_ = std::allocator_traits::allocate(slot_allocator_, capacity_); num_successes_ = std::allocator_traits::allocate(counter_allocator_, 1); @@ -102,7 +102,7 @@ template void static_map::insert( InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { - auto num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto const block_size = 128; @@ -116,8 +116,8 @@ void static_map::insert( CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); std::size_t h_num_successes; - detail::insert<<>>( - first, first + num_keys, num_successes_, view, hash, key_equal); + detail::insert + <<>>(first, num_keys, num_successes_, view, hash, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); @@ -140,7 +140,7 @@ void static_map::insert_if(InputIt first, KeyEqual key_equal, cudaStream_t stream) { - auto num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto constexpr block_size = 128; @@ -167,10 +167,11 @@ template void static_map::erase( InputIt first, InputIt last, Hash hash, KeyEqual key_equal, cudaStream_t stream) { - CUCO_RUNTIME_EXPECTS(get_empty_key_sentinel() != get_erased_key_sentinel(), - "You must provide a unique erased key sentinel value at map construction."); + CUCO_EXPECTS(get_empty_key_sentinel() != get_erased_key_sentinel(), + "You 
must provide a unique erased key sentinel value at map construction.", + std::runtime_error); - auto num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto constexpr block_size = 128; @@ -184,8 +185,8 @@ void static_map::erase( CUCO_CUDA_TRY(cudaMemsetAsync(num_successes_, 0, sizeof(atomic_ctr_type), stream)); std::size_t h_num_successes; - detail::erase<<>>( - first, first + num_keys, num_successes_, view, hash, key_equal); + detail::erase + <<>>(first, num_keys, num_successes_, view, hash, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_num_successes, num_successes_, sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); @@ -203,7 +204,7 @@ void static_map::find(InputIt first, KeyEqual key_equal, cudaStream_t stream) { - auto num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto const block_size = 128; @@ -213,13 +214,13 @@ void static_map::find(InputIt first, auto view = get_device_view(); detail::find - <<>>(first, last, output_begin, view, hash, key_equal); + <<>>(first, num_keys, output_begin, view, hash, key_equal); } template template std::pair static_map::retrieve_all( - KeyOut keys_out, ValueOut values_out, cudaStream_t stream) + KeyOut keys_out, ValueOut values_out, cudaStream_t stream) const { static_assert(sizeof(pair_atomic_type) == sizeof(value_type)); auto slots_begin = reinterpret_cast(slots_); @@ -259,6 +260,10 @@ std::pair static_map::retrieve_a CUCO_CUDA_TRY( cudaMemcpyAsync(&h_num_out, d_num_out, sizeof(std::size_t), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); + std::allocator_traits::deallocate( + temp_allocator, reinterpret_cast(d_num_out), sizeof(std::size_t)); + std::allocator_traits::deallocate( + temp_allocator, d_temp_storage, temp_storage_bytes); return std::make_pair(keys_out + h_num_out, values_out + h_num_out); } @@ -272,7 +277,7 @@ 
void static_map::contains(InputIt first, KeyEqual key_equal, cudaStream_t stream) const { - auto num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto const block_size = 128; @@ -282,7 +287,7 @@ void static_map::contains(InputIt first, auto view = get_device_view(); detail::contains - <<>>(first, last, output_begin, view, hash, key_equal); + <<>>(first, num_keys, output_begin, view, hash, key_equal); } template diff --git a/include/cuco/detail/static_map/functors.cuh b/include/cuco/detail/static_map/functors.cuh new file mode 100644 index 000000000..f508206f0 --- /dev/null +++ b/include/cuco/detail/static_map/functors.cuh @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace static_map_ns { +namespace detail { + +/** + * @brief Device functor returning the content of the slot indexed by `idx`. + * + * @tparam StorageRef Storage ref type + */ +template +struct get_slot { + StorageRef storage_; ///< Storage ref + + /** + * @brief Constructs `get_slot` functor with the given storage ref. + * + * @param s Input storage ref + */ + explicit constexpr get_slot(StorageRef s) noexcept : storage_{s} {} + + /** + * @brief Accesses the slot content with the given index. 
+ * + * @param idx The slot index + * @return The slot content + */ + __device__ constexpr auto operator()(typename StorageRef::size_type idx) const noexcept + { + auto const window_idx = idx / StorageRef::window_size; + auto const intra_idx = idx % StorageRef::window_size; + auto const [first, second] = storage_[window_idx][intra_idx]; + return thrust::make_tuple(first, second); + } +}; + +/** + * @brief Device functor returning whether the input slot indexed by `idx` is filled. + * + * @tparam T The slot key type + * @tparam U The slot value type + */ +template +struct slot_is_filled { + T empty_sentinel_; ///< The value of the empty key sentinel + + /** + * @brief Constructs `slot_is_filled` functor with the given empty sentinel. + * + * @param s Sentinel indicating empty slot + */ + explicit constexpr slot_is_filled(T const& s) noexcept : empty_sentinel_{s} {} + + /** + * @brief Indicates if the target slot `slot` is filled. + * + * @tparam U Slot content type + * + * @param slot The slot + * + * @return `true` if slot is filled + */ + template + __device__ constexpr bool operator()(Slot const& slot) const noexcept + { + return not cuco::detail::bitwise_compare(empty_sentinel_, thrust::get<0>(slot)); + } + + /** + * @brief Indicates if the target slot `slot` is filled. + * + * @param slot The slot + * + * @return `true` if slot is filled + */ + __device__ constexpr bool operator()(cuco::pair const& slot) const noexcept + { + return not cuco::detail::bitwise_compare(empty_sentinel_, slot.first); + } +}; + +} // namespace detail +} // namespace static_map_ns +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh new file mode 100644 index 000000000..a36095462 --- /dev/null +++ b/include/cuco/detail/static_map/kernels.cuh @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace static_map_ns { +namespace detail { + +/** + * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to + * `k` already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`. + * If the key does not exist, inserts the pair as if by insert. + * + * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element + * is inserted. 
+ * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIterator Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param ref Non-owning container device ref used to access the slot storage + */ +template +__global__ void insert_or_assign(InputIterator first, cuco::detail::index_type n, Ref ref) +{ + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + while (idx < n) { + typename Ref::value_type const insert_pair{*(first + idx)}; + if constexpr (CGSize == 1) { + ref.insert_or_assign(insert_pair); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + ref.insert_or_assign(tile, insert_pair); + } + idx += loop_stride; + } +} + +/** + * @brief Finds the equivalent map elements of all keys in the range `[first, first + n)`. + * + * @note If the key `*(first + i)` has a match in the container, copies the payload of its matched + * element to `(output_begin + i)`. Else, copies the empty value sentinel. Uses the CUDA Cooperative + * Groups API to leverage groups of multiple threads to find each key. This provides a significant + * boost in throughput compared to the non Cooperative Group `find` at moderate to high load + * factors. 
+ * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize The size of the thread block + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param n Number of keys to query + * @param output_begin Beginning of the sequence of matched payloads retrieved for each key + * @param ref Non-owning map device ref used to access the slot storage + */ +template +__global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) +{ + namespace cg = cooperative_groups; + + auto const block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + __shared__ typename Ref::mapped_type output_buffer[BlockSize / CGSize]; + + while (idx - thread_idx < n) { // the whole thread block falls into the same iteration + if (idx < n) { + auto const key = *(first + idx); + if constexpr (CGSize == 1) { + auto const found = ref.find(key); + /* + * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased + * sector stores from L2 to global memory. By writing results to shared memory and then + * synchronizing before writing back to global, we no longer rely on L1, preventing the + * increase in sector stores from L2 to global and improving performance. + */ + output_buffer[thread_idx] = + found == ref.end() ? ref.empty_value_sentinel() : (*found).second; + block.sync(); + *(output_begin + idx) = output_buffer[thread_idx]; + } else { + auto const tile = cg::tiled_partition(block); + auto const found = ref.find(tile, key); + + if (tile.thread_rank() == 0) { + *(output_begin + idx) = found == ref.end() ? 
ref.empty_value_sentinel() : (*found).second; + } + } + } + idx += loop_stride; + } +} + +} // namespace detail +} // namespace static_map_ns +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl new file mode 100644 index 000000000..d7274245e --- /dev/null +++ b/include/cuco/detail/static_map/static_map.inl @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuco { +namespace experimental { + +template +constexpr static_map:: + static_map(Extent capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + KeyEqual const& pred, + ProbingScheme const& probing_scheme, + Allocator const& alloc, + cuda_stream_ref stream) + : impl_{std::make_unique(capacity, + empty_key_sentinel, + cuco::pair{empty_key_sentinel, empty_value_sentinel}, + pred, + probing_scheme, + alloc, + stream)}, + empty_value_sentinel_{empty_value_sentinel} +{ +} + +template +void static_map::clear( + cuda_stream_ref stream) noexcept +{ + impl_->clear(stream); +} + +template +void static_map::clear_async( + cuda_stream_ref stream) noexcept +{ + impl_->clear_async(stream); +} + +template +template +static_map::size_type +static_map::insert( + InputIt first, InputIt last, cuda_stream_ref stream) +{ + return impl_->insert(first, last, ref(op::insert), stream); +} + +template +template +void static_map::insert_async( + InputIt first, InputIt last, cuda_stream_ref stream) noexcept +{ + impl_->insert_async(first, last, ref(op::insert), stream); +} + +template +template +static_map::size_type +static_map::insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) +{ + return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream); +} + +template +template +void static_map:: + insert_if_async( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) noexcept +{ + impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream); +} + +template +template +void static_map:: + insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) noexcept +{ + return this->insert_or_assign_async(first, last, stream); + stream.synchronize(); +} + +template +template +void static_map:: + insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref 
stream) noexcept +{ + auto const num = cuco::detail::distance(first, last); + if (num == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num, cg_size); + + static_map_ns::detail::insert_or_assign + <<>>( + first, num, ref(op::insert_or_assign)); +} + +template +template +void static_map::contains( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + contains_async(first, last, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_map::contains_async( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const noexcept +{ + impl_->contains_async(first, last, output_begin, ref(op::contains), stream); +} + +template +template +void static_map::contains_if( + InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream) const +{ + contains_if_async(first, last, stencil, pred, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_map:: + contains_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream) const noexcept +{ + impl_->contains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream); +} + +template +template +void static_map::find( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + find_async(first, last, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_map::find_async( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); + + static_map_ns::detail::find + <<>>( + first, num_keys, output_begin, ref(op::find)); +} + +template +template +std::pair +static_map::retrieve_all( + KeyOut keys_out, ValueOut 
values_out, cuda_stream_ref stream) const +{ + auto const begin = thrust::make_transform_iterator( + thrust::counting_iterator{0}, + static_map_ns::detail::get_slot(impl_->storage_ref())); + auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel()); + auto zipped_out_begin = thrust::make_zip_iterator(thrust::make_tuple(keys_out, values_out)); + auto const zipped_out_end = impl_->retrieve_all(begin, zipped_out_begin, is_filled, stream); + auto const num = std::distance(zipped_out_begin, zipped_out_end); + + return std::make_pair(keys_out + num, values_out + num); +} + +template +static_map::size_type +static_map::size( + cuda_stream_ref stream) const noexcept +{ + auto const is_filled = static_map_ns::detail::slot_is_filled(this->empty_key_sentinel()); + return impl_->size(is_filled, stream); +} + +template +constexpr auto +static_map::capacity() + const noexcept +{ + return impl_->capacity(); +} + +template +constexpr static_map::key_type +static_map::empty_key_sentinel() + const noexcept +{ + return impl_->empty_key_sentinel(); +} + +template +constexpr static_map:: + mapped_type + static_map:: + empty_value_sentinel() const noexcept +{ + return this->empty_value_sentinel_; +} + +template +template +auto static_map::ref( + Operators...) const noexcept +{ + static_assert(sizeof...(Operators), "No operators specified"); + return ref_type{cuco::empty_key(this->empty_key_sentinel()), + cuco::empty_value(this->empty_value_sentinel()), + impl_->key_eq(), + impl_->probing_scheme(), + impl_->storage_ref()}; +} +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl new file mode 100644 index 000000000..28b3ffaf2 --- /dev/null +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -0,0 +1,674 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include + +namespace cuco { +namespace experimental { + +template +__host__ __device__ constexpr static_map_ref< + Key, + T, + Scope, + KeyEqual, + ProbingScheme, + StorageRef, + Operators...>::static_map_ref(cuco::empty_key empty_key_sentinel, + cuco::empty_value empty_value_sentinel, + KeyEqual const& predicate, + ProbingScheme const& probing_scheme, + StorageRef storage_ref) noexcept + : impl_{cuco::pair{empty_key_sentinel, empty_value_sentinel}, probing_scheme, storage_ref}, + empty_value_sentinel_{empty_value_sentinel}, + predicate_{empty_key_sentinel, predicate} +{ +} + +template +template +__host__ __device__ constexpr static_map_ref:: + static_map_ref( + static_map_ref&& + other) noexcept + : impl_{std::move(other.impl_)}, + predicate_{std::move(other.predicate_)}, + empty_value_sentinel_{std::move(other.empty_value_sentinel_)} +{ +} + +template +__host__ __device__ constexpr auto +static_map_ref::capacity() + const noexcept +{ + return impl_.capacity(); +} + +template +__host__ __device__ constexpr Key +static_map_ref:: + empty_key_sentinel() const noexcept +{ + return predicate_.empty_sentinel_; +} + +template +__host__ __device__ constexpr T +static_map_ref:: + empty_value_sentinel() const noexcept +{ + return empty_value_sentinel_; +} + +template +template +auto static_map_ref::with( + NewOperators...) 
&& noexcept +{ + return static_map_ref( + std::move(*this)); +} + +template +struct static_map_ref:: + predicate_wrapper { + detail::equal_wrapper predicate_; + + /** + * @brief Map predicate wrapper ctor. + * + * @param sentinel Sentinel value + * @param equal Equality binary callable + */ + __host__ __device__ constexpr predicate_wrapper(key_type empty_key_sentinel, + key_equal const& equal) noexcept + : predicate_{empty_key_sentinel, equal} + { + } + + /** + * @brief Equality check with the given equality callable. + * + * @tparam U Right-hand side Element type + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * + * @return `EQUAL` if `lhs` and `rhs` are equivalent. `UNEQUAL` otherwise. + */ + template + __device__ constexpr detail::equal_result equal_to(value_type const& lhs, + U const& rhs) const noexcept + { + return predicate_.equal_to(lhs.first, rhs); + } + + /** + * @brief Equality check with the given equality callable. + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * + * @return `EQUAL` if `lhs` and `rhs` are equivalent. `UNEQUAL` otherwise. + */ + __device__ constexpr detail::equal_result equal_to(value_type const& lhs, + value_type const& rhs) const noexcept + { + return predicate_.equal_to(lhs.first, rhs.first); + } + + /** + * @brief Equality check with the given equality callable. + * + * @param lhs Left-hand side key to check equality + * @param rhs Right-hand side key to check equality + * + * @return `EQUAL` if `lhs` and `rhs` are equivalent. `UNEQUAL` otherwise. + */ + __device__ constexpr detail::equal_result equal_to(key_type const& lhs, + key_type const& rhs) const noexcept + { + return predicate_.equal_to(lhs, rhs); + } + + /** + * @brief Order-sensitive equality operator. + * + * @note Container keys MUST be always on the left-hand side. 
+ * + * @tparam U Right-hand side Element type + * + * @param lhs Left-hand side element to check equality + * @param rhs Right-hand side element to check equality + * + * @return Three way equality comparison result + */ + template + __device__ constexpr detail::equal_result operator()(value_type const& lhs, + U const& rhs) const noexcept + { + return predicate_(lhs.first, rhs); + } +}; + +namespace detail { + +template +class operator_impl< + op::insert_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Inserts an element. + * + * @param value The element to insert + * @return True if the given element is successfully inserted + */ + __device__ bool insert(value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = true; + return ref_.impl_.insert(value, ref_.predicate_); + } + + /** + * @brief Inserts an element. 
+ * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * @return True if the given element is successfully inserted + */ + __device__ bool insert(cooperative_groups::thread_block_tile const& group, + value_type const& value) noexcept + { + auto& ref_ = static_cast(*this); + auto constexpr has_payload = true; + return ref_.impl_.insert(group, value, ref_.predicate_); + } +}; + +template +class operator_impl< + op::insert_or_assign_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + static_assert(sizeof(T) == 4 or sizeof(T) == 8, + "sizeof(mapped_type) must be either 4 bytes or 8 bytes."); + + public: + /** + * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v` + * to the mapped_type corresponding to the key `k`. 
+ * + * @param value The element to insert + */ + __device__ void insert_or_assign(value_type const& value) noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + + ref_type& ref_ = static_cast(*this); + auto const key = value.first; + auto& probing_scheme = ref_.impl_.probing_scheme(); + auto storage_ref = ref_.impl_.storage_ref(); + auto probing_iter = probing_scheme(key, storage_ref.window_extent()); + + while (true) { + auto const window_slots = storage_ref[*probing_iter]; + + for (auto& slot_content : window_slots) { + auto const eq_res = ref_.predicate_(slot_content, key); + + // If the key is already in the container, update the payload and return + if (eq_res == detail::equal_result::EQUAL) { + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + ref_.impl_.atomic_store( + &((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second, + value.second); + return; + } + if (eq_res == detail::equal_result::EMPTY) { + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + if (attempt_insert_or_assign( + (storage_ref.data() + *probing_iter)->data() + intra_window_index, value)) { + return; + } + } + } + ++probing_iter; + } + } + + /** + * @brief Inserts an element. + * + * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v` + * to the mapped_type corresponding to the key `k`. 
+ * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + */ + __device__ void insert_or_assign(cooperative_groups::thread_block_tile const& group, + value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + + auto const key = value.first; + auto& probing_scheme = ref_.impl_.probing_scheme(); + auto storage_ref = ref_.impl_.storage_ref(); + auto probing_iter = probing_scheme(group, key, storage_ref.window_extent()); + + while (true) { + auto const window_slots = storage_ref[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (ref_.predicate_(window_slots[i], key)) { + case detail::equal_result::EMPTY: + return detail::window_probing_results{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: + return detail::window_probing_results{detail::equal_result::EQUAL, i}; + default: continue; + } + } + // returns dummy index `-1` for UNEQUAL + return detail::window_probing_results{detail::equal_result::UNEQUAL, -1}; + }(); + + auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); + if (group_contains_equal) { + auto const src_lane = __ffs(group_contains_equal) - 1; + if (group.thread_rank() == src_lane) { + ref_.impl_.atomic_store( + &((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second, + value.second); + } + group.sync(); + return; + } + + auto const group_contains_empty = group.ballot(state == detail::equal_result::EMPTY); + if (group_contains_empty) { + auto const src_lane = __ffs(group_contains_empty) - 1; + auto const status = + (group.thread_rank() == src_lane) + ? 
attempt_insert_or_assign( + (storage_ref.data() + *probing_iter)->data() + intra_window_index, value) + : false; + + // Exit if inserted or assigned + if (group.shfl(status, src_lane)) { return; } + } else { + ++probing_iter; + } + } + } + + private: + /** + * @brief Attempts to insert an element into a slot or update the matching payload with the given + * element + * + * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v` + * to the mapped_type corresponding to the key `k`. + * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * + * @return Returns `true` if the given `value` is inserted or `value` has a match in the map. + */ + __device__ constexpr bool attempt_insert_or_assign(value_type* slot, + value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto const expected_key = ref_.impl_.empty_slot_sentinel().first; + + auto old_key = ref_.impl_.compare_and_swap(&slot->first, expected_key, value.first); + auto* old_key_ptr = reinterpret_cast(&old_key); + + // if key success or key was already present in the map + if (cuco::detail::bitwise_compare(*old_key_ptr, expected_key) or + (ref_.predicate_.equal_to(*old_key_ptr, value.first) == detail::equal_result::EQUAL)) { + // Update payload + ref_.impl_.atomic_store(&slot->second, value.second); + return true; + } + return false; + } +}; + +template +class operator_impl< + op::insert_and_find_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Returns a const_iterator to one past the last slot. 
+ * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr const_iterator end() const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Inserts the given element into the map. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. + * + * @param value The element to insert + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. + */ + __device__ thrust::pair insert_and_find(value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = true; + return ref_.impl_.insert_and_find(value, ref_.predicate_); + } + + /** + * @brief Inserts the given element into the map. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. + * + * @param group The Cooperative Group used to perform group insert_and_find + * @param value The element to insert + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. 
+ */ + __device__ thrust::pair insert_and_find( + cooperative_groups::thread_block_tile const& group, value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = true; + return ref_.impl_.insert_and_find(group, value, ref_.predicate_); + } +}; + +template +class operator_impl< + op::contains_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns + * true. Otherwise, returns false. + * + * @tparam ProbeKey Probe key type + * + * @param key The key to search for + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains(ProbeKey const& key) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + return ref_.impl_.contains(key, ref_.predicate_); + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns + * true. Otherwise, returns false. 
+ * + * @tparam ProbeKey Probe key type + * + * @param group The Cooperative Group used to perform group contains + * @param key The key to search for + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains( + cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.contains(group, key, ref_.predicate_); + } +}; + +template +class operator_impl< + op::find_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Returns a const_iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr const_iterator end() const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Finds an element in the map with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. 
+ * + * @tparam ProbeKey Probe key type + * + * @param key The key to search for + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator find(ProbeKey const& key) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + return ref_.impl_.find(key, ref_.predicate_); + } + + /** + * @brief Finds an element in the map with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. + * + * @tparam ProbeKey Probe key type + * + * @param group The Cooperative Group used to perform this operation + * @param key The key to search for + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator find( + cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.find(group, key, ref_.predicate_); + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index 7a3ca0dfa..73c22997a 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -36,6 +36,7 @@ namespace cg = cooperative_groups; * @tparam Key key type * @tparam Value value type * @tparam pair_atomic_type key/value pair type + * * @param slots Pointer to flat storage for the map's key/value pairs * @param k Key to which all keys in `slots` are initialized * @param v Value to which all values in `slots` are initialized @@ -47,13 +48,14 @@ template -__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::size_t size) +__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t 
size) { - auto tid = block_size * blockIdx.x + threadIdx.x; - while (tid < size) { - new (&slots[tid].first) atomic_key_type{k}; - new (&slots[tid].second) atomic_mapped_type{v}; - tid += gridDim.x * block_size; + int64_t const loop_stride = gridDim.x * block_size; + int64_t idx = block_size * blockIdx.x + threadIdx.x; + while (idx < size) { + new (&slots[idx].first) atomic_key_type{k}; + new (&slots[idx].second) atomic_mapped_type{v}; + idx += loop_stride; } } @@ -70,8 +72,9 @@ __global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::s * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs + * @param n Number of the key/value pairs to insert * @param num_successes The number of successfully inserted key/value pairs * @param view Mutable device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -84,19 +87,19 @@ template __global__ void insert( - InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid; + int64_t const loop_stride = gridDim.x * block_size; + int64_t idx = block_size * blockIdx.x + threadIdx.x; - while (it < last) { - typename viewT::value_type const insert_pair{*it}; + while (idx < n) { + typename viewT::value_type const insert_pair{*(first + idx)}; if (view.insert(insert_pair, hash, key_equal)) { thread_num_successes++; } - it += gridDim.x * block_size; + idx += loop_stride; } // compute number of successfully inserted 
elements for each block @@ -123,8 +126,9 @@ __global__ void insert( * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs + * @param n Number of the key/value pairs to insert * @param num_successes The number of successfully inserted key/value pairs * @param view Mutable device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -138,23 +142,23 @@ template __global__ void insert( - InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - while (it < last) { + while (idx < n) { // force conversion to value_type - typename viewT::value_type const insert_pair{*it}; + typename viewT::value_type const insert_pair{*(first + idx)}; if (view.insert(tile, insert_pair, hash, key_equal) && tile.thread_rank() == 0) { thread_num_successes++; } - it += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -163,6 +167,28 @@ __global__ void insert( if (threadIdx.x == 0) { *num_successes += block_num_successes; } } +/** + * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. 
+ * + * If the key `*(first + i)` exists in the map, its slot is erased and made available for future + * insertions. + * Else, no effect. + * + * @tparam block_size The size of the thread block + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam atomicT Type of atomic storage + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param num_successes The number of successfully erased key/value pairs + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ template __global__ void erase( - InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) { using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid; + const int64_t loop_stride = gridDim.x * block_size; + int64_t idx = block_size * blockIdx.x + threadIdx.x; - while (it < last) { - if (view.erase(*it, hash, key_equal)) { thread_num_successes++; } - it += gridDim.x * block_size; + while (idx < n) { + if (view.erase(*(first + idx), hash, key_equal)) { thread_num_successes++; } + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -192,6 +218,29 @@ __global__ void erase( } } +/** + * @brief Erases the key/value pairs corresponding to all keys in the range `[first, last)`. 
+ * + * If the key `*(first + i)` exists in the map, its slot is erased and made available for future + * insertions. + * Else, no effect. + * + * @tparam block_size The size of the thread block + * @tparam tile_size The number of threads in the Cooperative Groups used to perform erase + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `key_type` + * @tparam atomicT Type of atomic storage + * @tparam viewT Type of device view allowing access of hash map storage + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param num_successes The number of successfully erased key/value pairs + * @param view Device view used to access the hash map's slot storage + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + */ template __global__ void erase( - InputIt first, InputIt last, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - while (it < last) { - if (view.erase(tile, *it, hash, key_equal) and tile.thread_rank() == 0) { + while (idx < n) { + if (view.erase(tile, *(first + idx), hash, key_equal) and tile.thread_rank() == 0) { thread_num_successes++; } - it += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } // 
compute number of successfully inserted elements for each block @@ -244,6 +293,7 @@ __global__ void erase( * and argument type is convertible from `std::iterator_traits::value_type` * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of key/value pairs * @param n Number of elements to insert * @param num_successes The number of successfully inserted key/value pairs @@ -263,7 +313,7 @@ template __global__ void insert_if_n(InputIt first, - std::size_t n, + int64_t n, atomicT* num_successes, viewT view, StencilIt stencil, @@ -275,18 +325,18 @@ __global__ void insert_if_n(InputIt first, __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto i = tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - while (i < n) { - if (pred(*(stencil + i))) { - typename viewT::value_type const insert_pair{*(first + i)}; + while (idx < n) { + if (pred(*(stencil + idx))) { + typename viewT::value_type const insert_pair{*(first + idx)}; if (view.insert(tile, insert_pair, hash, key_equal) and tile.thread_rank() == 0) { thread_num_successes++; } } - i += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -311,8 +361,9 @@ __global__ void insert_if_n(InputIt first, * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of keys to query * @param output_begin Beginning of the sequence of values retrieved for each key * @param 
view Device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -326,14 +377,14 @@ template __global__ void find( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) { - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid; + int64_t const loop_stride = gridDim.x * block_size; + int64_t idx = block_size * blockIdx.x + threadIdx.x; __shared__ Value writeBuffer[block_size]; - while (first + key_idx < last) { - auto key = *(first + key_idx); + while (idx < n) { + auto key = *(first + idx); auto found = view.find(key, hash, key_equal); /* @@ -347,8 +398,8 @@ __global__ void find( ? view.get_empty_value_sentinel() : found->second.load(cuda::std::memory_order_relaxed); __syncthreads(); - *(output_begin + key_idx) = writeBuffer[threadIdx.x]; - key_idx += gridDim.x * block_size; + *(output_begin + idx) = writeBuffer[threadIdx.x]; + idx += loop_stride; } } @@ -371,8 +422,9 @@ __global__ void find( * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of keys to query * @param output_begin Beginning of the sequence of values retrieved for each key * @param view Device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -387,15 +439,15 @@ template __global__ void find( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; 
- __shared__ Value writeBuffer[block_size]; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + __shared__ Value writeBuffer[block_size / tile_size]; - while (first + key_idx < last) { - auto key = *(first + key_idx); + while (idx < n) { + auto key = *(first + idx); auto found = view.find(tile, key, hash, key_equal); /* @@ -411,10 +463,8 @@ __global__ void find( : found->second.load(cuda::std::memory_order_relaxed); } __syncthreads(); - if (tile.thread_rank() == 0) { - *(output_begin + key_idx) = writeBuffer[threadIdx.x / tile_size]; - } - key_idx += (gridDim.x * block_size) / tile_size; + if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; } + idx += loop_stride; } } @@ -431,8 +481,9 @@ __global__ void find( * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of keys to query * @param output_begin Beginning of the sequence of booleans for the presence of each key * @param view Device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -445,14 +496,14 @@ template __global__ void contains( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) { - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid; + int64_t const loop_stride = gridDim.x * block_size; + int64_t idx = block_size * blockIdx.x + threadIdx.x; __shared__ bool writeBuffer[block_size]; - while (first + key_idx < last) { - auto key = *(first + key_idx); + while (idx < n) { + auto key = *(first + idx); /* * 
The ld.relaxed.gpu instruction used in view.find causes L1 to @@ -463,8 +514,8 @@ __global__ void contains( */ writeBuffer[threadIdx.x] = view.contains(key, hash, key_equal); __syncthreads(); - *(output_begin + key_idx) = writeBuffer[threadIdx.x]; - key_idx += gridDim.x * block_size; + *(output_begin + idx) = writeBuffer[threadIdx.x]; + idx += loop_stride; } } @@ -486,8 +537,9 @@ __global__ void contains( * @tparam viewT Type of device view allowing access of hash map storage * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of keys to query * @param output_begin Beginning of the sequence of booleans for the presence of each key * @param view Device view used to access the hash map's slot storage * @param hash The unary function to apply to hash each key @@ -501,15 +553,15 @@ template __global__ void contains( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) + InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; - __shared__ bool writeBuffer[block_size]; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + __shared__ bool writeBuffer[block_size / tile_size]; - while (first + key_idx < last) { - auto key = *(first + key_idx); + while (idx < n) { + auto key = *(first + idx); auto found = view.contains(tile, key, hash, key_equal); /* @@ -521,10 +573,8 @@ __global__ void contains( */ if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } __syncthreads(); - if (tile.thread_rank() == 0) { - *(output_begin + key_idx) = 
writeBuffer[threadIdx.x / tile_size]; - } - key_idx += (gridDim.x * block_size) / tile_size; + if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; } + idx += loop_stride; } } diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index c6612a7c8..98c08e720 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -22,6 +22,7 @@ #include #include +#include namespace cuco { template ::device_view_ } offset = g.shfl(offset, 0); - if constexpr (thrust::is_contiguous_iterator_v) { #if defined(CUCO_HAS_CG_MEMCPY_ASYNC) + constexpr bool uses_memcpy_async = thrust::is_contiguous_iterator_v; +#else + constexpr bool uses_memcpy_async = false; +#endif // end CUCO_HAS_CG_MEMCPY_ASYNC + + if constexpr (uses_memcpy_async) { #if defined(CUCO_HAS_CUDA_BARRIER) cooperative_groups::memcpy_async( g, - output_begin + offset, + &thrust::raw_reference_cast(*(output_begin + offset)), output_buffer, cuda::aligned_size_t(sizeof(value_type) * num_outputs)); #else - cooperative_groups::memcpy_async( - g, output_begin + offset, output_buffer, sizeof(value_type) * num_outputs); + cooperative_groups::memcpy_async(g, + &thrust::raw_reference_cast(*(output_begin + offset)), + output_buffer, + sizeof(value_type) * num_outputs); #endif // end CUCO_HAS_CUDA_BARRIER - return; -#endif // end CUCO_HAS_CG_MEMCPY_ASYNC } - for (auto index = lane_id; index < num_outputs; index += g.size()) { - *(output_begin + offset + index) = output_buffer[index]; + + if constexpr (not uses_memcpy_async) { + for (auto index = lane_id; index < num_outputs; index += g.size()) { + *(output_begin + offset + index) = output_buffer[index]; + } } } @@ -991,8 +1000,12 @@ class static_multimap::device_view_ if (*flushing_cg_counter + flushing_cg.size() * vector_width() > buffer_size) { flush_output_buffer( flushing_cg, 
*flushing_cg_counter, output_buffer, num_matches, output_begin); + // Everyone in the group reads the counter when flushing, so + // sync before writing. + flushing_cg.sync(); // First lane reset warp-level counter if (flushing_cg.thread_rank() == 0) { *flushing_cg_counter = 0; } + flushing_cg.sync(); } current_slot = next_slot(current_slot); @@ -1083,8 +1096,12 @@ class static_multimap::device_view_ // Flush if the next iteration won't fit into buffer if ((*cg_counter + g.size()) > buffer_size) { flush_output_buffer(g, *cg_counter, output_buffer, num_matches, output_begin); + // Everyone in the group reads the counter when flushing, so + // sync before writing. + g.sync(); // First lane reset CG-level counter if (lane_id == 0) { *cg_counter = 0; } + g.sync(); } current_slot = next_slot(current_slot); } // while running @@ -1419,8 +1436,12 @@ class static_multimap::device_view_ num_matches, probe_output_begin, contained_output_begin); + // Everyone in the group reads the counter when flushing, so + // sync before writing. + flushing_cg.sync(); // First lane reset warp-level counter if (flushing_cg.thread_rank() == 0) { *flushing_cg_counter = 0; } + flushing_cg.sync(); } current_slot = next_slot(current_slot); @@ -1530,8 +1551,12 @@ class static_multimap::device_view_ num_matches, probe_output_begin, contained_output_begin); + // Everyone in the group reads the counter when flushing, so + // sync before writing. + g.sync(); // First lane reset CG-level counter if (lane_id == 0) { *cg_counter = 0; } + g.sync(); } current_slot = next_slot(current_slot); } // while running diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh index f3820bf64..67fb36045 100644 --- a/include/cuco/detail/static_multimap/kernels.cuh +++ b/include/cuco/detail/static_multimap/kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include @@ -23,8 +23,6 @@ #include -#include - #include namespace cuco { @@ -42,6 +40,7 @@ namespace cg = cooperative_groups; * @tparam Key key type * @tparam Value value type * @tparam pair_atomic_type key/value pair type + * * @param slots Pointer to flat storage for the map's key/value pairs * @param k Key to which all keys in `slots` are initialized * @param v Value to which all values in `slots` are initialized @@ -52,13 +51,14 @@ template -__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::size_t size) +__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size) { - auto tid = threadIdx.x + blockIdx.x * blockDim.x; - while (tid < size) { - new (&slots[tid].first) atomic_key_type{k}; - new (&slots[tid].second) atomic_mapped_type{v}; - tid += gridDim.x * blockDim.x; + int64_t const loop_stride = gridDim.x * blockDim.x; + int64_t idx = threadIdx.x + blockIdx.x * blockDim.x; + while (idx < size) { + new (&slots[idx].first) atomic_key_type{k}; + new (&slots[idx].second) atomic_mapped_type{v}; + idx += loop_stride; } } @@ -78,21 +78,21 @@ __global__ void initialize(pair_atomic_type* const slots, Key k, Value v, std::s * @tparam viewT Type of device view allowing access of hash map storage * * @param first Beginning of the sequence of key/value pairs - * @param last End of the sequence of key/value pairs + * @param n Number of key/value pairs to insert * @param view Mutable device view used to access the hash map's slot storage */ template -__global__ void insert(InputIt first, InputIt last, viewT view) +__global__ void insert(InputIt first, int64_t n, viewT view) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto it = first + tid / tile_size; + auto tile 
= cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - while (it < last) { + while (idx < n) { // force conversion to value_type - typename viewT::value_type const insert_pair{*it}; + typename viewT::value_type const insert_pair{*(first + idx)}; view.insert(tile, insert_pair); - it += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } } @@ -117,6 +117,7 @@ __global__ void insert(InputIt first, InputIt last, viewT view) * @tparam viewT Type of device view allowing access of hash map storage * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and * argument type is convertible from `std::iterator_traits::value_type`. + * * @param first Beginning of the sequence of key/value pairs * @param s Beginning of the stencil sequence * @param n Number of elements to insert @@ -129,19 +130,19 @@ template -__global__ void insert_if_n(InputIt first, StencilIt s, std::size_t n, viewT view, Predicate pred) +__global__ void insert_if_n(InputIt first, StencilIt s, int64_t n, viewT view, Predicate pred) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto const tid = block_size * blockIdx.x + threadIdx.x; - auto i = tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; - while (i < n) { - if (pred(*(s + i))) { - typename viewT::value_type const insert_pair{*(first + i)}; + while (idx < n) { + if (pred(*(s + idx))) { + typename viewT::value_type const insert_pair{*(first + idx)}; // force conversion to value_type view.insert(tile, insert_pair); } - i += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } } @@ -164,7 +165,7 @@ __global__ void insert_if_n(InputIt first, StencilIt s, std::size_t n, viewT vie * @tparam Equal 
Binary callable type * * @param first Beginning of the sequence of elements - * @param last End of the sequence of elements + * @param n Number of elements to query * @param output_begin Beginning of the sequence of booleans for the presence of each element * @param view Device view used to access the hash map's slot storage * @param equal The binary function to compare input element and slot content for equality @@ -176,15 +177,14 @@ template -__global__ void contains( - InputIt first, InputIt last, OutputIt output_begin, viewT view, Equal equal) +__global__ void contains(InputIt first, int64_t n, OutputIt output_begin, viewT view, Equal equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto idx = tid / tile_size; - __shared__ bool writeBuffer[block_size]; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; + __shared__ bool writeBuffer[block_size / tile_size]; - while (first + idx < last) { + while (idx < n) { typename std::iterator_traits::value_type element = *(first + idx); auto found = [&]() { if constexpr (is_pair_contains) { return view.pair_contains(tile, element, equal); } @@ -201,7 +201,7 @@ __global__ void contains( if (tile.thread_rank() == 0) { writeBuffer[threadIdx.x / tile_size] = found; } __syncthreads(); if (tile.thread_rank() == 0) { *(output_begin + idx) = writeBuffer[threadIdx.x / tile_size]; } - idx += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } } @@ -221,8 +221,9 @@ __global__ void contains( * @tparam atomicT Type of atomic storage * @tparam viewT Type of device view allowing access of hash map storage * @tparam KeyEqual Binary callable + * * @param first Beginning of the sequence of keys to count - * @param last End of the sequence of keys to count + * @param n Number of the keys to query * @param num_matches 
The number of all the matches for a sequence of keys * @param view Device view used to access the hash map's slot storage * @param key_equal Binary function to compare two keys for equality @@ -235,24 +236,24 @@ template __global__ void count( - InputIt first, InputIt last, atomicT* num_matches, viewT view, KeyEqual key_equal) + InputIt first, int64_t n, atomicT* num_matches, viewT view, KeyEqual key_equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_matches = 0; - while (first + key_idx < last) { - auto key = *(first + key_idx); + while (idx < n) { + auto key = *(first + idx); if constexpr (is_outer) { thread_num_matches += view.count_outer(tile, key, key_equal); } else { thread_num_matches += view.count(tile, key, key_equal); } - key_idx += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -279,8 +280,9 @@ __global__ void count( * @tparam atomicT Type of atomic storage * @tparam viewT Type of device view allowing access of hash map storage * @tparam PairEqual Binary callable + * * @param first Beginning of the sequence of pairs to count - * @param last End of the sequence of pairs to count + * @param n Number of the pairs to query * @param num_matches The number of all the matches for a sequence of pairs * @param view Device view used to access the hash map's slot storage * @param pair_equal Binary function to compare two pairs for equality @@ -293,24 +295,24 @@ template __global__ void pair_count( - InputIt first, InputIt last, atomicT* num_matches, viewT view, PairEqual 
pair_equal) + InputIt first, int64_t n, atomicT* num_matches, viewT view, PairEqual pair_equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto pair_idx = tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / tile_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_matches = 0; - while (first + pair_idx < last) { - typename viewT::value_type const pair = *(first + pair_idx); + while (idx < n) { + typename viewT::value_type const pair = *(first + idx); if constexpr (is_outer) { thread_num_matches += view.pair_count_outer(tile, pair, pair_equal); } else { thread_num_matches += view.pair_count(tile, pair, pair_equal); } - pair_idx += (gridDim.x * block_size) / tile_size; + idx += loop_stride; } // compute number of successfully inserted elements for each block @@ -343,8 +345,9 @@ __global__ void pair_count( * @tparam atomicT Type of atomic storage * @tparam viewT Type of device view allowing access of hash map storage * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of the keys to query * @param output_begin Beginning of the sequence of values retrieved for each key * @param num_matches Size of the output sequence * @param view Device view used to access the hash map's slot storage @@ -361,7 +364,7 @@ template __global__ void retrieve(InputIt first, - InputIt last, + int64_t n, OutputIt output_begin, atomicT* num_matches, viewT view, @@ -372,10 +375,10 @@ __global__ void retrieve(InputIt first, constexpr uint32_t num_flushing_cgs = block_size / flushing_cg_size; const uint32_t flushing_cg_id = threadIdx.x / flushing_cg_size; - auto flushing_cg = 
cg::tiled_partition(cg::this_thread_block()); - auto probing_cg = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto key_idx = tid / probing_cg_size; + auto flushing_cg = cg::tiled_partition(cg::this_thread_block()); + auto probing_cg = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / probing_cg_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / probing_cg_size; __shared__ pair_type output_buffer[num_flushing_cgs][buffer_size]; // TODO: replace this with shared memory cuda::atomic variables once the dynamiic initialization @@ -384,12 +387,14 @@ __global__ void retrieve(InputIt first, if (flushing_cg.thread_rank() == 0) { flushing_cg_counter[flushing_cg_id] = 0; } - while (flushing_cg.any(first + key_idx < last)) { - bool active_flag = first + key_idx < last; + flushing_cg.sync(); + + while (flushing_cg.any(idx < n)) { + bool active_flag = idx < n; auto active_flushing_cg = cg::binary_partition(flushing_cg, active_flag); if (active_flag) { - auto key = *(first + key_idx); + auto key = *(first + idx); if constexpr (is_outer) { view.retrieve_outer(active_flushing_cg, probing_cg, @@ -410,9 +415,10 @@ __global__ void retrieve(InputIt first, key_equal); } } - key_idx += (gridDim.x * block_size) / probing_cg_size; + idx += loop_stride; } + flushing_cg.sync(); // Final flush of output buffer if (flushing_cg_counter[flushing_cg_id] > 0) { view.flush_output_buffer(flushing_cg, @@ -450,8 +456,9 @@ __global__ void retrieve(InputIt first, * @tparam atomicT Type of atomic storage * @tparam viewT Type of device view allowing access of hash map storage * @tparam PairEqual Binary callable type + * * @param first Beginning of the sequence of keys - * @param last End of the sequence of keys + * @param n Number of keys to query * @param probe_output_begin Beginning of the sequence of the matched probe pairs * @param contained_output_begin Beginning of the sequence of 
the matched contained pairs * @param num_matches Size of the output sequence @@ -470,7 +477,7 @@ template __global__ void pair_retrieve(InputIt first, - InputIt last, + int64_t n, OutputIt1 probe_output_begin, OutputIt2 contained_output_begin, atomicT* num_matches, @@ -482,10 +489,10 @@ __global__ void pair_retrieve(InputIt first, constexpr uint32_t num_flushing_cgs = block_size / flushing_cg_size; const uint32_t flushing_cg_id = threadIdx.x / flushing_cg_size; - auto flushing_cg = cg::tiled_partition(cg::this_thread_block()); - auto probing_cg = cg::tiled_partition(cg::this_thread_block()); - auto tid = block_size * blockIdx.x + threadIdx.x; - auto pair_idx = tid / probing_cg_size; + auto flushing_cg = cg::tiled_partition(cg::this_thread_block()); + auto probing_cg = cg::tiled_partition(cg::this_thread_block()); + int64_t const loop_stride = gridDim.x * block_size / probing_cg_size; + int64_t idx = (block_size * blockIdx.x + threadIdx.x) / probing_cg_size; __shared__ pair_type probe_output_buffer[num_flushing_cgs][buffer_size]; __shared__ pair_type contained_output_buffer[num_flushing_cgs][buffer_size]; @@ -495,12 +502,14 @@ __global__ void pair_retrieve(InputIt first, if (flushing_cg.thread_rank() == 0) { flushing_cg_counter[flushing_cg_id] = 0; } - while (flushing_cg.any(first + pair_idx < last)) { - bool active_flag = first + pair_idx < last; + flushing_cg.sync(); + + while (flushing_cg.any(idx < n)) { + bool active_flag = idx < n; auto active_flushing_cg = cg::binary_partition(flushing_cg, active_flag); if (active_flag) { - pair_type pair = *(first + pair_idx); + pair_type pair = *(first + idx); if constexpr (is_outer) { view.pair_retrieve_outer(active_flushing_cg, probing_cg, @@ -525,9 +534,10 @@ __global__ void pair_retrieve(InputIt first, pair_equal); } } - pair_idx += (gridDim.x * block_size) / probing_cg_size; + idx += loop_stride; } + flushing_cg.sync(); // Final flush of output buffer if (flushing_cg_counter[flushing_cg_id] > 0) { 
view.flush_output_buffer(flushing_cg, diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index ddec2e4a2..4e9570bce 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,8 @@ * limitations under the License. */ +#include #include -#include #include #include @@ -33,8 +33,8 @@ template static_multimap::static_multimap( std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, cudaStream_t stream, Allocator const& alloc) : capacity_{cuco::detail::get_valid_capacity( @@ -66,7 +66,7 @@ void static_multimap::insert(InputI InputIt last, cudaStream_t stream) { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto constexpr block_size = 128; @@ -75,7 +75,7 @@ void static_multimap::insert(InputI auto view = get_device_mutable_view(); detail::insert - <<>>(first, first + num_keys, view); + <<>>(first, num_keys, view); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); } @@ -88,7 +88,7 @@ template void static_multimap::insert_if( InputIt first, InputIt last, StencilIt stencil, Predicate pred, cudaStream_t stream) { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto constexpr block_size = 128; @@ -110,7 +110,7 @@ template void static_multimap::contains( InputIt first, InputIt last, OutputIt output_begin, KeyEqual key_equal, cudaStream_t stream) const { - auto const 
num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return; } auto constexpr is_pair_contains = false; @@ -120,7 +120,7 @@ void static_multimap::contains( auto view = get_device_view(); detail::contains - <<>>(first, last, output_begin, view, key_equal); + <<>>(first, num_keys, output_begin, view, key_equal); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); } @@ -134,7 +134,7 @@ void static_multimap::pair_contains InputIt first, InputIt last, OutputIt output_begin, PairEqual pair_equal, cudaStream_t stream) const { - auto const num_pairs = std::distance(first, last); + auto const num_pairs = cuco::detail::distance(first, last); if (num_pairs == 0) { return; } auto constexpr is_pair_contains = true; @@ -144,7 +144,7 @@ void static_multimap::pair_contains auto view = get_device_view(); detail::contains - <<>>(first, last, output_begin, view, pair_equal); + <<>>(first, num_pairs, output_begin, view, pair_equal); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); } @@ -157,7 +157,7 @@ template std::size_t static_multimap::count( InputIt first, InputIt last, cudaStream_t stream, KeyEqual key_equal) const { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return 0; } auto constexpr is_outer = false; @@ -167,11 +167,11 @@ std::size_t static_multimap::count( auto view = get_device_view(); auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::count - <<>>(first, last, d_counter_.get(), view, key_equal); + <<>>(first, num_keys, d_counter_.get(), view, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); 
CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); @@ -188,7 +188,7 @@ template std::size_t static_multimap::count_outer( InputIt first, InputIt last, cudaStream_t stream, KeyEqual key_equal) const { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return 0; } auto constexpr is_outer = true; @@ -198,11 +198,11 @@ std::size_t static_multimap::count_ auto view = get_device_view(); auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::count - <<>>(first, last, d_counter_.get(), view, key_equal); + <<>>(first, num_keys, d_counter_.get(), view, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); @@ -219,21 +219,21 @@ template std::size_t static_multimap::pair_count( InputIt first, InputIt last, PairEqual pair_equal, cudaStream_t stream) const { - auto const num_keys = std::distance(first, last); - if (num_keys == 0) { return 0; } + auto const num_pairs = cuco::detail::distance(first, last); + if (num_pairs == 0) { return 0; } auto constexpr is_outer = false; auto constexpr block_size = 128; auto constexpr stride = 1; auto view = get_device_view(); - auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); + auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::pair_count - <<>>(first, last, d_counter_.get(), view, pair_equal); + <<>>(first, num_pairs, 
d_counter_.get(), view, pair_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); @@ -250,21 +250,21 @@ template std::size_t static_multimap::pair_count_outer( InputIt first, InputIt last, PairEqual pair_equal, cudaStream_t stream) const { - auto const num_keys = std::distance(first, last); - if (num_keys == 0) { return 0; } + auto const num_pairs = cuco::detail::distance(first, last); + if (num_pairs == 0) { return 0; } auto constexpr is_outer = true; auto constexpr block_size = 128; auto constexpr stride = 1; auto view = get_device_view(); - auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size); + auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::pair_count - <<>>(first, last, d_counter_.get(), view, pair_equal); + <<>>(first, num_pairs, d_counter_.get(), view, pair_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); @@ -281,12 +281,11 @@ template OutputIt static_multimap::retrieve( InputIt first, InputIt last, OutputIt output_begin, cudaStream_t stream, KeyEqual key_equal) const { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return output_begin; } // Using per-warp buffer for vector loads and per-CG buffer for scalar loads constexpr auto buffer_size = uses_vector_load() ? 
(warp_size() * 3u) : (cg_size() * 3u); - constexpr auto block_size = 128; constexpr auto is_outer = false; auto view = get_device_view(); @@ -295,24 +294,14 @@ OutputIt static_multimap::retrieve( return cg_size(); }(); - auto const grid_size = detail::get_grid_size(detail::retrieve, - block_size); - - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + auto const grid_size = detail::grid_size(num_keys, cg_size()); + + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; - detail::retrieve - <<>>( - first, last, output_begin, d_counter_.get(), view, key_equal); + detail::retrieve + <<>>( + first, num_keys, output_begin, d_counter_.get(), view, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); @@ -331,12 +320,11 @@ template OutputIt static_multimap::retrieve_outer( InputIt first, InputIt last, OutputIt output_begin, cudaStream_t stream, KeyEqual key_equal) const { - auto const num_keys = std::distance(first, last); + auto const num_keys = cuco::detail::distance(first, last); if (num_keys == 0) { return output_begin; } // Using per-warp buffer for vector loads and per-CG buffer for scalar loads constexpr auto buffer_size = uses_vector_load() ? 
(warp_size() * 3u) : (cg_size() * 3u); - constexpr auto block_size = 128; constexpr auto is_outer = true; auto view = get_device_view(); @@ -345,24 +333,14 @@ OutputIt static_multimap::retrieve_ return cg_size(); }(); - auto const grid_size = detail::get_grid_size(detail::retrieve, - block_size); - - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + auto const grid_size = detail::grid_size(num_keys, cg_size()); + + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; - detail::retrieve - <<>>( - first, last, output_begin, d_counter_.get(), view, key_equal); + detail::retrieve + <<>>( + first, num_keys, output_begin, d_counter_.get(), view, key_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); @@ -387,7 +365,7 @@ static_multimap::pair_retrieve( PairEqual pair_equal, cudaStream_t stream) const { - auto const num_pairs = std::distance(first, last); + auto const num_pairs = cuco::detail::distance(first, last); if (num_pairs == 0) { return std::make_pair(probe_output_begin, contained_output_begin); } // Using per-warp buffer for vector loads and per-CG buffer for scalar loads @@ -403,12 +381,17 @@ static_multimap::pair_retrieve( }(); auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::pair_retrieve - <<>>( - first, last, probe_output_begin, contained_output_begin, d_counter_.get(), view, pair_equal); + <<>>(first, + num_pairs, + probe_output_begin, + contained_output_begin, + d_counter_.get(), + view, + pair_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); @@ -432,7 +415,7 @@ 
static_multimap::pair_retrieve_oute PairEqual pair_equal, cudaStream_t stream) const { - auto const num_pairs = std::distance(first, last); + auto const num_pairs = cuco::detail::distance(first, last); if (num_pairs == 0) { return std::make_pair(probe_output_begin, contained_output_begin); } // Using per-warp buffer for vector loads and per-CG buffer for scalar loads @@ -448,12 +431,17 @@ static_multimap::pair_retrieve_oute }(); auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size); - cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream); + CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream)); std::size_t h_counter; detail::pair_retrieve - <<>>( - first, last, probe_output_begin, contained_output_begin, d_counter_.get(), view, pair_equal); + <<>>(first, + num_pairs, + probe_output_begin, + contained_output_begin, + d_counter_.get(), + view, + pair_equal); CUCO_CUDA_TRY(cudaMemcpyAsync( &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream)); diff --git a/include/cuco/detail/static_set/functors.cuh b/include/cuco/detail/static_set/functors.cuh new file mode 100644 index 000000000..3ee7be4be --- /dev/null +++ b/include/cuco/detail/static_set/functors.cuh @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace static_set_ns { +namespace detail { + +/** + * @brief Device functor returning whether the input slot indexed by `idx` is filled. + * + * @tparam T The slot content type + */ +template +struct slot_is_filled { + T empty_sentinel_; ///< The value of the empty key sentinel + + /** + * @brief Constructs `slot_is_filled` functor with the given empty sentinel. + * + * @param s Sentinel indicating empty slot + */ + explicit constexpr slot_is_filled(T const& s) noexcept : empty_sentinel_{s} {} + + /** + * @brief Indicates if the target slot `slot` is filled. + * + * @tparam T Slot content type + * + * @param slot The slot + * + * @return `true` if slot is filled + */ + __device__ constexpr bool operator()(T const& slot) const noexcept + { + return not cuco::detail::bitwise_compare(empty_sentinel_, slot); + } +}; + +} // namespace detail +} // namespace static_set_ns +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/kernels.cuh b/include/cuco/detail/static_set/kernels.cuh new file mode 100644 index 000000000..72744f2b4 --- /dev/null +++ b/include/cuco/detail/static_set/kernels.cuh @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include + +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace static_set_ns { +namespace detail { + +/** + * @brief Finds the equivalent set elements of all keys in the range `[first, last)`. + * + * If the key `*(first + i)` has a match in the set, copies its matched element to `(output_begin + + * i)`. Else, copies the empty key sentinel. Uses the CUDA Cooperative Groups API to leverage groups + * of multiple threads to find each key. This provides a significant boost in throughput compared to + * the non Cooperative Group `find` at moderate to high load factors. + * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize The size of the thread block + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the set's `key_type` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of keys + * @param n Number of keys to query + * @param output_begin Beginning of the sequence of matched elements retrieved for each key + * @param ref Non-owning set device ref used to access the slot storage + */ +template +__global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) +{ + namespace cg = cooperative_groups; + + auto const block = cg::this_thread_block(); + auto const thread_idx = block.thread_rank(); + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + __shared__ typename Ref::key_type output_buffer[BlockSize / CGSize]; + + while (idx - thread_idx < n) { // the whole thread block falls into the same iteration + if (idx < n) { + auto const key = *(first + idx); + if constexpr (CGSize == 1) { + auto const found = ref.find(key); + /* + * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased + * sector stores from L2 to 
global memory. By writing results to shared memory and then + * synchronizing before writing back to global, we no longer rely on L1, preventing the + * increase in sector stores from L2 to global and improving performance. + */ + output_buffer[thread_idx] = found == ref.end() ? ref.empty_key_sentinel() : *found; + block.sync(); + *(output_begin + idx) = output_buffer[thread_idx]; + } else { + auto const tile = cg::tiled_partition(block); + auto const found = ref.find(tile, key); + + if (tile.thread_rank() == 0) { + *(output_begin + idx) = found == ref.end() ? ref.empty_key_sentinel() : *found; + } + } + } + idx += loop_stride; + } +} + +} // namespace detail +} // namespace static_set_ns +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl new file mode 100644 index 000000000..4898f3055 --- /dev/null +++ b/include/cuco/detail/static_set/static_set.inl @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuco { +namespace experimental { + +template +constexpr static_set::static_set( + Extent capacity, + empty_key empty_key_sentinel, + KeyEqual const& pred, + ProbingScheme const& probing_scheme, + Allocator const& alloc, + cuda_stream_ref stream) + : impl_{std::make_unique( + capacity, empty_key_sentinel, empty_key_sentinel, pred, probing_scheme, alloc, stream)} +{ +} + +template +void static_set::clear( + cuda_stream_ref stream) noexcept +{ + impl_->clear(stream); +} + +template +void static_set::clear_async( + cuda_stream_ref stream) noexcept +{ + impl_->clear_async(stream); +} + +template +template +static_set::size_type +static_set::insert( + InputIt first, InputIt last, cuda_stream_ref stream) +{ + return impl_->insert(first, last, ref(op::insert), stream); +} + +template +template +void static_set::insert_async( + InputIt first, InputIt last, cuda_stream_ref stream) noexcept +{ + impl_->insert_async(first, last, ref(op::insert), stream); +} + +template +template +static_set::size_type +static_set::insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) +{ + return impl_->insert_if(first, last, stencil, pred, ref(op::insert), stream); +} + +template +template +void static_set::insert_if_async( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream) noexcept +{ + impl_->insert_if_async(first, last, stencil, pred, ref(op::insert), stream); +} + +template +template +void static_set::contains( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + contains_async(first, last, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_set::contains_async( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const noexcept +{ + impl_->contains_async(first, last, output_begin, ref(op::contains), stream); +} 
+ +template +template +void static_set::contains_if( + InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream) const +{ + contains_if_async(first, last, stencil, pred, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_set::contains_if_async( + InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream) const noexcept +{ + impl_->contains_if_async(first, last, stencil, pred, output_begin, ref(op::contains), stream); +} + +template +template +void static_set::find( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + find_async(first, last, output_begin, stream); + stream.synchronize(); +} + +template +template +void static_set::find_async( + InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream) const +{ + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size); + + static_set_ns::detail::find + <<>>( + first, num_keys, output_begin, ref(op::find)); +} + +template +template +OutputIt static_set::retrieve_all( + OutputIt output_begin, cuda_stream_ref stream) const +{ + auto const begin = + thrust::make_transform_iterator(thrust::counting_iterator{0}, + detail::get_slot(impl_->storage_ref())); + auto const is_filled = static_set_ns::detail::slot_is_filled(this->empty_key_sentinel()); + + return impl_->retrieve_all(begin, output_begin, is_filled, stream); +} + +template +static_set::size_type +static_set::size( + cuda_stream_ref stream) const noexcept +{ + auto const is_filled = static_set_ns::detail::slot_is_filled(this->empty_key_sentinel()); + return impl_->size(is_filled, stream); +} + +template +constexpr auto +static_set::capacity() + const noexcept +{ + return impl_->capacity(); +} + +template +constexpr static_set::key_type 
+static_set::empty_key_sentinel() + const noexcept +{ + return impl_->empty_key_sentinel(); +} + +template +template +auto static_set::ref( + Operators...) const noexcept +{ + static_assert(sizeof...(Operators), "No operators specified"); + return ref_type{cuco::empty_key(this->empty_key_sentinel()), + impl_->key_eq(), + impl_->probing_scheme(), + impl_->storage_ref()}; +} +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/static_set/static_set_ref.inl b/include/cuco/detail/static_set/static_set_ref.inl new file mode 100644 index 000000000..4c3853971 --- /dev/null +++ b/include/cuco/detail/static_set/static_set_ref.inl @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +#include + +namespace cuco { +namespace experimental { + +template +__host__ __device__ constexpr static_set_ref< + Key, + Scope, + KeyEqual, + ProbingScheme, + StorageRef, + Operators...>::static_set_ref(cuco::empty_key empty_key_sentinel, + KeyEqual const& predicate, + ProbingScheme const& probing_scheme, + StorageRef storage_ref) noexcept + : impl_{empty_key_sentinel, probing_scheme, storage_ref}, + predicate_{empty_key_sentinel, predicate} +{ +} + +template +template +__host__ __device__ constexpr static_set_ref:: + static_set_ref( + static_set_ref&& + other) noexcept + : impl_{std::move(other.impl_)}, predicate_{std::move(other.predicate_)} +{ +} + +template +__host__ __device__ constexpr auto +static_set_ref::capacity() + const noexcept +{ + return impl_.capacity(); +} + +template +__host__ __device__ constexpr Key +static_set_ref::empty_key_sentinel() + const noexcept +{ + return predicate_.empty_sentinel_; +} + +template +template +auto static_set_ref::with( + NewOperators...) && noexcept +{ + return static_set_ref( + std::move(*this)); +} + +namespace detail { + +template +class operator_impl> { + using base_type = static_set_ref; + using ref_type = static_set_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Inserts an element. + * + * @param value The element to insert + * + * @return True if the given element is successfully inserted + */ + __device__ bool insert(value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = false; + return ref_.impl_.insert(value, ref_.predicate_); + } + + /** + * @brief Inserts an element. 
+ * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * + * @return True if the given element is successfully inserted + */ + __device__ bool insert(cooperative_groups::thread_block_tile const& group, + value_type const& value) noexcept + { + auto& ref_ = static_cast(*this); + auto constexpr has_payload = false; + return ref_.impl_.insert(group, value, ref_.predicate_); + } +}; + +template +class operator_impl> { + using base_type = static_set_ref; + using ref_type = static_set_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Returns a const_iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr const_iterator end() const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Returns an iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Inserts the given element into the set. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. 
+ * + * @param value The element to insert + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. + */ + __device__ thrust::pair insert_and_find(value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = false; + return ref_.impl_.insert_and_find(value, ref_.predicate_); + } + + /** + * @brief Inserts the given element into the set. + * + * @note This API returns a pair consisting of an iterator to the inserted element (or to the + * element that prevented the insertion) and a `bool` denoting whether the insertion took place or + * not. + * + * @param group The Cooperative Group used to perform group insert_and_find + * @param value The element to insert + * + * @return a pair consisting of an iterator to the element and a bool indicating whether the + * insertion is successful or not. + */ + __device__ thrust::pair insert_and_find( + cooperative_groups::thread_block_tile const& group, value_type const& value) noexcept + { + ref_type& ref_ = static_cast(*this); + auto constexpr has_payload = false; + return ref_.impl_.insert_and_find(group, value, ref_.predicate_); + } +}; + +template +class operator_impl> { + using base_type = static_set_ref; + using ref_type = static_set_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns + * false. 
+ * + * @tparam ProbeKey Probe key type + * + * @param key The key to search for + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains(ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.contains(key, ref_.predicate_); + } + + /** + * @brief Indicates whether the probe key `key` was inserted into the container. + * + * @note If the probe key `key` was inserted into the container, returns true. Otherwise, returns + * false. + * + * @tparam ProbeKey Probe key type + * + * @param group The Cooperative Group used to perform group contains + * @param key The key to search for + * + * @return A boolean indicating whether the probe key is present + */ + template + [[nodiscard]] __device__ bool contains( + cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.contains(group, key, ref_.predicate_); + } +}; + +template +class operator_impl> { + using base_type = static_set_ref; + using ref_type = static_set_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Returns a const_iterator to one past the last slot. + * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr const_iterator end() const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Returns an iterator to one past the last slot. 
+ * + * @note This API is available only when `find_tag` or `insert_and_find_tag` is present. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __host__ __device__ constexpr iterator end() noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.end(); + } + + /** + * @brief Finds an element in the set with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. + * + * @tparam ProbeKey Probe key type + * + * @param key The key to search for + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator find(ProbeKey const& key) const noexcept + { + // CRTP: cast `this` to the actual ref type + auto const& ref_ = static_cast(*this); + return ref_.impl_.find(key, ref_.predicate_); + } + + /** + * @brief Finds an element in the set with key equivalent to the probe key. + * + * @note Returns a un-incrementable input iterator to the element whose key is equivalent to + * `key`. If no such element exists, returns `end()`. 
+ * + * @tparam ProbeKey Probe key type + * + * @param group The Cooperative Group used to perform this operation + * @param key The key to search for + * + * @return An iterator to the position at which the equivalent key is stored + */ + template + [[nodiscard]] __device__ const_iterator find( + cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.impl_.find(group, key, ref_.predicate_); + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/aow_storage.inl b/include/cuco/detail/storage/aow_storage.inl new file mode 100644 index 000000000..c4b5fa8b6 --- /dev/null +++ b/include/cuco/detail/storage/aow_storage.inl @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace cuco { +namespace experimental { + +template +constexpr aow_storage::aow_storage( + Extent size, Allocator const& allocator) noexcept + : detail::aow_storage_base{size}, + allocator_{allocator}, + window_deleter_{capacity(), allocator_}, + windows_{allocator_.allocate(capacity()), window_deleter_} +{ +} + +template +constexpr aow_storage::window_type* +aow_storage::data() const noexcept +{ + return windows_.get(); +} + +template +constexpr aow_storage::allocator_type +aow_storage::allocator() const noexcept +{ + return allocator_; +} + +template +constexpr aow_storage::ref_type +aow_storage::ref() const noexcept +{ + return ref_type{this->window_extent(), this->data()}; +} + +template +void aow_storage::initialize(value_type key, + cuda_stream_ref stream) noexcept +{ + this->initialize_async(key, stream); + stream.synchronize(); +} + +template +void aow_storage::initialize_async( + value_type key, cuda_stream_ref stream) noexcept +{ + auto constexpr cg_size = 1; + auto constexpr stride = 4; + auto const grid_size = cuco::detail::grid_size(this->num_windows(), cg_size, stride); + + detail::initialize<<>>( + this->data(), this->num_windows(), key); +} + +template +__host__ __device__ constexpr aow_storage_ref::aow_storage_ref( + Extent size, window_type* windows) noexcept + : detail::aow_storage_base{size}, windows_{windows} +{ +} + +template +struct aow_storage_ref::iterator { + public: + using iterator_category = std::input_iterator_tag; ///< iterator category + using reference = value_type&; ///< iterator reference type + + /** + * @brief Constructs a device side input iterator of the given slot. 
+ * + * @param current The slot pointer + */ + __device__ constexpr explicit iterator(value_type* current) noexcept : current_{current} {} + + /** + * @brief Prefix increment operator + * + * @throw This code path should never be chosen. + * + * @return Current iterator + */ + __device__ constexpr iterator& operator++() noexcept + { + static_assert("Un-incrementable input iterator"); + } + + /** + * @brief Postfix increment operator + * + * @throw This code path should never be chosen. + * + * @return Current iterator + */ + __device__ constexpr iterator operator++(int32_t) noexcept + { + static_assert("Un-incrementable input iterator"); + } + + /** + * @brief Dereference operator + * + * @return Reference to the current slot + */ + __device__ constexpr reference operator*() const { return *current_; } + + /** + * @brief Access operator + * + * @return Pointer to the current slot + */ + __device__ constexpr value_type* operator->() const { return current_; } + + /** + * Equality operator + * + * @return True if two iterators are identical + */ + friend __device__ constexpr bool operator==(iterator const& lhs, iterator const& rhs) noexcept + { + return lhs.current_ == rhs.current_; + } + + /** + * Inequality operator + * + * @return True if two iterators are not identical + */ + friend __device__ constexpr bool operator!=(iterator const& lhs, iterator const& rhs) noexcept + { + return not(lhs == rhs); + } + + private: + value_type* current_{}; ///< Pointer to the current slot +}; + +template +__device__ constexpr aow_storage_ref::iterator +aow_storage_ref::end() noexcept +{ + return iterator{reinterpret_cast(this->data() + this->capacity())}; +} + +template +__device__ constexpr aow_storage_ref::const_iterator +aow_storage_ref::end() const noexcept +{ + return const_iterator{reinterpret_cast(this->data() + this->capacity())}; +} + +template +__device__ constexpr aow_storage_ref::window_type* +aow_storage_ref::data() noexcept +{ + return windows_; +} + +template 
+__device__ constexpr aow_storage_ref::window_type* +aow_storage_ref::data() const noexcept +{ + return windows_; +} + +template +__device__ constexpr aow_storage_ref::window_type +aow_storage_ref::operator[](size_type index) const noexcept +{ + return *reinterpret_cast( + __builtin_assume_aligned(this->data() + index, sizeof(value_type) * window_size)); +} + +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/aow_storage_base.cuh b/include/cuco/detail/storage/aow_storage_base.cuh new file mode 100644 index 000000000..5f3d84df4 --- /dev/null +++ b/include/cuco/detail/storage/aow_storage_base.cuh @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Window data structure type + * + * @tparam T Window slot type + * @tparam WindowSize Number of elements per window + */ +template +struct window : public cuda::std::array { + public: + static int32_t constexpr window_size = WindowSize; ///< Number of slots per window +}; + +/** + * @brief Base class of array of slot windows open addressing storage. + * + * @note This should NOT be used directly. 
+ * + * @tparam T Slot type + * @tparam WindowSize Number of slots in each window + * @tparam Extent Type of extent denoting the number of windows + */ +template +class aow_storage_base : public storage_base { + public: + /** + * @brief The number of elements (slots) processed per window. + */ + static constexpr int32_t window_size = WindowSize; + + using extent_type = typename storage_base::extent_type; ///< Storage extent type + using size_type = typename storage_base::size_type; ///< Storage size type + + using value_type = T; ///< Slot type + using window_type = window; ///< Slot window type + + /** + * @brief Constructor of AoW base storage. + * + * @param size Number of windows to store + */ + __host__ __device__ explicit constexpr aow_storage_base(Extent size) : storage_base{size} + { + } + + /** + * @brief Gets the total number of slot windows in the current storage. + * + * @return The total number of slot windows + */ + [[nodiscard]] __host__ __device__ constexpr size_type num_windows() const noexcept + { + return storage_base::capacity(); + } + + /** + * @brief Gets the total number of slots in the current storage. + * + * @return The total number of slots + */ + [[nodiscard]] __host__ __device__ constexpr size_type capacity() const noexcept + { + return storage_base::capacity() * window_size; + } + + /** + * @brief Gets the window extent of the current storage. + * + * @return The window extent. + */ + [[nodiscard]] __host__ __device__ constexpr extent_type window_extent() const noexcept + { + return storage_base::extent(); + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/counter_storage.cuh b/include/cuco/detail/storage/counter_storage.cuh new file mode 100644 index 000000000..bb36b15e2 --- /dev/null +++ b/include/cuco/detail/storage/counter_storage.cuh @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Device atomic counter storage class. + * + * @tparam SizeType Type of storage size + * @tparam Scope The scope in which the counter will be used by individual threads + * @tparam Allocator Type of allocator used for device storage + */ +template +class counter_storage : public storage_base> { + public: + using storage_base>::capacity; ///< Storage size + + using size_type = SizeType; ///< Size type + using value_type = cuda::atomic; ///< Type of the counter + using allocator_type = typename std::allocator_traits::rebind_alloc< + value_type>; ///< Type of the allocator to (de)allocate counter + using counter_deleter_type = + custom_deleter; ///< Type of counter deleter + + /** + * @brief Constructor of counter storage. + * + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr counter_storage(Allocator const& allocator) + : storage_base>{cuco::experimental::extent{}}, + allocator_{allocator}, + counter_deleter_{this->capacity(), allocator_}, + counter_{allocator_.allocate(this->capacity()), counter_deleter_} + { + } + + /** + * @brief Asynchronously resets counter to zero. 
+ * + * @param stream CUDA stream used to reset + */ + void reset(cuda_stream_ref stream) + { + static_assert(sizeof(size_type) == sizeof(value_type)); + CUCO_CUDA_TRY(cudaMemsetAsync(this->data(), 0, sizeof(value_type), stream)); + } + + /** + * @brief Gets device atomic counter pointer. + * + * @return Pointer to the device atomic counter + */ + [[nodiscard]] constexpr value_type* data() noexcept { return counter_.get(); } + + /** + * @brief Gets device atomic counter pointer. + * + * @return Pointer to the device atomic counter + */ + [[nodiscard]] constexpr value_type* data() const noexcept { return counter_.get(); } + + /** + * @brief Atomically obtains the value of the device atomic counter and copies it to the host. + * + * @note This API synchronizes the given `stream`. + * + * @param stream CUDA stream used to copy device value to the host + * @return Value of the atomic counter + */ + [[nodiscard]] constexpr size_type load_to_host(cuda_stream_ref stream) const + { + size_type h_count; + CUCO_CUDA_TRY( + cudaMemcpyAsync(&h_count, this->data(), sizeof(size_type), cudaMemcpyDeviceToHost, stream)); + stream.synchronize(); + return h_count; + } + + private: + allocator_type allocator_; ///< Allocator used to (de)allocate counter + counter_deleter_type counter_deleter_; ///< Custom counter deleter + std::unique_ptr counter_; ///< Pointer to counter storage +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/kernels.cuh b/include/cuco/detail/storage/kernels.cuh new file mode 100644 index 000000000..2a5868f61 --- /dev/null +++ b/include/cuco/detail/storage/kernels.cuh @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Initializes each slot in the window storage to contain `value`. + * + * @tparam WindowT Window type + * + * @param windows Pointer to flat storage for windows + * @param n Number of input windows + * @param value Value to which all values in `slots` are initialized + */ +template +__global__ void initialize(WindowT* windows, + cuco::detail::index_type n, + typename WindowT::value_type value) +{ + auto const loop_stride = cuco::detail::grid_stride(); + auto idx = cuco::detail::global_thread_id(); + + while (idx < n) { + auto& window_slots = *(windows + idx); +#pragma unroll + for (auto& slot : window_slots) { + slot = value; + } + idx += loop_stride; + } +} + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/storage.cuh b/include/cuco/detail/storage/storage.cuh new file mode 100644 index 000000000..4dda179c9 --- /dev/null +++ b/include/cuco/detail/storage/storage.cuh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+namespace cuco {
+namespace experimental {
+namespace detail {
+/**
+ * @brief Intermediate class internally used by data structures
+ *
+ * @tparam StorageImpl Storage implementation class
+ * @tparam T Storage element type
+ * @tparam Extent Type of extent denoting number of windows
+ * @tparam Allocator Type of allocator used for device storage
+ */
+template
+class storage : StorageImpl::template impl {
+ public:
+ /// Storage implementation type
+ using impl_type = typename StorageImpl::template impl;
+ using ref_type = typename impl_type::ref_type; ///< Storage ref type
+ using value_type = typename impl_type::value_type; ///< Storage value type
+ using allocator_type = typename impl_type::allocator_type; ///< Storage allocator type
+
+ /// Number of elements per window
+ static constexpr int window_size = impl_type::window_size;
+
+ using impl_type::allocator;
+ using impl_type::capacity;
+ using impl_type::data;
+ using impl_type::initialize;
+ using impl_type::initialize_async;
+ using impl_type::num_windows;
+ using impl_type::ref;
+
+ /**
+ * @brief Constructs storage.
+ * + * @param size Number of slots to (de)allocate + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr storage(Extent size, Allocator const& allocator) : impl_type{size, allocator} + { + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/storage/storage_base.cuh b/include/cuco/detail/storage/storage_base.cuh new file mode 100644 index 000000000..98eed6c13 --- /dev/null +++ b/include/cuco/detail/storage/storage_base.cuh @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { +namespace detail { +/** + * @brief Custom deleter for unique pointer. + * + * @tparam SizeType Type of device storage size + * @tparam Allocator Type of allocator used for device storage + */ +template +struct custom_deleter { + using pointer = typename Allocator::value_type*; ///< Value pointer type + + /** + * @brief Constructor of custom deleter. 
+ * + * @param size Number of values to deallocate + * @param allocator Allocator used for deallocating device storage + */ + explicit constexpr custom_deleter(SizeType size, Allocator& allocator) + : size_{size}, allocator_{allocator} + { + } + + /** + * @brief Operator for deallocation + * + * @param ptr Pointer to the first value for deallocation + */ + void operator()(pointer ptr) { allocator_.deallocate(ptr, size_); } + + SizeType size_; ///< Number of values to delete + Allocator& allocator_; ///< Allocator used deallocating values +}; + +/** + * @brief Base class of open addressing storage. + * + * This class should not be used directly. + * + * @tparam Extent Type of extent denoting storage capacity + */ +template +class storage_base { + public: + using extent_type = Extent; ///< Storage extent type + using size_type = typename extent_type::value_type; ///< Storage size type + + /** + * @brief Constructor of base storage. + * + * @param size Number of elements to (de)allocate + */ + __host__ __device__ explicit constexpr storage_base(Extent size) : extent_{size} {} + + /** + * @brief Gets the total number of elements in the current storage. + * + * @return The total number of elements + */ + [[nodiscard]] __host__ __device__ constexpr size_type capacity() const noexcept + { + return static_cast(extent_); + } + + /** + * @brief Gets the extent of the current storage. + * + * @return The extent. + */ + [[nodiscard]] __host__ __device__ constexpr extent_type extent() const noexcept + { + return extent_; + } + + protected: + extent_type extent_; ///< Total number of elements +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/traits.hpp b/include/cuco/detail/traits.hpp new file mode 100644 index 000000000..313f95430 --- /dev/null +++ b/include/cuco/detail/traits.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +#include +#include + +#include + +#include + +namespace cuco::detail { + +template +struct is_std_pair_like : cuda::std::false_type { +}; + +template +struct is_std_pair_like(cuda::std::declval())), + decltype(std::get<1>(cuda::std::declval()))>> + : cuda::std:: + conditional_t::value == 2, cuda::std::true_type, cuda::std::false_type> { +}; + +template +struct is_thrust_pair_like_impl : cuda::std::false_type { +}; + +template +struct is_thrust_pair_like_impl< + T, + cuda::std::void_t(cuda::std::declval())), + decltype(thrust::get<1>(cuda::std::declval()))>> + : cuda::std::conditional_t::value == 2, + cuda::std::true_type, + cuda::std::false_type> { +}; + +template +struct is_thrust_pair_like + : is_thrust_pair_like_impl()))>> { +}; + +} // namespace cuco::detail diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh new file mode 100644 index 000000000..8383669fc --- /dev/null +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -0,0 +1,375 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +#include + +#include +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/** + * @brief Struct to store ranks of bits at 256-bit intervals (or blocks) + * + * This struct encodes a list of four rank values using base + offset format + * e.g. [1000, 1005, 1006, 1009] is stored as base = 1000, offsets = [5, 6, 9] + * base uses 40 bits, split between one uint32_t and one uint8_t + * each offset uses 8 bits + */ +struct rank { + uint32_t base_hi_; ///< Upper 32 bits of base + uint8_t base_lo_; ///< Lower 8 bits of base + cuda::std::array offsets_; ///< Offsets for 64-bit sub-intervals, relative to base + + /** + * @brief Gets base rank of current 256-bit interval + * + * @return The base rank + */ + __host__ __device__ constexpr uint64_t base() const noexcept + { + return (static_cast(base_hi_) << CHAR_BIT) | base_lo_; + } + + /** + * @brief Sets base rank of current 256-bit interval + * + * @param base Base rank + */ + __host__ __device__ constexpr void set_base(uint64_t base) noexcept + { + base_hi_ = static_cast(base >> CHAR_BIT); + base_lo_ = static_cast(base); + } +}; + +/** + * @brief Bitset class with rank and select index structures + * + * In addition to standard bitset set/test operations, this class provides + * rank and select operation API. It maintains index structures to make both these + * new operations close to constant time. 
+ *
+ * Current limitations:
+ * - Stream controls are partially supported due to the use of `thrust::device_vector` as storage
+ * - Device ref doesn't support modifiers like `set`, `reset`, etc.
+ *
+ * @tparam Allocator Type of allocator used for device storage
+ */
+// TODO: have to use device_malloc_allocator for now otherwise the container cannot grow
+template >
+class dynamic_bitset {
+ public:
+ using size_type = std::size_t; ///< size type to specify bit index
+ using word_type = uint64_t; ///< word type
+ /// Type of the allocator to (de)allocate words
+ using allocator_type = typename std::allocator_traits::rebind_alloc;
+
+ /// Number of words per block. Note this is a tradeoff between space efficiency and perf.
+ static constexpr size_type words_per_block = 4;
+ /// Number of bits in a word
+ static constexpr size_type bits_per_word = sizeof(word_type) * CHAR_BIT;
+ /// Number of bits in a block
+ static constexpr size_type bits_per_block = words_per_block * bits_per_word;
+
+ /**
+ * @brief Constructs an empty bitset
+ *
+ * @param allocator Allocator used for allocating device storage
+ */
+ constexpr dynamic_bitset(Allocator const& allocator = Allocator{});
+
+ /**
+ * @brief Appends the given element `value` to the end of the bitset
+ *
+ * This API may involve data reallocation if the current storage is exhausted.
+ *
+ * @param value Boolean value of the new bit to be added
+ */
+ constexpr void push_back(bool value) noexcept;
+
+ /**
+ * @brief Sets the target bit indexed by `index` to a specified `value`.
+ *
+ * @param index Position of bit to be modified
+ * @param value New value of the target bit
+ */
+ constexpr void set(size_type index, bool value) noexcept;
+
+ /**
+ * @brief Sets the last bit to a specified value
+ *
+ * @param value New value of the last bit
+ */
+ constexpr void set_last(bool value) noexcept;
+
+ /**
+ * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the
+ * boolean value at position `keys_begin[i]` to `output_begin[i]`.
+ *
+ * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's
+ * `size_type`
+ * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean
+ * type
+ *
+ * @param keys_begin Begin iterator to keys list whose values are queried
+ * @param keys_end End iterator to keys list
+ * @param outputs_begin Begin iterator to outputs of test operation
+ * @param stream Stream to execute test kernel
+ */
+ template
+ constexpr void test(KeyIt keys_begin,
+ KeyIt keys_end,
+ OutputIt outputs_begin,
+ cuda_stream_ref stream = {}) noexcept;
+
+ /**
+ * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores total
+ * count of `1` bits preceding (but not including) position `keys_begin[i]` to `output_begin[i]`.
+ * + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param keys_begin Begin iterator to keys list whose ranks are queried + * @param keys_end End iterator to keys list + * @param outputs_begin Begin iterator to outputs ranks list + * @param stream Stream to execute ranks kernel + */ + template + constexpr void rank(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) noexcept; + + /** + * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the + * position of `keys_begin[i]`th `1` bit to `output_begin[i]`. + * + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param keys_begin Begin iterator to keys list whose select values are queried + * @param keys_end End iterator to keys list + * @param outputs_begin Begin iterator to outputs selects list + * @param stream Stream to execute selects kernel + */ + template + constexpr void select(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) noexcept; + + using rank_type = cuco::experimental::detail::rank; ///< Rank type + + /** + *@brief Struct to hold all storage refs needed by reference + */ + // TODO: this is not a real ref type, to be changed + struct storage_ref_type { + const word_type* words_ref_; ///< Words ref + + const rank_type* ranks_true_ref_; ///< Ranks ref for 1 bits + const size_type* selects_true_ref_; ///< Selects ref for 1 bits + + const rank_type* ranks_false_ref_; ///< Ranks ref for 0 bits + const size_type* selects_false_ref_; ///< Selects ref 0 bits + }; + + /** + * @brief Device non-owning reference type of dynamic_bitset + 
*/ + class reference { + public: + /** + * @brief Constructs a reference + * + * @param storage Struct with non-owning refs to bitset storage arrays + */ + __host__ __device__ explicit constexpr reference(storage_ref_type storage) noexcept; + + /** + * @brief Access value of a single bit + * + * @param key Position of bit + * + * @return Value of bit at position specified by key + */ + [[nodiscard]] __device__ constexpr bool test(size_type key) const noexcept; + + /** + * @brief Access a single word of internal storage + * + * @param word_id Index of word + * + * @return Word at position specified by index + */ + [[nodiscard]] __device__ constexpr word_type word(size_type word_id) const noexcept; + + /** + * @brief Find position of first set bit starting from a given position (inclusive) + * + * @param key Position of starting bit + * + * @return Index of next set bit + */ + [[nodiscard]] __device__ size_type find_next(size_type key) const noexcept; + + /** + * @brief Find number of set bits (rank) in all positions before the input position (exclusive) + * + * @param key Input bit position + * + * @return Rank of input position + */ + [[nodiscard]] __device__ constexpr size_type rank(size_type key) const noexcept; + + /** + * @brief Find position of Nth set (1) bit counting from start + * + * @param count Input N + * + * @return Position of Nth set bit + */ + [[nodiscard]] __device__ constexpr size_type select(size_type count) const noexcept; + + /** + * @brief Find position of Nth not-set (0) bit counting from start + * + * @param count Input N + * + * @return Position of Nth not-set bit + */ + [[nodiscard]] __device__ constexpr size_type select_false(size_type count) const noexcept; + + private: + /** + * @brief Helper function for select operation that computes an initial rank estimate + * + * @param count Input count for which select operation is being performed + * @param selects Selects array + * @param ranks Ranks array + * + * @return index in ranks which 
corresponds to highest rank less than count (least upper bound) + */ + template + [[nodiscard]] __device__ constexpr size_type initial_rank_estimate( + size_type count, const SelectsRef& selects, const RanksRef& ranks) const noexcept; + + /** + * @brief Subtract rank estimate from input count and return an increment to word_id + * + * @tparam Rank type + * + * @param count Input count that will be updated + * @param rank Initial rank estimate for count + * + * @return Increment to word_id based on rank values + */ + template + [[nodiscard]] __device__ constexpr size_type subtract_rank_from_count(size_type& count, + Rank rank) const noexcept; + + /** + * @brief Find position of Nth set bit in a 64-bit word + * + * @param N Input count + * + * @return Position of Nth set bit + */ + [[nodiscard]] __device__ size_type select_bit_in_word(size_type N, + word_type word) const noexcept; + + storage_ref_type storage_; ///< Non-owning storage + }; + + using ref_type = reference; ///< Non-owning container ref type + + /** + * @brief Gets non-owning device ref of the current object + * + * @return Device ref of the current `dynamic_bitset` object + */ + [[nodiscard]] constexpr ref_type ref() const noexcept; + + /** + * @brief Gets the number of bits dynamic_bitset holds + * + * @return Number of bits dynamic_bitset holds + */ + [[nodiscard]] constexpr size_type size() const noexcept; + + private: + /// Type of the allocator to (de)allocate ranks + using rank_allocator_type = typename std::allocator_traits::rebind_alloc; + /// Type of the allocator to (de)allocate indices + using size_allocator_type = typename std::allocator_traits::rebind_alloc; + + allocator_type allocator_; ///< Words allocator + size_type n_bits_; ///< Number of bits dynamic_bitset currently holds + bool is_built_; ///< Flag indicating whether the rank and select indices are built or not + + /// Words vector that represents all bits + thrust::device_vector words_; + /// Rank values for every 256-th bit 
(4-th word) + thrust::device_vector ranks_true_; + /// Same as ranks_ but for `0` bits + thrust::device_vector ranks_false_; + /// Block indices of (0, 256, 512...)th `1` bit + thrust::device_vector selects_true_; + /// Same as selects_, but for `0` bits + thrust::device_vector selects_false_; + + /** + * @brief Builds indexes for rank and select + * + * @param stream Stream to execute kernels + */ + constexpr void build(cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Populates rank and select indexes for true or false bits + * + * @param ranks Output array of ranks + * @param selects Output array of selects + * @param flip_bits If true, negate bits to construct indexes for false bits + * @param stream Stream to execute kernels + */ + constexpr void build_ranks_and_selects( + thrust::device_vector& ranks, + thrust::device_vector& selects, + bool flip_bits, + cuda_stream_ref stream = {}); +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl new file mode 100644 index 000000000..d56ef9d7c --- /dev/null +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -0,0 +1,404 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +template +constexpr dynamic_bitset::dynamic_bitset(Allocator const& allocator) + : allocator_{allocator}, + n_bits_{0}, + is_built_{false}, + words_{allocator}, + ranks_true_{allocator}, + ranks_false_{allocator}, + selects_true_{allocator}, + selects_false_{allocator} +{ +} + +template +constexpr void dynamic_bitset::push_back(bool bit) noexcept +{ + if (n_bits_ % bits_per_block == 0) { + words_.resize(words_.size() + words_per_block); // Extend storage by one block + } + + set(n_bits_++, bit); +} + +template +constexpr void dynamic_bitset::set(size_type index, bool bit) noexcept +{ + is_built_ = false; + size_type word_id = index / bits_per_word; + size_type bit_id = index % bits_per_word; + if (bit) { + words_[word_id] |= 1UL << bit_id; + } else { + words_[word_id] &= ~(1UL << bit_id); + } +} + +template +constexpr void dynamic_bitset::set_last(bool bit) noexcept +{ + set(n_bits_ - 1, bit); +} + +template +template +constexpr void dynamic_bitset::test(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) noexcept + +{ + build(); + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys); + + bitset_test_kernel<<>>( + ref(), keys_begin, outputs_begin, num_keys); +} + +template +template +constexpr void dynamic_bitset::rank(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) noexcept +{ + build(); + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys); + + bitset_rank_kernel<<>>( + ref(), keys_begin, outputs_begin, num_keys); +} + +template +template +constexpr void dynamic_bitset::select(KeyIt keys_begin, + KeyIt keys_end, + 
OutputIt outputs_begin, + cuda_stream_ref stream) noexcept + +{ + build(); + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys); + + bitset_select_kernel<<>>( + ref(), keys_begin, outputs_begin, num_keys); +} + +template +constexpr void dynamic_bitset::build_ranks_and_selects( + thrust::device_vector& ranks, + thrust::device_vector& selects, + bool flip_bits, + cuda_stream_ref stream) +{ + if (n_bits_ == 0) { return; } + + // Step 1. Compute prefix sum of per-word bit counts + // Population counts for each word + size_type const num_words = words_.size(); + // Sized to have one extra entry for subsequent prefix sum + auto const bit_counts_size = num_words + 1; + + thrust::device_vector bit_counts(num_words + 1, this->allocator_); + auto const bit_counts_begin = thrust::raw_pointer_cast(bit_counts.data()); + + auto grid_size = cuco::detail::grid_size(num_words); + bit_counts_kernel<<>>( + thrust::raw_pointer_cast(words_.data()), bit_counts_begin, num_words, flip_bits); + + std::size_t temp_storage_bytes = 0; + using temp_allocator_type = typename std::allocator_traits::rebind_alloc; + auto temp_allocator = temp_allocator_type{this->allocator_}; + + CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, bit_counts_begin, bit_counts_begin, bit_counts_size, stream)); + + // Allocate temporary storage + auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum(thrust::raw_pointer_cast(d_temp_storage), + temp_storage_bytes, + bit_counts_begin, + bit_counts_begin, + bit_counts_size, + stream)); + + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); + + // Step 2. 
Compute ranks + auto const num_blocks = (num_words - 1) / words_per_block + 2; + ranks.resize(num_blocks); + + grid_size = cuco::detail::grid_size(num_blocks); + encode_ranks_from_prefix_bit_counts<<>>( + bit_counts_begin, + thrust::raw_pointer_cast(ranks.data()), + num_words, + num_blocks, + words_per_block); + + // Step 3. Compute selects + thrust::device_vector select_markers(num_blocks, + this->allocator_); + auto const select_markers_begin = thrust::raw_pointer_cast(select_markers.data()); + + mark_blocks_with_select_entries<<>>( + bit_counts_begin, select_markers_begin, num_blocks, words_per_block, bits_per_block); + + auto d_sum = reinterpret_cast(thrust::raw_pointer_cast( + std::allocator_traits::allocate(temp_allocator, sizeof(size_type)))); + CUCO_CUDA_TRY(cub::DeviceReduce::Sum( + nullptr, temp_storage_bytes, select_markers_begin, d_sum, num_blocks, stream)); + + d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceReduce::Sum(thrust::raw_pointer_cast(d_temp_storage), + temp_storage_bytes, + select_markers_begin, + d_sum, + num_blocks, + stream)); + + size_type num_selects{}; + CUCO_CUDA_TRY( + cudaMemcpyAsync(&num_selects, d_sum, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); + stream.synchronize(); + std::allocator_traits::deallocate( + temp_allocator, thrust::device_ptr{reinterpret_cast(d_sum)}, sizeof(size_type)); + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); + + selects.resize(num_selects); + + auto const select_begin = thrust::raw_pointer_cast(selects.data()); + + CUCO_CUDA_TRY(cub::DeviceSelect::Flagged(nullptr, + temp_storage_bytes, + thrust::make_counting_iterator(0UL), + select_markers_begin, + select_begin, + thrust::make_discard_iterator(), + num_blocks, + stream)); + + d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceSelect::Flagged(thrust::raw_pointer_cast(d_temp_storage), + temp_storage_bytes, + thrust::make_counting_iterator(0UL), + 
select_markers_begin, + select_begin, + thrust::make_discard_iterator(), + num_blocks, + stream)); + + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); +} + +template +constexpr void dynamic_bitset::build(cuda_stream_ref stream) noexcept +{ + if (not is_built_) { + build_ranks_and_selects(ranks_true_, selects_true_, false, stream); // 1 bits + build_ranks_and_selects(ranks_false_, selects_false_, true, stream); // 0 bits + is_built_ = true; + } +} + +template +constexpr dynamic_bitset::ref_type dynamic_bitset::ref() const noexcept +{ + return ref_type{storage_ref_type{thrust::raw_pointer_cast(words_.data()), + thrust::raw_pointer_cast(ranks_true_.data()), + thrust::raw_pointer_cast(selects_true_.data()), + thrust::raw_pointer_cast(ranks_false_.data()), + thrust::raw_pointer_cast(selects_false_.data())}}; +} + +template +constexpr dynamic_bitset::size_type dynamic_bitset::size() const noexcept +{ + return n_bits_; +} + +// Device reference implementations + +template +__host__ __device__ constexpr dynamic_bitset::reference::reference( + storage_ref_type storage) noexcept + : storage_{storage} +{ +} + +template +__device__ constexpr bool dynamic_bitset::reference::test(size_type key) const noexcept +{ + return (storage_.words_ref_[key / bits_per_word] >> (key % bits_per_word)) & 1UL; +} + +template +__device__ constexpr typename dynamic_bitset::word_type +dynamic_bitset::reference::word(size_type word_id) const noexcept +{ + return storage_.words_ref_[word_id]; +} + +template +__device__ typename dynamic_bitset::size_type +dynamic_bitset::reference::find_next(size_type key) const noexcept +{ + size_type word_id = key / bits_per_word; + size_type bit_id = key % bits_per_word; + word_type word = storage_.words_ref_[word_id]; + word &= ~(0UL) << bit_id; + while (word == 0) { + word = storage_.words_ref_[++word_id]; + } + return word_id * bits_per_word + __ffsll(word) - 1; // cuda intrinsic +} + +template +__device__ constexpr typename 
dynamic_bitset::size_type +dynamic_bitset::reference::rank(size_type key) const noexcept +{ + size_type word_id = key / bits_per_word; + size_type bit_id = key % bits_per_word; + size_type rank_id = word_id / words_per_block; + size_type offset_id = word_id % words_per_block; + + auto rank = storage_.ranks_true_ref_[rank_id]; + size_type n = rank.base(); + + if (offset_id != 0) { n += rank.offsets_[offset_id - 1]; } + + n += cuda::std::popcount(storage_.words_ref_[word_id] & ((1UL << bit_id) - 1)); + + return n; +} + +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::select(size_type count) const noexcept +{ + auto rank_id = initial_rank_estimate(count, storage_.selects_true_ref_, storage_.ranks_true_ref_); + auto rank = storage_.ranks_true_ref_[rank_id]; + + size_type word_id = rank_id * words_per_block; + word_id += subtract_rank_from_count(count, rank); + + return word_id * bits_per_word + select_bit_in_word(count, storage_.words_ref_[word_id]); +} + +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::select_false(size_type count) const noexcept +{ + auto rank_id = + initial_rank_estimate(count, storage_.selects_false_ref_, storage_.ranks_false_ref_); + auto rank = storage_.ranks_false_ref_[rank_id]; + + size_type word_id = rank_id * words_per_block; + word_id += subtract_rank_from_count(count, rank); + + return word_id * bits_per_word + select_bit_in_word(count, ~(storage_.words_ref_[word_id])); +} + +template +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::initial_rank_estimate(size_type count, + SelectsRef const& selects, + RanksRef const& ranks) const noexcept +{ + size_type block_id = count / (bits_per_word * words_per_block); + size_type begin = selects[block_id]; + size_type end = selects[block_id + 1] + 1UL; + + if (begin + 10 >= end) { // Linear search + while (count >= ranks[begin + 1].base()) { + ++begin; + } + } else { 
// Binary search + while (begin + 1 < end) { + size_type middle = (begin + end) / 2; + if (count < ranks[middle].base()) { + end = middle; + } else { + begin = middle; + } + } + } + return begin; +} + +template +template +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::subtract_rank_from_count(size_type& count, + Rank rank) const noexcept +{ + count -= rank.base(); + + bool a0 = count >= rank.offsets_[0]; + bool a1 = count >= rank.offsets_[1]; + bool a2 = count >= rank.offsets_[2]; + size_type inc = a0 + a1 + a2; + + count -= (inc > 0) * rank.offsets_[inc - (inc > 0)]; + + return inc; +} + +template +__device__ typename dynamic_bitset::size_type +dynamic_bitset::reference::select_bit_in_word(size_type N, word_type word) const noexcept +{ + for (size_type pos = 0; pos < N; pos++) { + word &= word - 1; + } + return __ffsll(word & -word) - 1; // cuda intrinsic +} +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh new file mode 100644 index 000000000..c92ab60b2 --- /dev/null +++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh @@ -0,0 +1,240 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include + +namespace cuco { +namespace experimental { +namespace detail { + +/* + * @brief Test bits for a range of keys + * + * @tparam BitsetRef Bitset reference type + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean + * type + * + * @param ref Bitset ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitset_test_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + outputs[key_id] = ref.test(keys[key_id]); + key_id += stride; + } +} + +/* + * @brief Gather rank values for a range of keys + * + * @tparam BitsetRef Bitset reference type + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param ref Bitset ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitset_rank_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + outputs[key_id] = ref.rank(keys[key_id]); + key_id += stride; + } +} + +/* + * @brief Gather select values for a range of keys + * + * @tparam BitsetRef Bitset reference type + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt 
Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param ref Bitset ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitset_select_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + outputs[key_id] = ref.select(keys[key_id]); + key_id += stride; + } +} + +/* + * @brief Computes number of set or not-set bits in each word + * + * @tparam WordType Word type + * @tparam SizeType Size type + * + * @param words Input array of words + * @param bit_counts Output array of per-word bit counts + * @param num_words Number of words + * @param flip_bits Boolean to request negation of words before counting bits + */ +template +__global__ void bit_counts_kernel(WordType const* words, + SizeType* bit_counts, + cuco::detail::index_type num_words, + bool flip_bits) +{ + auto word_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (word_id < num_words) { + auto word = words[word_id]; + bit_counts[word_id] = cuda::std::popcount(flip_bits ? ~word : word); + word_id += stride; + } +} + +/* + * @brief Compute rank values at block size intervals. + * + * ranks[i] = Number of set bits in [0, i) range + * This kernel transforms prefix sum array of per-word bit counts + * into base-delta encoding style of `rank` struct. + * Since prefix sum is available, there are no dependencies across blocks. 
+
+ * @tparam SizeType Size type
+ *
+ * @param prefix_bit_counts Prefix sum array of per-word bit counts
+ * @param ranks Output array of ranks
+ * @param num_words Length of input array
+ * @param num_blocks Length of output array
+ * @param words_per_block Number of words in each block
+ */
+template 
+__global__ void encode_ranks_from_prefix_bit_counts(const SizeType* prefix_bit_counts,
+                                                    rank* ranks,
+                                                    SizeType num_words,
+                                                    SizeType num_blocks,
+                                                    SizeType words_per_block)
+{
+  auto rank_id      = cuco::detail::global_thread_id();
+  auto const stride = cuco::detail::grid_stride();
+
+  while (rank_id < num_blocks) {
+    SizeType word_id = rank_id * words_per_block;
+
+    // Set base value of rank
+    auto& rank = ranks[rank_id];
+    rank.set_base(prefix_bit_counts[word_id]);
+
+    if (rank_id < num_blocks - 1) {
+      // For each subsequent word in this block, compute deltas from base
+      for (SizeType block_offset = 0; block_offset < words_per_block - 1; block_offset++) {
+        auto delta = prefix_bit_counts[word_id + block_offset + 1] - prefix_bit_counts[word_id];
+        rank.offsets_[block_offset] = delta;
+      }
+    }
+    rank_id += stride;
+  }
+}
+
+/*
+ * @brief Compute select values at block size intervals.
+ *
+ * selects[i] = Position of the (i + 1)-th set bit
+ * This kernel checks for blocks where the prefix sum crosses a multiple of `bits_per_block`.
+ * Such blocks are marked in the output boolean array
+ *
+ * @tparam SizeType Size type
+ *
+ * @param prefix_bit_counts Prefix sum array of per-word bit counts
+ * @param select_markers Output array indicating whether a block has a selects entry or not
+ * @param num_blocks Length of output array
+ * @param words_per_block Number of words in each block
+ * @param bits_per_block Number of bits in each block
+ */
+template 
+__global__ void mark_blocks_with_select_entries(SizeType const* prefix_bit_counts,
+                                                SizeType* select_markers,
+                                                SizeType num_blocks,
+                                                SizeType words_per_block,
+                                                SizeType bits_per_block)
+{
+  auto block_id     = cuco::detail::global_thread_id();
+  auto const stride = cuco::detail::grid_stride();
+
+  while (block_id < num_blocks) {
+    if (block_id == 0) {  // Block 0 always has a selects entry
+      select_markers[block_id] = 1;
+      block_id += stride;
+      continue;
+    }
+
+    select_markers[block_id] = 0;  // Always clear marker first
+    SizeType word_id    = block_id * words_per_block;
+    SizeType prev_count = prefix_bit_counts[word_id];
+
+    for (size_t block_offset = 1; block_offset <= words_per_block; block_offset++) {
+      SizeType count = prefix_bit_counts[word_id + block_offset];
+
+      // Selects entry is added when cumulative bitcount crosses a multiple of bits_per_block
+      if ((prev_count - 1) / bits_per_block != (count - 1) / bits_per_block) {
+        select_markers[block_id] = 1;
+        break;
+      }
+      prev_count = count;
+    }
+
+    block_id += stride;
+  }
+}
+
+}  // namespace detail
+}  // namespace experimental
+}  // namespace cuco
diff --git a/include/cuco/detail/utility/cuda.cuh b/include/cuco/detail/utility/cuda.cuh
new file mode 100644
index 000000000..6e5f13ff7
--- /dev/null
+++ b/include/cuco/detail/utility/cuda.cuh
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#pragma once + +#include + +namespace cuco { +namespace detail { + +/** + * @brief Returns the global thread index in a 1D scalar grid + * + * @return The global thread index + */ +__device__ static index_type global_thread_id() noexcept +{ + return index_type{threadIdx.x} + index_type{blockDim.x} * index_type{blockIdx.x}; +} + +/** + * @brief Returns the grid stride of a 1D grid + * + * @return The grid stride + */ +__device__ static index_type grid_stride() noexcept +{ + return index_type{gridDim.x} * index_type{blockDim.x}; +} + +} // namespace detail +} // namespace cuco diff --git a/include/cuco/detail/utility/cuda.hpp b/include/cuco/detail/utility/cuda.hpp new file mode 100644 index 000000000..f6a84df98 --- /dev/null +++ b/include/cuco/detail/utility/cuda.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + */ + +#pragma once + +#include + +namespace cuco { +namespace detail { + +using index_type = int64_t; ///< CUDA thread index type + +/// Default block size +constexpr int32_t default_block_size() noexcept { return 128; } +/// Default stride +constexpr int32_t default_stride() noexcept { return 1; } + +/** + * @brief Computes the desired 1D grid size with the given parameters + * + * @param num Number of elements to handle in the kernel + * @param cg_size Number of threads per CUDA Cooperative Group + * @param stride Number of elements to be handled by each thread + * @param block_size Number of threads in each thread block + * + * @return The resulting grid size + */ +constexpr auto grid_size(index_type num, + int32_t cg_size = 1, + int32_t stride = default_stride(), + int32_t block_size = default_block_size()) noexcept +{ + return int_div_ceil(cg_size * num, stride * block_size); +} + +} // namespace detail +} // namespace cuco diff --git a/include/cuco/detail/utility/math.hpp b/include/cuco/detail/utility/math.hpp new file mode 100644 index 000000000..47484d6ad --- /dev/null +++ b/include/cuco/detail/utility/math.hpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + */ + +#pragma once + +#include + +namespace cuco { +namespace detail { + +/** + * @brief Ceiling of an integer division + * + * @tparam T Type of dividend + * @tparam U Type of divisor + * + * @throw If `T` is not an integral type + * @throw If `U` is not an integral type + * + * @param dividend Numerator + * @param divisor Denominator + * + * @return Ceiling of the integer division + */ +template +constexpr T int_div_ceil(T dividend, U divisor) noexcept +{ + static_assert(std::is_integral_v); + static_assert(std::is_integral_v); + return (dividend + divisor - 1) / divisor; +} + +} // namespace detail +} // namespace cuco diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh index 5b02cef96..22675d496 100644 --- a/include/cuco/detail/utils.cuh +++ b/include/cuco/detail/utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,8 +15,14 @@ #pragma once +#include + #include +#include +#include +#include + namespace cuco { namespace detail { @@ -59,7 +65,7 @@ struct slot_to_tuple { */ template struct slot_is_filled { - Key empty_key_sentinel; ///< The value of the empty key sentinel + Key empty_key_sentinel_; ///< The value of the empty key sentinel /** * @brief Indicates if the target slot `s` is filled. @@ -72,8 +78,144 @@ struct slot_is_filled { template __device__ bool operator()(S const& s) { - return thrust::get<0>(s) != empty_key_sentinel; + return not cuco::detail::bitwise_compare(thrust::get<0>(s), empty_key_sentinel_); + } +}; + +/** + * @brief A strong type wrapper. + * + * @tparam T Type of the mapped values + */ +template +struct strong_type { + /** + * @brief Constructs a strong type. 
+ * + * @param v Value to be wrapped as a strong type + */ + __host__ __device__ explicit constexpr strong_type(T v) : value{v} {} + + /** + * @brief Implicit conversion operator to the underlying value. + * + * @return Underlying value + */ + __host__ __device__ constexpr operator T() const noexcept { return value; } + + T value; ///< Underlying value +}; + +/** + * @brief Converts a given hash value into a valid (positive) size type. + * + * @tparam SizeType The target type + * @tparam HashType The input type + * + * @return Converted hash value + */ +template +__host__ __device__ constexpr SizeType sanitize_hash(HashType hash) noexcept +{ + if constexpr (cuda::std::is_signed_v) { + return cuda::std::abs(static_cast(hash)); + } else { + return static_cast(hash); } +} + +/** + * @brief Gives value to use as alignment for a pair type that is at least the + * size of the sum of the size of the first type and second type, or 16, + * whichever is smaller. + */ +template +constexpr std::size_t pair_alignment() +{ + return std::min(std::size_t{16}, cuda::std::bit_ceil(sizeof(First) + sizeof(Second))); +} + +/** + * @brief Denotes the equivalent packed type based on the size of the object. + * + * @tparam N The size of the object + */ +template +struct packed { + using type = void; ///< `void` type by default +}; + +/** + * @brief Denotes the packed type when the size of the object is 8. + */ +template <> +struct packed { + using type = uint64_t; ///< Packed type as `uint64_t` if the size of the object is 8 +}; + +/** + * @brief Denotes the packed type when the size of the object is 4. + */ +template <> +struct packed { + using type = uint32_t; ///< Packed type as `uint32_t` if the size of the object is 4 +}; + +template +using packed_t = typename packed::type; + +/** + * @brief Indicates if a pair type can be packed. 
+ * + * When the size of the key,value pair being inserted into the hash table is + * equal in size to a type where atomicCAS is natively supported, it is more + * efficient to "pack" the pair and insert it with a single atomicCAS. + * + * Pair types whose key and value have the same object representation may be + * packed. Also, the `Pair` must not contain any padding bits otherwise + * accessing the packed value would be undefined. + * + * @tparam Pair The pair type that will be packed + * + * @return true If the pair type can be packed + * @return false If the pair type cannot be packed + */ +template +constexpr bool is_packable() +{ + return not std::is_void>::value and std::has_unique_object_representations_v; +} + +/** + * @brief Allows viewing a pair in a packed representation. + * + * Used as an optimization for inserting when a pair can be inserted with a + * single atomicCAS + */ +template +union pair_converter { + using packed_type = packed_t; ///< The packed pair type + packed_type packed; ///< The pair in the packed representation + Pair pair; ///< The pair in the pair representation + + /** + * @brief Constructs a pair converter by copying from `p` + * + * @tparam T Type that is convertible to `Pair` + * + * @param p The pair to copy from + */ + template + __device__ pair_converter(T&& p) : pair{p} + { + } + + /** + * @brief Constructs a pair converter by copying from `p` + * + * @param p The packed data to copy from + */ + __device__ pair_converter(packed_type p) : packed{p} {} }; } // namespace detail diff --git a/include/cuco/detail/utils.hpp b/include/cuco/detail/utils.hpp index 40697ff5c..86c045e3b 100644 --- a/include/cuco/detail/utils.hpp +++ b/include/cuco/detail/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,38 +15,60 @@ #pragma once +#include +#include + +#include +#include + namespace cuco { namespace detail { +template +constexpr inline index_type distance(Iterator begin, Iterator end) +{ + using category = typename std::iterator_traits::iterator_category; + static_assert(std::is_base_of_v, + "Input iterator should be a random access iterator."); + // `int64_t` instead of arch-dependant `long int` + return static_cast(std::distance(begin, end)); +} + /** - * @brief Compute the number of bits of a simple type. + * @brief C++17 constexpr backport of `std::lower_bound`. * - * @tparam T The type we want to infer its size in bits + * @tparam ForwardIt Type of input iterator + * @tparam T Type of `value` * - * @return Size of type T in bits + * @param first Iterator defining the start of the range to examine + * @param last Iterator defining the start of the range to examine + * @param value Value to compare the elements to + * + * @return Iterator pointing to the first element in the range [first, last) that does not satisfy + * element < value */ -template -static constexpr std::size_t type_bits() noexcept +template +constexpr ForwardIt lower_bound(ForwardIt first, ForwardIt last, const T& value) { - return sizeof(T) * CHAR_BIT; -} + using diff_type = typename std::iterator_traits::difference_type; -// safe division -#ifndef SDIV -#define SDIV(x, y) (((x) + (y)-1) / (y)) -#endif + ForwardIt it{}; + diff_type count = std::distance(first, last); + diff_type step{}; -template -auto get_grid_size(Kernel kernel, std::size_t block_size, std::size_t dynamic_smem_bytes = 0) -{ - int grid_size{-1}; - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&grid_size, kernel, block_size, dynamic_smem_bytes); - int dev_id{-1}; - cudaGetDevice(&dev_id); - int num_sms{-1}; - cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id); - grid_size *= num_sms; - return grid_size; + while (count > 0) { + it = first; + step = count / 2; + std::advance(it, step); + + if 
(static_cast(*it) < value) { + first = ++it; + count -= step + 1; + } else + count = step; + } + + return first; } } // namespace detail diff --git a/include/cuco/dynamic_map.cuh b/include/cuco/dynamic_map.cuh index a75512d3c..998ff3647 100644 --- a/include/cuco/dynamic_map.cuh +++ b/include/cuco/dynamic_map.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include #include @@ -43,8 +43,8 @@ namespace cuco { * concurrent insert and find) from threads in device code. * * Current limitations: - * - Requires keys that are Arithmetic - * - Does not support erasing keys + * - Requires keys and values that where `cuco::is_bitwise_comparable_v` is true + * - Comparisons against the "sentinel" values will always be done with bitwise comparisons. * - Capacity does not shrink automatically * - Requires the user to specify sentinel values for both key and mapped value * to indicate empty slots @@ -66,8 +66,8 @@ namespace cuco { * // within the second insert. * * dynamic_map m{100'000, - * sentinel::empty_key{empty_key_sentinel}, - * sentinel::empty_value{empty_value_sentinel}}; + * empty_key{empty_key_sentinel}, + * empty_value{empty_value_sentinel}}; * * // Create a sequence of pairs {{0,0}, {1,1}, ... 
{i,i}} * thrust::device_vector> pairs_0(50'000); @@ -101,22 +101,25 @@ class dynamic_map { static_assert(std::is_arithmetic::value, "Unsupported, non-arithmetic key type."); public: - using value_type = cuco::pair_type; ///< Type of key/value pairs + using value_type = cuco::pair; ///< Type of key/value pairs using key_type = Key; ///< Key type using mapped_type = Value; ///< Type of mapped values - using atomic_ctr_type = cuda::atomic; ///< Type of atomic counters - using view_type = typename static_map::device_view; ///< Device view type - using mutable_view_type = typename static_map::device_mutable_view; - ///< Device mutable view type + using atomic_ctr_type = cuda::atomic; ///< Atomic counter type + using view_type = + typename static_map::device_view; ///< Type for submap device view + using mutable_view_type = + typename static_map::device_mutable_view; ///< Type for submap mutable + ///< device view dynamic_map(dynamic_map const&) = delete; dynamic_map(dynamic_map&&) = delete; + dynamic_map& operator=(dynamic_map const&) = delete; dynamic_map& operator=(dynamic_map&&) = delete; /** - * @brief Construct a dynamically-sized map with the specified initial capacity, growth factor and - * sentinel values. + * @brief Constructs a dynamically-sized map with the specified initial capacity, growth factor + * and sentinel values. * * The capacity of the map will automatically increase as the user adds key/value pairs using * `insert`. 
@@ -133,17 +136,50 @@ class dynamic_map { * @param empty_key_sentinel The reserved key value for empty slots * @param empty_value_sentinel The reserved mapped value for empty slots * @param alloc Allocator used to allocate submap device storage + * @param stream Stream used for executing the kernels */ dynamic_map(std::size_t initial_capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - Allocator const& alloc = Allocator{}); + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + Allocator const& alloc = Allocator{}, + cudaStream_t stream = nullptr); /** - * @brief Destroy the map and frees its contents + * @brief Constructs a dynamically-sized map with erase capability. + * + * The capacity of the map will automatically increase as the user adds key/value pairs using + * `insert`. + * + * Capacity increases by a factor of growth_factor each time the size of the map exceeds a + * threshold occupancy. The performance of `find` and `contains` decreases somewhat each time the + * map's capacity grows. + * + * The `empty_key_sentinel` and `empty_value_sentinel` values are reserved and + * undefined behavior results from attempting to insert any key/value pair + * that contains either. 
* + * @param initial_capacity The initial number of slots in the map + * @param empty_key_sentinel The reserved key value for empty slots + * @param empty_value_sentinel The reserved mapped value for empty slots + * @param erased_key_sentinel The reserved key value for erased slots + * @param alloc Allocator used to allocate submap device storage + * @param stream Stream used for executing the kernels + * + * @throw std::runtime error if the empty key sentinel and erased key sentinel + * are the same value */ - ~dynamic_map(); + dynamic_map(std::size_t initial_capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel, + Allocator const& alloc = Allocator{}, + cudaStream_t stream = nullptr); + + /** + * @brief Destroys the map and frees its contents + * + */ + ~dynamic_map() {} /** * @brief Grows the capacity of the map so there is enough space for `n` key/value pairs. @@ -151,8 +187,9 @@ class dynamic_map { * If there is already enough space for `n` key/value pairs, the capacity remains the same. * * @param n The number of key value pairs for which there must be space + * @param stream Stream used for executing the kernels */ - void reserve(std::size_t n); + void reserve(std::size_t n, cudaStream_t stream = nullptr); /** * @brief Inserts all key/value pairs in the range `[first, last)`. 
@@ -168,11 +205,55 @@ class dynamic_map { * @param last End of the sequence of key/value pairs * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality + * @param stream Stream used for executing the kernels */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> - void insert(InputIt first, InputIt last, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}); + void insert(InputIt first, + InputIt last, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = nullptr); + + /** + * @brief Erases keys in the range `[first, last)`. + * + * For each key `k` in `[first, last)`, if `contains(k) == true), removes `k` and it's + * associated value from the map. Else, no effect. + * + * Side-effects: + * - `contains(k) == false` + * - `find(k) == end()` + * - `insert({k,v}) == true` + * - `get_size()` is reduced by the total number of erased keys + * + * This function synchronizes `stream`. + * + * Keep in mind that `erase` does not cause the map to shrink its memory allocation. 
+ * + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the map's `value_type` + * @tparam Hash Unary callable type + * @tparam KeyEqual Binary callable type + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param hash The unary function to apply to hash each key + * @param key_equal The binary function to compare two keys for equality + * @param stream Stream used for executing the kernels + * + * @throw std::runtime_error if a unique erased key sentinel value was not + * provided at construction + */ + template , + typename KeyEqual = thrust::equal_to> + void erase(InputIt first, + InputIt last, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = nullptr); /** * @brief Finds the values corresponding to all keys in the range `[first, last)`. @@ -186,21 +267,24 @@ class dynamic_map { * convertible to the map's `mapped_type` * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of values retrieved for each key * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality + * @param stream Stream used for executing the kernels */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void find(InputIt first, InputIt last, OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}); + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = nullptr); /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. 
@@ -213,21 +297,24 @@ class dynamic_map { * convertible to the map's `mapped_type` * @tparam Hash Unary callable type * @tparam KeyEqual Binary callable type + * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param output_begin Beginning of the sequence of booleans for the presence of each key * @param hash The unary function to apply to hash each key * @param key_equal The binary function to compare two keys for equality + * @param stream Stream used for executing the kernels */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void contains(InputIt first, InputIt last, OutputIt output_begin, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}); + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}, + cudaStream_t stream = nullptr); /** * @brief Gets the current number of elements in the map @@ -253,18 +340,22 @@ class dynamic_map { private: key_type empty_key_sentinel_{}; ///< Key value that represents an empty slot mapped_type empty_value_sentinel_{}; ///< Initial value of empty slot - std::size_t size_{}; ///< Number of keys in the map - std::size_t capacity_{}; ///< Maximum number of keys that can be inserted - float max_load_factor_{}; ///< Max load factor before capacity growth + key_type erased_key_sentinel_{}; ///< Key value that represents an erased slot + + // TODO: initialize this + std::size_t size_{}; ///< Number of keys in the map + std::size_t capacity_{}; ///< Maximum number of keys that can be inserted + float max_load_factor_{}; ///< Max load factor before capacity growth std::vector>> submaps_; ///< vector of pointers to each submap thrust::device_vector submap_views_; ///< vector of device views for each submap thrust::device_vector - submap_mutable_views_; ///< vector of mutable device views for each submap - std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert - atomic_ctr_type* num_successes_; ///< number of 
successfully inserted keys on insert - Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage + submap_mutable_views_; ///< vector of mutable device views for each submap + std::size_t min_insert_size_{}; ///< min remaining capacity of submap for insert + thrust::device_vector + submap_num_successes_; ///< Number of successfully erased keys for each submap + Allocator alloc_{}; ///< Allocator passed to submaps to allocate their device storage }; } // namespace cuco diff --git a/include/cuco/extent.cuh b/include/cuco/extent.cuh new file mode 100644 index 000000000..50e7ae4aa --- /dev/null +++ b/include/cuco/extent.cuh @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cuco { +namespace experimental { +static constexpr std::size_t dynamic_extent = static_cast(-1); + +/** + * @brief Static extent class. + * + * @tparam SizeType Size type + * @tparam N Extent + */ +template +struct extent { + using value_type = SizeType; ///< Extent value type + + constexpr extent() = default; + + /// Constructs from `SizeType` + __host__ __device__ constexpr extent(SizeType) noexcept {} + + /** + * @brief Conversion to value_type. + * + * @return Extent size + */ + __host__ __device__ constexpr operator value_type() const noexcept { return N; } +}; + +/** + * @brief Dynamic extent class. 
+ * + * @tparam SizeType Size type + */ +template +struct extent { + using value_type = SizeType; ///< Extent value type + + /** + * @brief Constructs extent from a given `size`. + * + * @param size The extent size + */ + __host__ __device__ constexpr extent(SizeType size) noexcept : value_{size} {} + + /** + * @brief Conversion to value_type. + * + * @return Extent size + */ + __host__ __device__ constexpr operator value_type() const noexcept { return value_; } + + private: + value_type value_; ///< Extent value +}; + +/** + * @brief Window extent strong type. + * + * @note This type is used internally and can only be constructed using the `make_window_extent` + * factory method. + * + * @tparam SizeType Size type + * @tparam N Extent + * + */ +template +struct window_extent; + +/** + * @brief Computes a valid window extent/capacity for a given container type. + * + * @note The actual capacity of a container (map/set) should be exclusively determined by the return + * value of this utility since the output depends on the requested low-bound size, the probing + * scheme, and the storage. This utility is used internally during container constructions while for + * container ref constructions, it would be users' responsibility to use this function to determine + * the capacity ctor argument for the container. + * + * @tparam Container Container type to compute the extent for + * @tparam SizeType Size type + * @tparam N Extent + * + * @param ext The input extent + * + * @throw If the input extent is invalid + * + * @return Resulting valid `window extent` + */ +template +[[nodiscard]] auto constexpr make_window_extent(extent ext); + +/** + * @brief Computes a valid capacity for a given container type. + * + * @note The actual capacity of a container (map/set) should be exclusively determined by the return + * value of this utility since the output depends on the requested low-bound size, the probing + * scheme, and the storage. 
This utility is used internally during container constructions while for + * container ref constructions, it would be users' responsibility to use this function to determine + * the capacity ctor argument for the container. + * + * @tparam Container Container type to compute the extent for + * @tparam SizeType Size type + * + * @param size The input size + * + * @throw If the input size is invalid + * + * @return Resulting valid extent + */ +template +[[nodiscard]] auto constexpr make_window_extent(SizeType size); + +/** + * @brief Computes valid window extent based on given parameters. + * + * @note The actual capacity of a container (map/set) should be exclusively determined by the return + * value of this utility since the output depends on the requested low-bound size, the probing + * scheme, and the storage. This utility is used internally during container constructions while for + * container ref constructions, it would be users' responsibility to use this function to determine + * the input size of the ref. + * + * @tparam CGSize Number of elements handled per CG + * @tparam WindowSize Number of elements handled per Window + * @tparam SizeType Size type + * @tparam N Extent + * + * @param ext The input extent + * + * @throw If the input extent is invalid + * + * @return Resulting valid extent + */ +template +[[nodiscard]] auto constexpr make_window_extent(extent ext); + +/** + * @brief Computes valid window extent/capacity based on given parameters. + * + * @note The actual capacity of a container (map/set) should be exclusively determined by the return + * value of this utility since the output depends on the requested low-bound size, the probing + * scheme, and the storage. This utility is used internally during container constructions while for + * container ref constructions, it would be users' responsibility to use this function to determine + * the capacity ctor argument for the container. 
+ * + * @tparam CGSize Number of elements handled per CG + * @tparam WindowSize Number of elements handled per Window + * @tparam SizeType Size type + * + * @param size The input size + * + * @throw If the input size is invalid + * + * @return Resulting valid extent + */ +template +[[nodiscard]] auto constexpr make_window_extent(SizeType size); + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/hash_functions.cuh b/include/cuco/hash_functions.cuh new file mode 100644 index 000000000..000f46fef --- /dev/null +++ b/include/cuco/hash_functions.cuh @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cuco { + +/** + * @brief The 32-bit integer finalizer function of `MurmurHash3` to hash the given argument on host + * and device. + * + * @throw Key type must be 4 bytes in size + * + * @tparam Key The type of the values to hash + */ +template +using murmurhash3_fmix_32 = detail::MurmurHash3_fmix32; + +/** + * @brief The 64-bit integer finalizer function of `MurmurHash3` to hash the given argument on host + * and device. + * + * @throw Key type must be 8 bytes in size + * + * @tparam Key The type of the values to hash + */ +template +using murmurhash3_fmix_64 = detail::MurmurHash3_fmix64; + +/** + * @brief A 32-bit `MurmurHash3` hash function to hash the given argument on host and device. 
+ * + * @tparam Key The type of the values to hash + */ +template +using murmurhash3_32 = detail::MurmurHash3_32; + +/** + * @brief A 32-bit `XXH32` hash function to hash the given argument on host and device. + * + * @tparam Key The type of the values to hash + */ +template +using xxhash_32 = detail::XXHash_32; + +/** + * @brief A 64-bit `XXH64` hash function to hash the given argument on host and device. + * + * @tparam Key The type of the values to hash + */ +template +using xxhash_64 = detail::XXHash_64; + +/** + * @brief Default hash function. + * + * @tparam Key The type of the values to hash + */ +template +using default_hash_function = xxhash_32; + +} // namespace cuco diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp new file mode 100644 index 000000000..77cf2c133 --- /dev/null +++ b/include/cuco/operator.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +namespace cuco { +namespace experimental { +inline namespace op { +// TODO enum class of int32_t instead of struct +// https://github.com/NVIDIA/cuCollections/issues/239 +/** + * @brief `insert` operator tag + */ +struct insert_tag { +} inline constexpr insert; + +/** + * @brief `insert_and_find` operator tag + */ +struct insert_and_find_tag { +} inline constexpr insert_and_find; + +/** + * @brief `insert_or_assign` operator tag + */ +struct insert_or_assign_tag { +} inline constexpr insert_or_assign; + +/** + * @brief `contains` operator tag + */ +struct contains_tag { +} inline constexpr contains; + +/** + * @brief `find` operator tag + */ +struct find_tag { +} inline constexpr find; + +} // namespace op +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/pair.cuh b/include/cuco/pair.cuh new file mode 100644 index 000000000..0a804cc04 --- /dev/null +++ b/include/cuco/pair.cuh @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include +#include + +namespace cuco { + +/** + * @brief Custom pair type + * + * @note This is necessary because `thrust::pair` is under aligned. 
+ * + * @tparam First Type of the first value in the pair + * @tparam Second Type of the second value in the pair + */ +template +struct alignas(detail::pair_alignment()) pair { + using first_type = First; ///< Type of the first value in the pair + using second_type = Second; ///< Type of the second value in the pair + + pair() = default; + ~pair() = default; + pair(pair const&) = default; ///< Copy constructor + pair(pair&&) = default; ///< Move constructor + + /** + * @brief Replaces the contents of the pair with another pair. + * + * @return Reference of the current pair object + */ + pair& operator=(pair const&) = default; + + /** + * @brief Replaces the contents of the pair with another pair. + * + * @return Reference of the current pair object + */ + pair& operator=(pair&&) = default; + + /** + * @brief Constructs a pair from objects `f` and `s`. + * + * @param f The object to copy into `first` + * @param s The object to copy into `second` + */ + __host__ __device__ constexpr pair(First const& f, Second const& s); + + /** + * @brief Constructs a pair by copying from the given pair `p`. + * + * @tparam F Type of the first value of `p` + * @tparam S Type of the second value of `p` + * + * @param p The pair to copy from + */ + template + __host__ __device__ constexpr pair(pair const& p); + + /** + * @brief Constructs a pair from the given std::pair-like `p`. + * + * @tparam T Type of the pair to copy from + * + * @param p The input pair to copy from + */ + template ::value>* = nullptr> + __host__ __device__ constexpr pair(T const& p) + : pair{std::get<0>(thrust::raw_reference_cast(p)), std::get<1>(thrust::raw_reference_cast(p))} + { + } + + /** + * @brief Constructs a pair from the given thrust::pair-like `p`. 
+ * + * @tparam T Type of the pair to copy from + * + * @param p The input pair to copy from + */ + template ::value>* = nullptr> + __host__ __device__ constexpr pair(T const& p) + : pair{thrust::get<0>(thrust::raw_reference_cast(p)), + thrust::get<1>(thrust::raw_reference_cast(p))} + { + } + + First first; ///< The first value in the pair + Second second; ///< The second value in the pair +}; + +/** + * @brief Creates a pair with the given first and second elements + * + * @tparam F Type of first element + * @tparam S Type of second element + * + * @param f First element + * @param s Second element + * + * @return A pair with first element `f` and second element `s`. + */ +template +__host__ __device__ constexpr pair, std::decay_t> make_pair(F&& f, + S&& s) noexcept; + +/** + * @brief Tests if both elements of lhs and rhs are equal + * + * @tparam T1 Type of the first element of the left-hand side pair + * @tparam T2 Type of the second element of the left-hand side pair + * @tparam U1 Type of the first element of the right-hand side pair + * @tparam U2 Type of the second element of the right-hand side pair + * + * @param lhs Left-hand side pair + * @param rhs Right-hand side pair + * + * @return True if two pairs are equal. 
False otherwise + */ +template +__host__ __device__ constexpr bool operator==(cuco::pair const& lhs, + cuco::pair const& rhs) noexcept; + +} // namespace cuco + +#include diff --git a/include/cuco/probe_sequences.cuh b/include/cuco/probe_sequences.cuh index 071b0921e..7921b6629 100644 --- a/include/cuco/probe_sequences.cuh +++ b/include/cuco/probe_sequences.cuh @@ -60,7 +60,7 @@ class linear_probing : public detail::probe_sequence_base { * @tparam Hash1 Unary callable type * @tparam Hash2 Unary callable type */ -template +template class double_hashing : public detail::probe_sequence_base { public: using probe_sequence_base_type = diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh new file mode 100644 index 000000000..039433cef --- /dev/null +++ b/include/cuco/probing_scheme.cuh @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cuco { +namespace experimental { +/** + * @brief Public linear probing scheme class. + * + * @note Linear probing is efficient when few collisions are present, e.g., low occupancy or low + * multiplicity. + * + * @note `Hash` should be callable object type. 
+ * + * @tparam CGSize Size of CUDA Cooperative Groups + * @tparam Hash Unary callable type + */ +template +class linear_probing : private detail::probing_scheme_base { + public: + using probing_scheme_base_type = + detail::probing_scheme_base; ///< The base probe scheme type + using probing_scheme_base_type::cg_size; + + /** + *@brief Constructs linear probing scheme with the hasher callable. + * + * @param hash Hasher + */ + __host__ __device__ constexpr linear_probing(Hash const& hash = {}); + + /** + * @brief Operator to return a probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()(ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + /** + * @brief Operator to return a CG-based probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param g the Cooperative Group to generate probing iterator + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + private: + Hash hash_; +}; + +/** + * @brief Public double hashing scheme class. + * + * @note Default probing scheme for cuco data structures. It shows superior performance over linear + * probing especially when dealing with high multiplicity and/or high occupancy use cases. + * + * @note `Hash1` and `Hash2` should be callable object types. + * + * @note `Hash2` needs to be able to construct from an integer value to avoid secondary clustering. 
+ * + * @tparam CGSize Size of CUDA Cooperative Groups + * @tparam Hash1 Unary callable type + * @tparam Hash2 Unary callable type + */ +template +class double_hashing : private detail::probing_scheme_base { + public: + using probing_scheme_base_type = + detail::probing_scheme_base; ///< The base probe scheme type + using probing_scheme_base_type::cg_size; + + /** + *@brief Constructs double hashing probing scheme with the two hasher callables. + * + * @param hash1 First hasher + * @param hash2 Second hasher + */ + __host__ __device__ constexpr double_hashing(Hash1 const& hash1 = {}, Hash2 const& hash2 = {1}); + + /** + * @brief Operator to return a probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()(ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + /** + * @brief Operator to return a CG-based probing iterator + * + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param g the Cooperative Group to generate probing iterator + * @param probe_key The probing key + * @param upper_bound Upper bound of the iteration + * @return An iterator whose value_type is convertible to slot index type + */ + template + __host__ __device__ constexpr auto operator()( + cooperative_groups::thread_block_tile const& g, + ProbeKey const& probe_key, + Extent upper_bound) const noexcept; + + private: + Hash1 hash1_; + Hash2 hash2_; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/sentinel.cuh b/include/cuco/sentinel.cuh index 58317d179..a440e5b2c 100644 --- a/include/cuco/sentinel.cuh +++ b/include/cuco/sentinel.cuh @@ -16,22 +16,24 @@ #pragma once +#include + namespace cuco { -namespace sentinel { +inline namespace sentinel 
{ + /** * @brief A strong type wrapper used to denote the empty key sentinel. * * @tparam T Type of the key values */ template -struct empty_key { +struct empty_key : public cuco::detail::strong_type { /** * @brief Constructs an empty key sentinel with the given `v`. * * @param v The empty key sentinel value */ - __host__ __device__ explicit constexpr empty_key(T v) : value{v} {} - T value; ///< Empty key sentinel + __host__ __device__ explicit constexpr empty_key(T v) : cuco::detail::strong_type(v) {} }; /** @@ -40,14 +42,13 @@ struct empty_key { * @tparam T Type of the mapped values */ template -struct empty_value { +struct empty_value : public cuco::detail::strong_type { /** * @brief Constructs an empty value sentinel with the given `v`. * * @param v The empty value sentinel value */ - __host__ __device__ explicit constexpr empty_value(T v) : value{v} {} - T value; ///< Empty value sentinel + __host__ __device__ explicit constexpr empty_value(T v) : cuco::detail::strong_type(v) {} }; /** @@ -56,14 +57,13 @@ struct empty_value { * @tparam T Type of the key values */ template -struct erased_key { +struct erased_key : public cuco::detail::strong_type { /** * @brief Constructs an erased key sentinel with the given `v`. * * @param v The erased key sentinel value */ - __host__ __device__ explicit constexpr erased_key(T v) : value{v} {} - T value; ///< Erased key sentinel + __host__ __device__ explicit constexpr erased_key(T v) : cuco::detail::strong_type(v) {} }; } // namespace sentinel diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 361e97d37..825f88ab7 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,14 +16,16 @@ #pragma once -#include +#include #include -#include -#include -#include +#include #include +#include +#include #include -#include +#include +#include +#include #include @@ -38,6 +40,512 @@ #include namespace cuco { +namespace experimental { +/** + * @brief A GPU-accelerated, unordered, associative container of key-value pairs with unique keys. + * + * The `static_map` supports two types of operations: + * - Host-side "bulk" operations + * - Device-side "singular" operations + * + * The host-side bulk operations include `insert`, `contains`, etc. These APIs should be used when + * there are a large number of keys to modify or lookup. For example, given a range of keys + * specified by device-accessible iterators, the bulk `insert` function will insert all keys into + * the map. + * + * The singular device-side operations allow individual threads (or cooperative groups) to perform + * independent modify or lookup operations from device code. These operations are accessed through + * non-owning, trivially copyable reference types (or "ref"). User can combine any arbitrary + * operators (see options in `include/cuco/operator.hpp`) when creating the ref. Concurrent modify + * and lookup will be supported if both kinds of operators are specified during the ref + * construction. + * + * @note Allows constant time concurrent modify or lookup operations from threads in device code. + * @note cuCollections data structures always place the slot keys on the left-hand side when + * invoking the key comparison predicate, i.e., `pred(slot_key, query_key)`. Order-sensitive + * `KeyEqual` should be used with caution. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. 
+ * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the size of the given payload type is larger than 8 bytes + * @throw If the size of the given slot type is larger than 16 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the given mapped type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` + * @tparam T Type of the mapped values + * @tparam Extent Data structure size type + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for choices) + * @tparam Allocator Type of allocator used for device storage + * @tparam Storage Slot window storage type + */ + +template , + cuda::thread_scope Scope = cuda::thread_scope_device, + class KeyEqual = thrust::equal_to, + class ProbingScheme = + cuco::experimental::double_hashing<4, // CG size + cuco::default_hash_function>, + class Allocator = cuco::cuda_allocator>, + class Storage = cuco::experimental::storage<1>> +class static_map { + static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + + static_assert(sizeof(T) <= 8, "Container does not support payload types larger than 8 bytes."); + + static_assert(cuco::is_bitwise_comparable_v, + "Mapped type must have unique object representations or have been explicitly " + "declared as safe for bitwise comparison via specialization of " + "cuco::is_bitwise_comparable_v."); + + using impl_type = detail::open_addressing_impl, + Extent, + Scope, + KeyEqual, + ProbingScheme, + Allocator, + Storage>; + + public: + 
static constexpr auto cg_size = impl_type::cg_size; ///< CG size used for probing + static constexpr auto window_size = impl_type::window_size; ///< Window size used for probing + static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope + + using key_type = typename impl_type::key_type; ///< Key type + using value_type = typename impl_type::value_type; ///< Key-value pair type + using extent_type = typename impl_type::extent_type; ///< Extent type + using size_type = typename impl_type::size_type; ///< Size type + using key_equal = typename impl_type::key_equal; ///< Key equality comparator type + using allocator_type = typename impl_type::allocator_type; ///< Allocator type + /// Non-owning window storage ref type + using storage_ref_type = typename impl_type::storage_ref_type; + using probing_scheme_type = typename impl_type::probing_scheme_type; ///< Probing scheme type + + using mapped_type = T; ///< Payload type + template + using ref_type = + cuco::experimental::static_map_ref; ///< Non-owning container ref type + + static_map(static_map const&) = delete; + static_map& operator=(static_map const&) = delete; + + static_map(static_map&&) = default; ///< Move constructor + + /** + * @brief Replaces the contents of the container with another container. + * + * @return Reference of the current map object + */ + static_map& operator=(static_map&&) = default; + ~static_map() = default; + + /** + * @brief Constructs a statically-sized map with the specified initial capacity, sentinel values + * and CUDA stream. + * + * The actual map capacity depends on the given `capacity`, the probing scheme, CG size, and the + * window size and it is computed via the `make_window_extent` factory. Insert operations will not + * automatically grow the map. Attempting to insert more unique keys than the capacity of the map + * results in undefined behavior. 
+ * + * @note Any `*_sentinel`s are reserved and behavior is undefined when attempting to insert + * this sentinel value. + * @note If a non-default CUDA stream is provided, the caller is responsible for synchronizing the + * stream before the object is first used. + * + * @param capacity The requested lower-bound map size + * @param empty_key_sentinel The reserved key value for empty slots + * @param empty_value_sentinel The reserved mapped value for empty slots + * @param pred Key equality binary predicate + * @param probing_scheme Probing scheme + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the map + */ + constexpr static_map(Extent capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + KeyEqual const& pred = {}, + ProbingScheme const& probing_scheme = {}, + Allocator const& alloc = {}, + cuda_stream_ref stream = {}); + + /** + * @brief Erases all elements from the container. After this call, `size()` returns zero. + * Invalidates any references, pointers, or iterators referring to contained elements. + * + * @param stream CUDA stream this operation is executed in + */ + void clear(cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Asynchronously erases all elements from the container. After this call, `size()` returns + * zero. Invalidates any references, pointers, or iterators referring to contained elements. + * + * @param stream CUDA stream this operation is executed in + */ + void clear_async(cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Inserts all keys in the range `[first, last)` and returns the number of successful + * insertions. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `insert_async`. 
+ * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_map::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + * + * @return Number of successful insertions + */ + template + size_type insert(InputIt first, InputIt last, cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously inserts all keys in the range `[first, last)`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_map::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns + * true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * @note This function synchronizes the given stream and returns the number of successful + * insertions. For asynchronous execution use `insert_if_async`. 
+ * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + * + * @return Number of successful insertions + */ + template + size_type insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding + * stencil returns true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. 
+ * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + */ + template + void insert_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda_stream_ref stream = {}) noexcept; + + /** + * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` + * already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`. + * If the key does not exist, inserts the pair as if by insert. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `insert_or_assign_async`. + * @note If multiple pairs in `[first, last)` compare equal, it is unspecified which pair is + * inserted or assigned. 
+ * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_map::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + + /** + * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` + * already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`. + * If the key does not exist, inserts the pair as if by insert. + * + * @note If multiple pairs in `[first, last)` compare equal, it is unspecified which pair is + * inserted or assigned. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_map::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_async`. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains(InputIt first, + InputIt last, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in + * the map. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the map if + * `pred` of the corresponding stencil returns true. + * + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present in the map. If `pred( *(stencil + i) )` is + * false, stores false to `(output_begin + i)`. + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_if_async`. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in + * the map if `pred` of the corresponding stencil returns true. + * + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present in the map. If `pred( *(stencil + i) )` is + * false, stores false to `(output_begin + i)`. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief For all keys in the range `[first, last)`, finds a payload with its key equivalent to + * the query key. + * + * @note This function synchronizes the given stream. For asynchronous execution use `find_async`. + * @note If the key `*(first + i)` has a matched `element` in the map, copies the payload of + * `element` to + * `(output_begin + i)`. Else, copies the empty value sentinel. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of payloads retrieved for each key + * @param stream Stream used for executing the kernels + */ + template + void find(InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream = {}) const; + + /** + * @brief For all keys in the range `[first, last)`, asynchronously finds a payload with its key + * equivalent to the query key. + * + * @note If the key `*(first + i)` has a matched `element` in the map, copies the payload of + * `element` to + * `(output_begin + i)`. Else, copies the empty value sentinel. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of payloads retrieved for each key + * @param stream Stream used for executing the kernels + */ + template + void find_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Retrieves all of the keys and their associated values. + * + * @note This API synchronizes the given stream. + * @note The order in which keys are returned is implementation defined and not guaranteed to be + * consistent between subsequent calls to `retrieve_all`. + * @note Behavior is undefined if the range beginning at `keys_out` or `values_out` is smaller + * than the return value of `size()`. + * + * @tparam KeyOut Device accessible random access output iterator whose `value_type` is + * convertible from `key_type`. 
 + * @tparam ValueOut Device accessible random access output iterator whose `value_type` is + * convertible from `mapped_type`. + * + * @param keys_out Beginning output iterator for keys + * @param values_out Beginning output iterator for associated values + * @param stream CUDA stream used for this operation + * + * @return Pair of iterators indicating the last elements in the output + */ + template + std::pair retrieve_all(KeyOut keys_out, + ValueOut values_out, + cuda_stream_ref stream = {}) const; + + /** + * @brief Gets the number of elements in the container. + * + * @note This function synchronizes the given stream. + * + * @param stream CUDA stream used to get the number of inserted elements + * @return The number of elements in the container + */ + [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief Gets the maximum number of elements the hash map can hold. + * + * @return The maximum number of elements the hash map can hold + */ + [[nodiscard]] constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty value slot. + * + * @return The sentinel value used to represent an empty value slot + */ + [[nodiscard]] constexpr mapped_type empty_value_sentinel() const noexcept; + + /** + * @brief Get device ref with operators. + * + * @tparam Operators Set of `cuco::op` to be provided by the ref + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return Device ref of the current `static_map` object + */ + template + [[nodiscard]] auto ref(Operators... 
ops) const noexcept; + + private: + std::unique_ptr impl_; ///< Static map implementation + mapped_type empty_value_sentinel_; ///< Sentinel value that indicates an empty payload +}; +} // namespace experimental template class dynamic_map; @@ -53,7 +561,6 @@ class dynamic_map; * Current limitations: * - Requires keys and values that where `cuco::is_bitwise_comparable_v` is true * - Comparisons against the "sentinel" values will always be done with bitwise comparisons. - * - Does not support erasing keys * - Capacity is fixed and will not grow automatically * - Requires the user to specify sentinel values for both key and mapped value to indicate empty * slots @@ -137,14 +644,14 @@ class static_map { friend class dynamic_map; ///< Dynamic map as friend class public: - using value_type = cuco::pair_type; ///< Type of key/value pairs + using value_type = cuco::pair; ///< Type of key/value pairs using key_type = Key; ///< Key type using mapped_type = Value; ///< Type of mapped values using atomic_key_type = cuda::atomic; ///< Type of atomic keys using atomic_mapped_type = cuda::atomic; ///< Type of atomic mapped values using pair_atomic_type = - cuco::pair_type; ///< Pair type of atomic key and atomic mapped value + cuco::pair; ///< Pair type of atomic key and atomic mapped value using slot_type = pair_atomic_type; ///< Type of hash map slots using atomic_ctr_type = cuda::atomic; ///< Atomic counter type using allocator_type = Allocator; ///< Allocator type @@ -200,8 +707,8 @@ class static_map { * @param stream Stream used for executing the kernels */ static_map(std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, Allocator const& alloc = Allocator{}, cudaStream_t stream = 0); @@ -220,9 +727,9 @@ class static_map { * @param stream Stream used for executing the kernels */ static_map(std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - 
sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel, Allocator const& alloc = Allocator{}, cudaStream_t stream = 0); @@ -253,7 +760,7 @@ class static_map { * @param stream Stream used for executing the kernels */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void insert(InputIt first, InputIt last, @@ -287,7 +794,7 @@ class static_map { template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void insert_if(InputIt first, InputIt last, @@ -325,7 +832,7 @@ class static_map { * provided at construction */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void erase(InputIt first, InputIt last, @@ -354,7 +861,7 @@ class static_map { */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void find(InputIt first, InputIt last, @@ -384,7 +891,7 @@ class static_map { template std::pair retrieve_all(KeyOut keys_out, ValueOut values_out, - cudaStream_t stream = 0); + cudaStream_t stream = 0) const; /** * @brief Indicates whether the keys in the range `[first, last)` are contained in the map. 
@@ -409,7 +916,7 @@ class static_map { */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> void contains(InputIt first, InputIt last, @@ -437,8 +944,8 @@ class static_map { __host__ __device__ device_view_base(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : slots_{slots}, capacity_{capacity}, empty_key_sentinel_{empty_key_sentinel.value}, @@ -449,9 +956,9 @@ class static_map { __host__ __device__ device_view_base(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel) noexcept : slots_{slots}, capacity_{capacity}, empty_key_sentinel_{empty_key_sentinel.value}, @@ -770,11 +1277,10 @@ class static_map { * @param empty_value_sentinel The reserved value for mapped values to * represent empty slots */ - __host__ __device__ - device_mutable_view(pair_atomic_type* slots, - std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + __host__ __device__ device_mutable_view(pair_atomic_type* slots, + std::size_t capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : device_view_base{slots, capacity, empty_key_sentinel, empty_value_sentinel} { } @@ -791,9 +1297,9 @@ class static_map { */ __host__ __device__ device_mutable_view(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel) noexcept : 
device_view_base{ slots, capacity, empty_key_sentinel, empty_value_sentinel, erased_key_sentinel} { @@ -880,8 +1386,8 @@ class static_map { CG const& g, pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept { device_view_base::initialize_slots( g, slots, capacity, empty_key_sentinel.value, empty_value_sentinel.value); @@ -889,7 +1395,7 @@ class static_map { capacity, empty_key_sentinel, empty_value_sentinel, - sentinel::erased_key{empty_key_sentinel.value}}; + erased_key{empty_key_sentinel.value}}; } /** @@ -911,9 +1417,9 @@ class static_map { CG const& g, pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel) noexcept { device_view_base::initialize_slots( g, slots, capacity, empty_key_sentinel, empty_value_sentinel); @@ -932,7 +1438,7 @@ class static_map { * equality * @return `true` if the insert was successful, `false` otherwise. */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ bool insert(value_type const& insert_pair, Hash hash = Hash{}, @@ -963,7 +1469,7 @@ class static_map { * @return a pair consisting of an iterator to the element and a bool, * either `true` if the insert was successful, `false` otherwise. */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ thrust::pair insert_and_find( value_type const& insert_pair, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; @@ -988,7 +1494,7 @@ class static_map { * @return `true` if the insert was successful, `false` otherwise. 
*/ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ bool insert(CG const& g, value_type const& insert_pair, @@ -1009,7 +1515,7 @@ class static_map { * equality * @return `true` if the erasure was successful, `false` otherwise. */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ bool erase(key_type const& k, Hash hash = Hash{}, @@ -1032,7 +1538,7 @@ class static_map { * @return `true` if the erasure was successful, `false` otherwise. */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ bool erase(CG const& g, key_type const& k, @@ -1072,8 +1578,8 @@ class static_map { */ __host__ __device__ device_view(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : device_view_base{slots, capacity, empty_key_sentinel, empty_value_sentinel} { } @@ -1090,9 +1596,9 @@ class static_map { */ __host__ __device__ device_view(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, - sentinel::erased_key erased_key_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, + erased_key erased_key_sentinel) noexcept : device_view_base{ slots, capacity, empty_key_sentinel, empty_value_sentinel, erased_key_sentinel} { @@ -1106,9 +1612,9 @@ class static_map { __host__ __device__ explicit device_view(device_mutable_view mutable_map) : device_view_base{mutable_map.get_slots(), mutable_map.get_capacity(), - sentinel::empty_key{mutable_map.get_empty_key_sentinel()}, - sentinel::empty_value{mutable_map.get_empty_value_sentinel()}, - sentinel::erased_key{mutable_map.get_erased_key_sentinel()}} + empty_key{mutable_map.get_empty_key_sentinel()}, + 
empty_value{mutable_map.get_empty_value_sentinel()}, + erased_key{mutable_map.get_erased_key_sentinel()}} { } @@ -1177,12 +1683,11 @@ class static_map { g.sync(); #endif - return device_view( - memory_to_use, - source_device_view.get_capacity(), - sentinel::empty_key{source_device_view.get_empty_key_sentinel()}, - sentinel::empty_value{source_device_view.get_empty_value_sentinel()}, - sentinel::erased_key{source_device_view.get_erased_key_sentinel()}); + return device_view(memory_to_use, + source_device_view.get_capacity(), + empty_key{source_device_view.get_empty_key_sentinel()}, + empty_value{source_device_view.get_empty_value_sentinel()}, + erased_key{source_device_view.get_erased_key_sentinel()}); } /** @@ -1200,7 +1705,7 @@ class static_map { * @return An iterator to the position at which the key/value pair * containing `k` was inserted */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ iterator find(Key const& k, Hash hash = Hash{}, @@ -1220,7 +1725,7 @@ class static_map { * @return An iterator to the position at which the key/value pair * containing `k` was inserted */ - template , + template , typename KeyEqual = thrust::equal_to> __device__ const_iterator find(Key const& k, Hash hash = Hash{}, @@ -1247,7 +1752,7 @@ class static_map { * containing `k` was inserted */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ iterator find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; @@ -1273,7 +1778,7 @@ class static_map { * containing `k` was inserted */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ const_iterator find(CG g, Key const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) const noexcept; @@ -1302,7 +1807,7 @@ class static_map { * containing `k` was inserted */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ 
bool contains(ProbeKey const& k, Hash hash = Hash{}, @@ -1337,7 +1842,7 @@ class static_map { */ template , + typename Hash = cuco::default_hash_function, typename KeyEqual = thrust::equal_to> __device__ std::enable_if_t, bool> contains( CG const& g, @@ -1397,9 +1902,9 @@ class static_map { { return device_view(slots_, capacity_, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}, - sentinel::erased_key{erased_key_sentinel_}); + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}, + erased_key{erased_key_sentinel_}); } /** @@ -1411,13 +1916,13 @@ class static_map { { return device_mutable_view(slots_, capacity_, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}, - sentinel::erased_key{erased_key_sentinel_}); + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}, + erased_key{erased_key_sentinel_}); } private: - pair_atomic_type* slots_{nullptr}; ///< Pointer to flat slots storage + pair_atomic_type* slots_{}; ///< Pointer to flat slots storage std::size_t capacity_{}; ///< Total number of slots std::size_t size_{}; ///< Number of keys in map Key empty_key_sentinel_{}; ///< Key value that represents an empty slot @@ -1430,3 +1935,4 @@ class static_map { } // namespace cuco #include +#include diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh new file mode 100644 index 000000000..c41ed88f3 --- /dev/null +++ b/include/cuco/static_map_ref.cuh @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace cuco { +namespace experimental { + +/** + * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary + * operations defined in `include/cuco/operator.hpp` + * + * @note Concurrent modify and lookup will be supported if both kinds of operators are specified + * during the ref construction. + * @note cuCollections data structures always place the slot keys on the left-hand + * side when invoking the key comparison predicate. + * @note Ref types are trivially-copyable and are intended to be passed by value. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the size of the given payload type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the given payload type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam T Type used for mapped values. 
Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for options) + * @tparam StorageRef Storage ref type + * @tparam Operators Device operator options defined in `include/cuco/operator.hpp` + */ +template +class static_map_ref + : public detail::operator_impl< + Operators, + static_map_ref>... { + using impl_type = detail::open_addressing_ref_impl; + + static_assert(sizeof(T) <= 8, "Container does not support payload types larger than 8 bytes."); + + static_assert( + cuco::is_bitwise_comparable_v, + "Key type must have unique object representations or have been explicitly declared as safe for " + "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); + + public: + using key_type = Key; ///< Key type + using mapped_type = T; ///< Mapped type + using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using storage_ref_type = StorageRef; ///< Type of storage ref + using window_type = typename storage_ref_type::window_type; ///< Window type + using value_type = typename storage_ref_type::value_type; ///< Storage element type + using extent_type = typename storage_ref_type::extent_type; ///< Extent type + using size_type = typename storage_ref_type::size_type; ///< Probing scheme size type + using key_equal = KeyEqual; ///< Type of key equality binary callable + using iterator = typename storage_ref_type::iterator; ///< Slot iterator type + using const_iterator = typename storage_ref_type::const_iterator; ///< Const slot iterator type + + static constexpr auto cg_size = probing_scheme_type::cg_size; ///< Cooperative group size + static constexpr auto window_size = + storage_ref_type::window_size; ///< Number of elements handled per window + + /** + * @brief Constructs static_map_ref. 
+ * + * @param empty_key_sentinel Sentinel indicating empty key + * @param empty_value_sentinel Sentinel indicating empty payload + * @param predicate Key equality binary callable + * @param probing_scheme Probing scheme + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr static_map_ref( + cuco::empty_key empty_key_sentinel, + cuco::empty_value empty_value_sentinel, + key_equal const& predicate, + probing_scheme_type const& probing_scheme, + storage_ref_type storage_ref) noexcept; + + /** + * @brief Operator-agnostic move constructor. + * + * @tparam OtherOperators Operator set of the `other` object + * + * @param other Object to construct `*this` from + */ + template + __host__ __device__ explicit constexpr static_map_ref( + static_map_ref&& + other) noexcept; + + /** + * @brief Gets the maximum number of elements the container can hold. + * + * @return The maximum number of elements the container can hold + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] __host__ __device__ constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty payload slot. + * + * @return The sentinel value used to represent an empty payload slot + */ + [[nodiscard]] __host__ __device__ constexpr mapped_type empty_value_sentinel() const noexcept; + + /** + * @brief Creates a reference with new operators from the current object. + * + * Note that this function uses move semantics and thus invalidates the current object. + * + * @warning Using two or more reference objects to the same container but with + * a different operator set at the same time results in undefined behavior.
+ * + * @tparam NewOperators List of `cuco::op::*_tag` types + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return `*this` with `NewOperators...` + */ + template + [[nodiscard]] __host__ __device__ auto with(NewOperators... ops) && noexcept; + + private: + struct predicate_wrapper; + + impl_type impl_; ///< Static map ref implementation + predicate_wrapper predicate_; ///< Key equality binary callable + mapped_type empty_value_sentinel_; ///< Empty value sentinel + + // Mixins need to be friends with this class in order to access private members + template + friend class detail::operator_impl; + + // Refs with other operator sets need to be friends too + template + friend class static_map_ref; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index ef43b2175..9e2a2e280 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,13 +16,13 @@ #pragma once -#include #include -#include #include +#include #include #include -#include +#include +#include #include @@ -130,8 +130,7 @@ template , - class ProbeSequence = - cuco::double_hashing<8, detail::MurmurHash3_32, detail::MurmurHash3_32>> + class ProbeSequence = cuco::double_hashing<8, cuco::default_hash_function>> class static_multimap { static_assert( cuco::is_bitwise_comparable_v, @@ -149,14 +148,14 @@ class static_multimap { "cuco::linear_probing."); public: - using value_type = cuco::pair_type; ///< Type of key/value pairs + using value_type = cuco::pair; ///< Type of key/value pairs using key_type = Key; ///< Key type using mapped_type = Value; ///< Type of mapped values using atomic_key_type = cuda::atomic; ///< Type of atomic keys using atomic_mapped_type = cuda::atomic; ///< Type of atomic mapped values using pair_atomic_type = - cuco::pair_type; ///< Pair type of atomic key and atomic mapped value + cuco::pair; ///< Pair type of atomic key and atomic mapped value using atomic_ctr_type = cuda::atomic; ///< Atomic counter type using allocator_type = Allocator; ///< Allocator type using slot_allocator_type = typename std::allocator_traits::rebind_alloc< @@ -224,8 +223,8 @@ class static_multimap { * @param alloc Allocator used for allocating device storage */ static_multimap(std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel, cudaStream_t stream = 0, Allocator const& alloc = Allocator{}); @@ -610,8 +609,8 @@ class static_multimap { __host__ __device__ device_view_base(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : impl_{slots, capacity, empty_key_sentinel.value, empty_value_sentinel.value} { } @@ -713,11 +712,10 @@ class 
static_multimap { * @param empty_value_sentinel The reserved value for mapped values to * represent empty slots */ - __host__ __device__ - device_mutable_view(pair_atomic_type* slots, - std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + __host__ __device__ device_mutable_view(pair_atomic_type* slots, + std::size_t capacity, + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : view_base_type{slots, capacity, empty_key_sentinel, empty_value_sentinel} { } @@ -769,8 +767,8 @@ class static_multimap { */ __host__ __device__ device_view(pair_atomic_type* slots, std::size_t capacity, - sentinel::empty_key empty_key_sentinel, - sentinel::empty_value empty_value_sentinel) noexcept + empty_key empty_key_sentinel, + empty_value empty_value_sentinel) noexcept : view_base_type{slots, capacity, empty_key_sentinel, empty_value_sentinel} { } @@ -1324,8 +1322,8 @@ class static_multimap { { return device_view(slots_.get(), capacity_, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}); + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}); } /** @@ -1338,8 +1336,8 @@ class static_multimap { { return device_mutable_view(slots_.get(), capacity_, - sentinel::empty_key{empty_key_sentinel_}, - sentinel::empty_value{empty_value_sentinel_}); + empty_key{empty_key_sentinel_}, + empty_value{empty_value_sentinel_}); } private: diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh new file mode 100644 index 000000000..613a99bd4 --- /dev/null +++ b/include/cuco/static_set.cuh @@ -0,0 +1,469 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#if defined(CUCO_HAS_CUDA_BARRIER) +#include +#endif + +#include +#include + +namespace cuco { +namespace experimental { +/** + * @brief A GPU-accelerated, unordered, associative container of unique keys. + * + * The `static_set` supports two types of operations: + * - Host-side "bulk" operations + * - Device-side "singular" operations + * + * The host-side bulk operations include `insert`, `contains`, etc. These APIs should be used when + * there are a large number of keys to modify or lookup. For example, given a range of keys + * specified by device-accessible iterators, the bulk `insert` function will insert all keys into + * the set. + * + * The singular device-side operations allow individual threads (or cooperative groups) to perform + * independent modify or lookup operations from device code. These operations are accessed through + * non-owning, trivially copyable reference types (or "ref"). User can combine any arbitrary + * operators (see options in `include/cuco/operator.hpp`) when creating the ref. Concurrent modify + * and lookup will be supported if both kinds of operators are specified during the ref + * construction. + * + * @note Allows constant time concurrent modify or lookup operations from threads in device code. 
+ * @note cuCollections data structures always place the slot keys on the left-hand side when + * invoking the key comparison predicate, i.e., `pred(slot_key, query_key)`. Order-sensitive + * `KeyEqual` should be used with caution. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` + * @tparam Extent Data structure size type + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for choices) + * @tparam Allocator Type of allocator used for device storage + * @tparam Storage Slot window storage type + */ + +template , + cuda::thread_scope Scope = cuda::thread_scope_device, + class KeyEqual = thrust::equal_to, + class ProbingScheme = experimental::double_hashing<4, // CG size + cuco::default_hash_function>, + class Allocator = cuco::cuda_allocator, + class Storage = cuco::experimental::storage<1>> +class static_set { + using impl_type = detail:: + open_addressing_impl; + + public: + static constexpr auto cg_size = impl_type::cg_size; ///< CG size used for probing + static constexpr auto window_size = impl_type::window_size; ///< Window size used for probing + static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope + + using key_type = typename impl_type::key_type; ///< Key type + using value_type = typename impl_type::value_type; ///< Key type + using 
extent_type = typename impl_type::extent_type; ///< Extent type + using size_type = typename impl_type::size_type; ///< Size type + using key_equal = typename impl_type::key_equal; ///< Key equality comparator type + using allocator_type = typename impl_type::allocator_type; ///< Allocator type + /// Non-owning window storage ref type + using storage_ref_type = typename impl_type::storage_ref_type; + using probing_scheme_type = typename impl_type::probing_scheme_type; ///< Probing scheme type + + template + using ref_type = + cuco::experimental::static_set_ref; ///< Non-owning container ref type + + static_set(static_set const&) = delete; + static_set& operator=(static_set const&) = delete; + + static_set(static_set&&) = default; ///< Move constructor + + /** + * @brief Replaces the contents of the container with another container. + * + * @return Reference of the current map object + */ + static_set& operator=(static_set&&) = default; + ~static_set() = default; + + /** + * @brief Constructs a statically-sized set with the specified initial capacity, sentinel values + * and CUDA stream. + * + * The actual set capacity depends on the given `capacity`, the probing scheme, CG size, and the + * window size and it is computed via the `make_window_extent` factory. Insert operations will not + * automatically grow the set. Attempting to insert more unique keys than the capacity of the map + * results in undefined behavior. + * + * @note Any `*_sentinel`s are reserved and behavior is undefined when attempting to insert + * this sentinel value. + * @note If a non-default CUDA stream is provided, the caller is responsible for synchronizing the + * stream before the object is first used. 
+ * + * @param capacity The requested lower-bound set size + * @param empty_key_sentinel The reserved key value for empty slots + * @param pred Key equality binary predicate + * @param probing_scheme Probing scheme + * @param alloc Allocator used for allocating device storage + * @param stream CUDA stream used to initialize the set + */ + constexpr static_set(Extent capacity, + empty_key empty_key_sentinel, + KeyEqual const& pred = {}, + ProbingScheme const& probing_scheme = {}, + Allocator const& alloc = {}, + cuda_stream_ref stream = {}); + + /** + * @brief Erases all elements from the container. After this call, `size()` returns zero. + * Invalidates any references, pointers, or iterators referring to contained elements. + * + * @param stream CUDA stream this operation is executed in + */ + void clear(cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Asynchronously erases all elements from the container. After this call, `size()` returns + * zero. Invalidates any references, pointers, or iterators referring to contained elements. + * + * @param stream CUDA stream this operation is executed in + */ + void clear_async(cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Inserts all keys in the range `[first, last)` and returns the number of successful + * insertions. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `insert_async`. + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_set::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + * + * @return Number of successfully inserted keys + */ + template + size_type insert(InputIt first, InputIt last, cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously inserts all keys in the range `[first, last)`. 
+ * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_set::value_type> is `true` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stream CUDA stream used for insert + */ + template + void insert_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Inserts keys in the range `[first, last)` if `pred` of the corresponding stencil returns + * true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. + * @note This function synchronizes the given stream and returns the number of successful + * insertions. For asynchronous execution use `insert_if_async`. + * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + * + * @return Number of successfully inserted keys + */ + template + size_type insert_if( + InputIt first, InputIt last, StencilIt stencil, Predicate pred, cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously inserts keys in the range `[first, last)` if `pred` of the corresponding + * stencil returns true. + * + * @note The key `*(first + i)` is inserted if `pred( *(stencil + i) )` returns true. 
+ * + * @tparam InputIt Device accessible random access iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * + * @param first Beginning of the sequence of key/value pairs + * @param last End of the sequence of key/value pairs + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param stream CUDA stream used for the operation + */ + template + void insert_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + cuda_stream_ref stream = {}) noexcept; + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the set. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_async`. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains(InputIt first, + InputIt last, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in + * the set. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief Indicates whether the keys in the range `[first, last)` are contained in the set if + * `pred` of the corresponding stencil returns true. + * + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present in the set. If `pred( *(stencil + i) )` is + * false, stores false to `(output_begin + i)`. + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_if_async`. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_if(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Asynchronously indicates whether the keys in the range `[first, last)` are contained in + * the set if `pred` of the corresponding stencil returns true. + * + * @note If `pred( *(stencil + i) )` is true, stores `true` or `false` to `(output_begin + i)` + * indicating if the key `*(first + i)` is present in the set. If `pred( *(stencil + i) )` is + * false, stores false to `(output_begin + i)`. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam StencilIt Device accessible random access iterator whose value_type is + * convertible to Predicate's argument type + * @tparam Predicate Unary predicate callable whose return type must be convertible to `bool` and + * argument type is convertible from std::iterator_traits::value_type + * @tparam OutputIt Device accessible output iterator assignable from `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param stencil Beginning of the stencil sequence + * @param pred Predicate to test on every element in the range `[stencil, stencil + + * std::distance(first, last))` + * @param output_begin Beginning of the sequence of booleans for the presence of each key + * @param stream Stream used for executing the kernels + */ + template + void contains_if_async(InputIt first, + InputIt last, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief For all keys in the range `[first, last)`, finds an element with key equivalent to the + * query key. + * + * @note This function synchronizes the given stream. For asynchronous execution use `find_async`. + * @note If the key `*(first + i)` has a matched `element` in the set, copies `element` to + * `(output_begin + i)`. Else, copies the empty key sentinel. 
+ * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the set's `key_type` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of elements retrieved for each key + * @param stream Stream used for executing the kernels + */ + template + void find(InputIt first, InputIt last, OutputIt output_begin, cuda_stream_ref stream = {}) const; + + /** + * @brief For all keys in the range `[first, last)`, asynchronously finds an element with key + * equivalent to the query key. + * + * @note If the key `*(first + i)` has a matched `element` in the set, copies `element` to + * `(output_begin + i)`. Else, copies the empty key sentinel. + * + * @tparam InputIt Device accessible input iterator + * @tparam OutputIt Device accessible output iterator assignable from the set's `key_type` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of elements retrieved for each key + * @param stream Stream used for executing the kernels + */ + template + void find_async(InputIt first, + InputIt last, + OutputIt output_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Retrieves all keys contained in the set. + * + * @note This API synchronizes the given stream. + * @note The order in which keys are returned is implementation defined and not guaranteed to be + * consistent between subsequent calls to `retrieve_all`. + * @note Behavior is undefined if the range beginning at `output_begin` is smaller than the return + * value of `size()`. + * + * @tparam OutputIt Device accessible random access output iterator whose `value_type` is + * convertible from the container's `key_type`. 
+ * + * @param output_begin Beginning output iterator for keys + * @param stream CUDA stream used for this operation + * + * @return Iterator indicating the end of the output + */ + template + OutputIt retrieve_all(OutputIt output_begin, cuda_stream_ref stream = {}) const; + + /** + * @brief Gets the number of elements in the container. + * + * @note This function synchronizes the given stream. + * + * @param stream CUDA stream used to get the number of inserted elements + * @return The number of elements in the container + */ + [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief Gets the maximum number of elements the hash set can hold. + * + * @return The maximum number of elements the hash set can hold + */ + [[nodiscard]] constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Get device ref with operators. + * + * @tparam Operators Set of `cuco::op` to be provided by the ref + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return Device ref of the current `static_set` object + */ + template + [[nodiscard]] auto ref(Operators... ops) const noexcept; + + private: + std::unique_ptr impl_; +}; +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh new file mode 100644 index 000000000..b2c8158e7 --- /dev/null +++ b/include/cuco/static_set_ref.cuh @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuco { +namespace experimental { + +/** + * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary + * operations defined in `include/cuco/operator.hpp` + * + * @note Concurrent modify and lookup will be supported if both kinds of operators are specified + * during the ref construction. + * @note cuCollections data structures always place the slot keys on the left-hand + * side when invoking the key comparison predicate. + * @note Ref types are trivially-copyable and are intended to be passed by value. + * @note `ProbingScheme::cg_size` indicates how many threads are used to handle one independent + * device operation. `cg_size == 1` uses the scalar (or non-CG) code paths. + * + * @throw If the size of the given key type is larger than 8 bytes + * @throw If the given key type doesn't have unique object representations, i.e., + * `cuco::bitwise_comparable_v == false` + * @throw If the probing scheme type is not inherited from `cuco::detail::probing_scheme_base` + * + * @tparam Key Type used for keys. Requires `cuco::is_bitwise_comparable_v` returning true + * @tparam Scope The scope in which operations will be performed by individual threads. 
+ * @tparam KeyEqual Binary callable type used to compare two keys for equality + * @tparam ProbingScheme Probing scheme (see `include/cuco/probing_scheme.cuh` for options) + * @tparam StorageRef Storage ref type + * @tparam Operators Device operator options defined in `include/cuco/operator.hpp` + */ +template +class static_set_ref + : public detail::operator_impl< + Operators, + static_set_ref>... { + using impl_type = detail::open_addressing_ref_impl; + + public: + using key_type = Key; ///< Key Type + using probing_scheme_type = ProbingScheme; ///< Type of probing scheme + using storage_ref_type = StorageRef; ///< Type of storage ref + using window_type = typename storage_ref_type::window_type; ///< Window type + using value_type = typename storage_ref_type::value_type; ///< Storage element type + using extent_type = typename storage_ref_type::extent_type; ///< Extent type + using size_type = typename storage_ref_type::size_type; ///< Probing scheme size type + using key_equal = KeyEqual; ///< Type of key equality binary callable + using iterator = typename storage_ref_type::iterator; ///< Slot iterator type + using const_iterator = typename storage_ref_type::const_iterator; ///< Const slot iterator type + + static constexpr auto cg_size = probing_scheme_type::cg_size; ///< Cooperative group size + static constexpr auto window_size = + storage_ref_type::window_size; ///< Number of elements handled per window + + /** + * @brief Constructs static_set_ref. + * + * @param empty_key_sentinel Sentinel indicating empty key + * @param predicate Key equality binary callable + * @param probing_scheme Probing scheme + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr static_set_ref( + cuco::empty_key empty_key_sentinel, + key_equal const& predicate, + probing_scheme_type const& probing_scheme, + storage_ref_type storage_ref) noexcept; + + /** + * @brief Operator-agnostic move constructor. 
+ * + * @tparam OtherOperators Operator set of the `other` object + * + * @param other Object to construct `*this` from + */ + template + __host__ __device__ explicit constexpr static_set_ref( + static_set_ref&& + other) noexcept; + + /** + * @brief Gets the maximum number of elements the container can hold. + * + * @return The maximum number of elements the container can hold + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept; + + /** + * @brief Gets the sentinel value used to represent an empty key slot. + * + * @return The sentinel value used to represent an empty key slot + */ + [[nodiscard]] __host__ __device__ constexpr key_type empty_key_sentinel() const noexcept; + + /** + * @brief Creates a reference with new operators from the current object. + * + * Note that this function uses move semantics and thus invalidates the current object. + * + * @warning Using two or more reference objects to the same container but with + * a different operator set at the same time results in undefined behavior. + * + * @tparam NewOperators List of `cuco::op::*_tag` types + * + * @param ops List of operators, e.g., `cuco::insert` + * + * @return `*this` with `NewOperators...` + */ + template + [[nodiscard]] __host__ __device__ auto with(NewOperators... ops) && noexcept; + + private: + impl_type impl_; + detail::equal_wrapper predicate_; ///< Key equality binary callable + + // Mixins need to be friends with this class in order to access private members + template + friend class detail::operator_impl; + + // Refs with other operator sets need to be friends too + template + friend class static_set_ref; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/storage.cuh b/include/cuco/storage.cuh new file mode 100644 index 000000000..e2e0c6f46 --- /dev/null +++ b/include/cuco/storage.cuh @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { + +/** + * @brief Public storage class. + * + * @note This is a public interface used to control storage window size. A window consists of a + * number of contiguous slots. The window size defines the workload granularity for each CUDA + * thread, i.e., how many slots a thread would concurrently operate on when performing modify or + * lookup operations. cuCollections uses the AoW storage to supersede the raw flat slot storage due + * to its superior granularity control: When window size equals one, AoW performs the same as the + * flat storage. If the underlying operation is more memory bandwidth bound, e.g., high occupancy + * multimap operations, a larger window size can reduce the length of probing sequences thus improve + * runtime performance. 
+ * + * @tparam WindowSize Number of elements per window storage + */ +template +class storage { + public: + /// Number of slots per window storage + static constexpr int32_t window_size = WindowSize; + + /// Type of implementation details + template + using impl = aow_storage; +}; + +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/allocator.hpp b/include/cuco/utility/allocator.hpp similarity index 97% rename from include/cuco/allocator.hpp rename to include/cuco/utility/allocator.hpp index c19552963..583571620 100644 --- a/include/cuco/allocator.hpp +++ b/include/cuco/utility/allocator.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/include/cuco/utility/error.hpp b/include/cuco/utility/error.hpp new file mode 100644 index 000000000..eb6a5f2e3 --- /dev/null +++ b/include/cuco/utility/error.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cuco { +/** + * @brief Exception thrown when logical precondition is violated. + * + * This exception should not be thrown directly and is instead thrown by the + * CUCO_EXPECTS macro. 
+ */ +struct logic_error : public std::logic_error { + /** + * @brief Constructs a logic_error with the error message. + * + * @param message Message to be associated with the exception + */ + logic_error(char const* const message) : std::logic_error(message) {} + + /** + * @brief Construct a new logic error object with error message + * + * @param message Message to be associated with the exception + */ + logic_error(std::string const& message) : std::logic_error(message) {} +}; +/** + * @brief Exception thrown when a CUDA error is encountered. + * + */ +struct cuda_error : public std::runtime_error { + /** + * @brief Constructs a `cuda_error` object with the given `message`. + * + * @param message The error char array used to construct `cuda_error` + */ + cuda_error(const char* message) : std::runtime_error(message) {} + /** + * @brief Constructs a `cuda_error` object with the given `message` string. + * + * @param message The `std::string` used to construct `cuda_error` + */ + cuda_error(std::string const& message) : cuda_error{message.c_str()} {} +}; +} // namespace cuco diff --git a/include/cuco/utility/fast_int.cuh b/include/cuco/utility/fast_int.cuh new file mode 100644 index 000000000..6616e2c5c --- /dev/null +++ b/include/cuco/utility/fast_int.cuh @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace cuco::utility { + +/** + * @brief Integer type with optimized division and modulo operators. + * + * @tparam T Underlying integer type + */ +template +struct fast_int { + static_assert(cuda::std::is_same_v or cuda::std::is_same_v +#if defined(CUCO_HAS_INT128) + or cuda::std::is_same_v or cuda::std::is_same_v +#endif + , + "Unsupported integer type"); + + using value_type = T; ///< Underlying integer type + + /** + * @brief Constructs a fast_int from an integer value. + * + * @param value Integer value + */ + __host__ __device__ explicit constexpr fast_int(T value) noexcept : value_{value} + { + evaluate_magic_numbers(); + } + + /** + * @brief Get the underlying integer value. + * + * @return Underlying value + */ + __host__ __device__ constexpr value_type value() const noexcept { return value_; } + + /** + * @brief Explicit conversion operator to the underlying value type. + * + * @return Underlying value + */ + __host__ __device__ explicit constexpr operator value_type() const noexcept { return value_; } + + private: + using intermediate_type = + cuda::std::conditional_t; ///< Intermediate type for multiplication + using unsigned_value_type = cuda::std::make_unsigned_t; ///< Unsigned value type + using signed_value_type = cuda::std::make_signed_t; ///< Signed value type + + static constexpr value_type value_bits = + CHAR_BIT * sizeof(value_type); ///< Number of bits required to represent the value + + /** + * @brief Computes the high bits of the multiplication of two unsigned integers. 
+ * + * @param lhs Left-hand side of the multiplication + * @param rhs Right-hand side of the multiplication + * + * @return High bits of the multiplication + */ + __host__ __device__ constexpr value_type mulhi(unsigned_value_type lhs, + unsigned_value_type rhs) const noexcept + { +#if defined(__CUDA_ARCH__) + if constexpr (sizeof(value_type) == 4) { + return __umulhi(lhs, rhs); + } else { + return __umul64hi(lhs, rhs); + } +#else + return (intermediate_type(lhs) * intermediate_type(rhs)) >> value_bits; +#endif + } + + /** + * @brief Computes the log2 of an unsigned integer. + * + * @param v Unsigned integer + * + * @return Log2 of the unsigned integer + */ + __host__ __device__ constexpr value_type log2(value_type v) const noexcept + { + return cuda::std::bit_width(unsigned_value_type(v)) - 1; + } + + /** + * @brief Computes the magic numbers for the fast division. + */ + __host__ __device__ constexpr void evaluate_magic_numbers() noexcept + { + // TODO assert(value_ > 0); + auto const val_log2 = this->log2(value_); + + // if value_ is a power of 2, we can use a simple shift + if (cuda::std::has_single_bit(unsigned_value_type(value_))) { + magic_ = 0; + shift_ = val_log2; + } else { + auto upper = intermediate_type(1) << value_bits; + auto lower = intermediate_type(1); + auto const lval = intermediate_type(value_); + + // compute the magic number and shift; see "Hacker's Delight" by Henry S. 
Warren, Jr., 10-2 + for (shift_ = 0; shift_ < val_log2; ++shift_, upper <<= 1, lower <<= 1) { + if ((upper % lval) <= lower) { break; } + } + magic_ = upper / lval; + } + } + + value_type value_; ///< Underlying integer value + value_type magic_; ///< Magic number for fast division + value_type shift_; ///< Shift for fast division + + template + friend __host__ __device__ constexpr value_type operator/(Lhs lhs, fast_int const& rhs) noexcept + { + static_assert(cuda::std::is_same_v, + "Left-hand side operand must be of type value_type."); + if (rhs.value_ == 1) { return lhs; } // edge case for value_ == 1 + if (rhs.magic_ == 0) { return lhs >> rhs.shift_; } // edge case for value_ == pow2 + auto const mul = (lhs == cuda::std::numeric_limits::max()) ? lhs : lhs + 1; + return rhs.mulhi(rhs.magic_, mul) >> rhs.shift_; + } + + template + friend __host__ __device__ constexpr value_type operator%(Lhs lhs, fast_int const& rhs) noexcept + { + return lhs - (lhs / rhs) * rhs.value_; + } +}; +} // namespace cuco::utility \ No newline at end of file diff --git a/include/cuco/utility/key_generator.hpp b/include/cuco/utility/key_generator.hpp new file mode 100644 index 000000000..deea62a62 --- /dev/null +++ b/include/cuco/utility/key_generator.hpp @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cuco::utility { + +namespace distribution { + +/** + * @brief Tag struct representing a random distribution of unique keys. + */ +struct unique { +}; + +/** + * @brief Tag struct representing a uniform distribution. + */ +struct uniform : public cuco::detail::strong_type { + /** + * @param multiplicity Average key multiplicity of the distribution. + */ + uniform(int64_t multiplicity) : cuco::detail::strong_type{multiplicity} + { + CUCO_EXPECTS(multiplicity > 0, "Multiplicity must be greater than 0"); + } +}; + +/** + * @brief Tag struct representing a gaussian distribution. + */ +struct gaussian : public cuco::detail::strong_type { + /** + * @param skew 0 represents a uniform distribution; ∞ represents a Dirac delta distribution. + */ + gaussian(double skew) : cuco::detail::strong_type{skew} + { + CUCO_EXPECTS(skew > 0, "Skew must be greater than 0"); + } +}; + +} // namespace distribution + +/** + * @brief Random key generator. + * + * @tparam RNG Pseudo-random number generator + */ +template +class key_generator { + public: + /** + * @brief Construct a new key generator object. + * + * @param seed Seed for the random number generator + */ + key_generator(uint32_t seed = static_cast(time(nullptr))) : rng_(seed) {} + + /** + * @brief Generates a sequence of random keys in the interval [0, N). 
+ * + * @tparam Dist Key distribution type + * @tparam OutputIt Ouput iterator typy which value type is the desired key type + * @tparam ExecPolicy Thrust execution policy + * @tparam Enable SFINAE helper + * + * @param dist Random distribution to use + * @param out_begin Start of the output sequence + * @param out_end End of the output sequence + * @param exec_policy Thrust execution policy this operation will be executed with + */ + template ::value>> + void generate(Dist dist, OutputIt out_begin, OutputIt out_end, ExecPolicy exec_policy) + { + using value_type = typename std::iterator_traits::value_type; + + if constexpr (std::is_same_v) { + thrust::sequence(exec_policy, out_begin, out_end, 0); + thrust::shuffle(exec_policy, out_begin, out_end, this->rng_); + } else if constexpr (std::is_same_v) { + size_t num_keys = thrust::distance(out_begin, out_end); + + thrust::counting_iterator seeds(this->rng_()); + + thrust::transform(exec_policy, + seeds, + seeds + num_keys, + out_begin, + [*this, dist, num_keys] __host__ __device__(size_t const seed) { + RNG rng; + thrust::uniform_int_distribution uniform_dist( + 1, num_keys / dist.value); + rng.seed(seed); + return uniform_dist(rng); + }); + } else if constexpr (std::is_same_v) { + size_t num_keys = thrust::distance(out_begin, out_end); + + thrust::counting_iterator seq(this->rng_()); + + thrust::transform(exec_policy, + seq, + seq + num_keys, + out_begin, + [*this, dist, num_keys] __host__ __device__(size_t const seed) { + RNG rng; + thrust::normal_distribution<> normal_dist( + static_cast(num_keys / 2), num_keys * dist.value); + rng.seed(seed); + auto val = normal_dist(rng); + while (val < 0 or val >= num_keys) { + // Re-sample if the value is outside the range [0, N) + // This is necessary because the normal distribution is not bounded + // might be a better way to do this, e.g., discard(n) + val = normal_dist(rng); + } + return val; + }); + } else { + CUCO_FAIL("Unexpected distribution type"); + } + } + + /** + * 
@brief Overload of 'generate' which automatically selects a suitable execution policy
+ *
+ * @tparam Dist Key distribution type
+ * @tparam OutputIt Output iterator type whose value type is the desired key type
+ *
+ * @param dist Random distribution to use
+ * @param out_begin Start of the output sequence
+ * @param out_end End of the output sequence
+ */
+ template
+ void generate(Dist dist, OutputIt out_begin, OutputIt out_end)
+ {
+ using thrust::system::detail::generic::select_system;
+
+ typedef typename thrust::iterator_system::type System;
+ System system;
+
+ generate(dist, out_begin, out_end, select_system(system));
+ }
+
+ /**
+ * @brief Overload of 'generate' which uses 'thrust::cuda::par_nosync' execution policy on CUDA
+ * stream 'stream'
+ *
+ * @tparam Dist Key distribution type
+ * @tparam OutputIt Output iterator type whose value type is the desired key type
+ *
+ * @param dist Random distribution to use
+ * @param out_begin Start of the output sequence
+ * @param out_end End of the output sequence
+ * @param stream CUDA stream in which this operation is executed
+ */
+ template
+ void generate(Dist dist, OutputIt out_begin, OutputIt out_end, cudaStream_t stream)
+ {
+ generate(dist, out_begin, out_end, thrust::cuda::par_nosync.on(stream));
+ }
+
+ /**
+ * @brief Randomly replaces previously generated keys with new keys outside the input
+ * distribution.
+ * + * @tparam InOutIt Input/Ouput iterator typy which value type is the desired key type + * @tparam ExecPolicy Thrust execution policy + * @tparam Enable SFINAE helper + * + * @param begin Start of the key sequence + * @param end End of the key sequence + * @param keep_prob Probability that a key is kept + * @param exec_policy Thrust execution policy this operation will be executed with + */ + template ::value>> + void dropout(InOutIt begin, InOutIt end, double keep_prob, ExecPolicy exec_policy) + { + using value_type = typename std::iterator_traits::value_type; + + CUCO_EXPECTS(keep_prob >= 0.0 and keep_prob <= 1.0, "Probability needs to be between 0 and 1"); + + if (keep_prob < 1.0) { + size_t num_keys = thrust::distance(begin, end); + + thrust::counting_iterator seeds(rng_()); + + thrust::transform_if( + exec_policy, + seeds, + seeds + num_keys, + begin, + [num_keys] __host__ __device__(size_t const seed) { + RNG rng; + thrust::uniform_int_distribution non_match_dist{ + static_cast(num_keys), std::numeric_limits::max()}; + rng.seed(seed); + return non_match_dist(rng); + }, + [keep_prob] __host__ __device__(size_t const seed) { + RNG rng; + thrust::uniform_real_distribution rate_dist(0.0, 1.0); + rng.seed(seed); + return (rate_dist(rng) > keep_prob); + }); + } + + thrust::shuffle(exec_policy, begin, end, rng_); + } + + /** + * @brief Overload of 'dropout' which automatically selects a suitable execution policy + * + * @tparam InOutIt Input/Ouput iterator typy which value type is the desired key type + * + * @param begin Start of the key sequence + * @param end End of the key sequence + * @param keep_prob Probability that a key is kept + */ + template + void dropout(InOutIt begin, InOutIt end, double keep_prob) + { + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + System system; + + dropout(begin, end, keep_prob, select_system(system)); + } + + /** + * @brief Overload of 'dropout' which uses 
'thrust::cuda::par_nosync' execution policy on CUDA + * stream 'stream' + * + * @tparam InOutIt Input/Ouput iterator typy which value type is the desired key type + * + * @param begin Start of the key sequence + * @param end End of the key sequence + * @param keep_prob Probability that a key is kept + * @param stream CUDA stream in which this operation is executed in + */ + template + void dropout(InOutIt begin, InOutIt end, double keep_prob, cudaStream_t stream) + { + using thrust::system::detail::generic::select_system; + + typedef typename thrust::iterator_system::type System; + System system; + + dropout(begin, end, keep_prob, thrust::cuda::par_nosync.on(stream)); + } + + private: + RNG rng_; ///< Random number generator +}; + +} // namespace cuco::utility diff --git a/include/cuco/traits.hpp b/include/cuco/utility/traits.hpp similarity index 87% rename from include/cuco/traits.hpp rename to include/cuco/utility/traits.hpp index 445a40daf..1a6252dcb 100644 --- a/include/cuco/traits.hpp +++ b/include/cuco/utility/traits.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,9 @@ #pragma once +#include +#include + #include namespace cuco { @@ -58,4 +61,10 @@ inline constexpr bool is_bitwise_comparable_v = is_bitwise_comparable::value; }; \ } +template +inline constexpr bool dependent_bool_value = value; + +template +inline constexpr bool dependent_false = dependent_bool_value; + } // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2d1d25526..3deeeddf1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. #============================================================================= -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(CTest) @@ -23,35 +23,49 @@ include(CTest) CPMAddPackage( NAME Catch2 GITHUB_REPOSITORY catchorg/Catch2 - VERSION 2.13.9 + VERSION 3.3.0 ) +# Header for catch_discover_tests if(Catch2_ADDED) - include(${Catch2_SOURCE_DIR}/contrib/Catch.cmake) + include(${Catch2_SOURCE_DIR}/extras/Catch.cmake) endif() -# catch_main.cpp defines `CATCH_CONFIG_MAIN` which provides main() -# Compiles it to be linked into test executables -add_library(CatchMain OBJECT ${CMAKE_CURRENT_SOURCE_DIR}/catch_main.cpp) -target_link_libraries(CatchMain Catch2::Catch2) - ################################################################################################### function(ConfigureTest TEST_NAME) - add_executable(${TEST_NAME} ${ARGN} - $) # Link in the CatchMain object file - target_link_libraries(${TEST_NAME} Catch2::Catch2 cuco CUDA::cudart) + add_executable(${TEST_NAME} ${ARGN}) + target_link_libraries(${TEST_NAME} PRIVATE Catch2::Catch2WithMain cuco CUDA::cudart) target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) set_target_properties(${TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests") target_compile_options(${TEST_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) - catch_discover_tests(${TEST_NAME}) + catch_discover_tests(${TEST_NAME} EXTRA_ARGS --allow-running-no-tests) endfunction(ConfigureTest) ################################################################################################### ### 
test sources ################################################################################## ################################################################################################### +################################################################################################### +# - utility tests --------------------------------------------------------------------------------- +ConfigureTest(UTILITY_TEST + utility/extent_test.cu + utility/storage_test.cu + utility/fast_int_test.cu + utility/hash_test.cu) + +################################################################################################### +# - static_set tests ------------------------------------------------------------------------------ +ConfigureTest(STATIC_SET_TEST + static_set/capacity_test.cu + static_set/heterogeneous_lookup_test.cu + static_set/insert_and_find_test.cu + static_set/large_input_test.cu + static_set/retrieve_all_test.cu + static_set/size_test.cu + static_set/unique_sequence_test.cu) + ################################################################################################### # - static_map tests ------------------------------------------------------------------------------ ConfigureTest(STATIC_MAP_TEST @@ -60,6 +74,7 @@ ConfigureTest(STATIC_MAP_TEST static_map/erase_test.cu static_map/heterogeneous_lookup_test.cu static_map/insert_and_find_test.cu + static_map/insert_or_assign_test.cu static_map/key_sentinel_test.cu static_map/shared_memory_test.cu static_map/stream_test.cu @@ -68,7 +83,8 @@ ConfigureTest(STATIC_MAP_TEST ################################################################################################### # - dynamic_map tests ----------------------------------------------------------------------------- ConfigureTest(DYNAMIC_MAP_TEST - dynamic_map/unique_sequence_test.cu) + dynamic_map/unique_sequence_test.cu + dynamic_map/erase_test.cu) ################################################################################################### # 
- static_multimap tests ------------------------------------------------------------------------- @@ -80,3 +96,12 @@ ConfigureTest(STATIC_MULTIMAP_TEST static_multimap/multiplicity_test.cu static_multimap/non_match_test.cu static_multimap/pair_function_test.cu) + +################################################################################################### +# - dynamic_bitset tests -------------------------------------------------------------------------- +ConfigureTest(DYNAMIC_BITSET_TEST + dynamic_bitset/find_next_test.cu + dynamic_bitset/get_test.cu + dynamic_bitset/rank_test.cu + dynamic_bitset/select_test.cu + dynamic_bitset/size_test.cu) diff --git a/tests/catch_main.cpp b/tests/catch_main.cpp deleted file mode 100644 index a7cc18e23..000000000 --- a/tests/catch_main.cpp +++ /dev/null @@ -1,6 +0,0 @@ -// In a Catch project with multiple files, dedicate one file to compile the -// source code of Catch itself and reuse the resulting object file for linking. - -// Let Catch provide main(): -#define CATCH_CONFIG_MAIN -#include diff --git a/tests/dynamic_bitset/find_next_test.cu b/tests/dynamic_bitset/find_next_test.cu new file mode 100644 index 000000000..97ba366ea --- /dev/null +++ b/tests/dynamic_bitset/find_next_test.cu @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include + +#include + +template +__global__ void find_next_kernel(BitsetRef ref, size_type num_elements, OutputIt output) +{ + cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; + while (index < num_elements) { + output[index] = ref.find_next(index); + index += stride; + } +} + +extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu + +TEST_CASE("Find next set test", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + + using size_type = std::size_t; + constexpr size_type num_elements{400}; + + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(modulo_bitgen(i)); + } + + thrust::device_vector device_result(num_elements); + auto ref = bv.ref(); + find_next_kernel<<<1, 1024>>>(ref, num_elements, device_result.data()); + + thrust::host_vector host_result = device_result; + size_type num_matches = 0; + + size_type next_set_pos = -1lu; + do { + next_set_pos++; + } while (next_set_pos < num_elements and !modulo_bitgen(next_set_pos)); + + for (size_type key = 0; key < num_elements; key++) { + num_matches += host_result[key] == next_set_pos; + + if (key == next_set_pos) { + do { + next_set_pos++; + } while (next_set_pos < num_elements and !modulo_bitgen(next_set_pos)); + } + } + REQUIRE(num_matches == num_elements); +} diff --git a/tests/dynamic_bitset/get_test.cu b/tests/dynamic_bitset/get_test.cu new file mode 100644 index 000000000..10f81a116 --- /dev/null +++ b/tests/dynamic_bitset/get_test.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +template +__global__ void test_kernel(BitsetRef ref, size_type num_elements, OutputIt output) +{ + cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; + while (index < num_elements) { + output[index] = ref.test(index); + index += stride; + } +} + +bool modulo_bitgen(uint64_t i) { return i % 7 == 0; } + +TEST_CASE("Get test", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + + using size_type = std::size_t; + constexpr size_type num_elements{400}; + + size_type num_set_ref = 0; + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(modulo_bitgen(i)); + num_set_ref += modulo_bitgen(i); + } + + // Host-bulk test + thrust::device_vector keys(num_elements); + thrust::sequence(keys.begin(), keys.end(), 0); + + thrust::device_vector test_result(num_elements); + thrust::fill(test_result.begin(), test_result.end(), 0); + + bv.test(keys.begin(), keys.end(), test_result.begin()); + + size_type num_set = thrust::reduce(thrust::device, test_result.begin(), test_result.end(), 0); + REQUIRE(num_set == num_set_ref); + + // Device-ref test + auto ref = bv.ref(); + thrust::fill(test_result.begin(), test_result.end(), 0); + test_kernel<<<1, 1024>>>(ref, num_elements, test_result.data()); + + num_set = thrust::reduce(thrust::device, test_result.begin(), test_result.end(), 0); + REQUIRE(num_set == num_set_ref); +} diff --git a/tests/dynamic_bitset/rank_test.cu b/tests/dynamic_bitset/rank_test.cu new file 
mode 100644 index 000000000..3b4d17cca --- /dev/null +++ b/tests/dynamic_bitset/rank_test.cu @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include + +#include + +extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu + +TEST_CASE("Rank test", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + + using size_type = std::size_t; + constexpr size_type num_elements{4000}; + + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(modulo_bitgen(i)); + } + + thrust::device_vector keys(num_elements); + thrust::sequence(keys.begin(), keys.end(), 0); + + thrust::device_vector d_ranks(num_elements); + + bv.rank(keys.begin(), keys.end(), d_ranks.begin()); + + thrust::host_vector h_ranks = d_ranks; + + size_type cur_rank = 0; + size_type num_matches = 0; + for (size_type i = 0; i < num_elements; i++) { + num_matches += cur_rank == h_ranks[i]; + if (modulo_bitgen(i)) { cur_rank++; } + } + REQUIRE(num_matches == num_elements); +} diff --git a/tests/dynamic_bitset/select_test.cu b/tests/dynamic_bitset/select_test.cu new file mode 100644 index 000000000..3dc0d74da --- /dev/null +++ b/tests/dynamic_bitset/select_test.cu @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include + +#include + +template +__global__ void select_false_kernel(BitsetRef ref, size_type num_elements, OutputIt output) +{ + cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; + while (index < num_elements) { + output[index] = ref.select_false(index); + index += stride; + } +} + +extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu + +TEST_CASE("Select test", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + + using size_type = std::size_t; + constexpr size_type num_elements{4000}; + + size_type num_set = 0; + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(modulo_bitgen(i)); + num_set += modulo_bitgen(i); + } + + // Check select + { + thrust::device_vector keys(num_set); + thrust::sequence(keys.begin(), keys.end(), 0); + + thrust::device_vector d_selects(num_set); + + bv.select(keys.begin(), keys.end(), d_selects.begin()); + + thrust::host_vector h_selects = d_selects; + + size_type num_matches = 0; + size_type cur_set_pos = -1lu; + for (size_type i = 0; i < num_set; i++) { + do { + cur_set_pos++; + } while (cur_set_pos < num_elements and !modulo_bitgen(cur_set_pos)); + + num_matches += cur_set_pos == h_selects[i]; + } + REQUIRE(num_matches == num_set); + } + + // Check select_false + { + size_type num_not_set = num_elements - num_set; + + auto ref = bv.ref(); + thrust::device_vector device_result(num_not_set); + select_false_kernel<<<1, 1024>>>(ref, num_not_set, 
device_result.data()); + thrust::host_vector host_result = device_result; + + size_type num_matches = 0; + size_type cur_not_set_pos = -1lu; + for (size_type i = 0; i < num_not_set; i++) { + do { + cur_not_set_pos++; + } while (cur_not_set_pos < num_elements and modulo_bitgen(cur_not_set_pos)); + + num_matches += cur_not_set_pos == host_result[i]; + } + REQUIRE(num_matches == num_not_set); + } +} diff --git a/tests/dynamic_bitset/size_test.cu b/tests/dynamic_bitset/size_test.cu new file mode 100644 index 000000000..611159dc3 --- /dev/null +++ b/tests/dynamic_bitset/size_test.cu @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +TEST_CASE("Size computation", "") +{ + cuco::experimental::detail::dynamic_bitset bv; + using size_type = std::size_t; + constexpr size_type num_elements{400}; + + for (size_type i = 0; i < num_elements; i++) { + bv.push_back(i % 2 == 0); // Alternate 0s and 1s pattern + } + + auto size = bv.size(); + REQUIRE(size == num_elements); +} diff --git a/tests/dynamic_map/erase_test.cu b/tests/dynamic_map/erase_test.cu new file mode 100644 index 000000000..1a60b49b6 --- /dev/null +++ b/tests/dynamic_map/erase_test.cu @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include + +#include + +TEMPLATE_TEST_CASE_SIG("erase key", + "", + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t), + (int32_t, int64_t), + (int64_t, int32_t), + (int64_t, int64_t)) +{ + constexpr std::size_t num_keys = 1'000'000; + cuco::dynamic_map map{num_keys * 2, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + + SECTION("Check single submap insert/erase") + { + thrust::device_vector d_keys(num_keys); + thrust::device_vector d_values(num_keys); + thrust::device_vector d_keys_exist(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1); + thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1); + + auto pairs_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); + + map.insert(pairs_begin, pairs_begin + num_keys); + + REQUIRE(map.get_size() == num_keys); + + map.erase(d_keys.begin(), d_keys.end()); + + // delete decreases count correctly + REQUIRE(map.get_size() == 0); + + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + // keys were actaully deleted + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); + + // ensures that map is reusing deleted slots + map.insert(pairs_begin, pairs_begin + num_keys); + + REQUIRE(map.get_size() == num_keys); + + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + REQUIRE(cuco::test::all_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); 
+ + // erase can act selectively + map.erase(d_keys.begin(), d_keys.begin() + num_keys / 2); + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + REQUIRE(cuco::test::none_of( + d_keys_exist.begin(), d_keys_exist.begin() + num_keys / 2, thrust::identity{})); + + REQUIRE(cuco::test::all_of( + d_keys_exist.begin() + num_keys / 2, d_keys_exist.end(), thrust::identity{})); + + // clear map + map.erase(d_keys.begin() + num_keys / 2, d_keys.end()); + } + + SECTION("Check multiple submaps insert/erase") + { + constexpr std::size_t num = 4 * num_keys; + + thrust::device_vector d_keys(num); + thrust::device_vector d_values(num); + thrust::device_vector d_keys_exist(num); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end(), 1); + thrust::sequence(thrust::device, d_values.begin(), d_values.end(), 1); + + auto pairs_begin = + thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_values.begin())); + + map.insert(pairs_begin, pairs_begin + num); + + // map should resize twice if the erased slots are successfully reused + REQUIRE(map.get_capacity() == 2 * num); + // check that keys can be successfully deleted from only the first and second submaps + map.erase(d_keys.begin(), d_keys.begin() + 2 * num_keys); + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + REQUIRE(cuco::test::none_of( + d_keys_exist.begin(), d_keys_exist.begin() + 2 * num_keys, thrust::identity{})); + + REQUIRE(cuco::test::all_of( + d_keys_exist.begin() + 2 * num_keys, d_keys_exist.end(), thrust::identity{})); + + REQUIRE(map.get_size() == 2 * num_keys); + // check that keys can be successfully deleted from all submaps (some will be unsuccessful + // erases) + map.erase(d_keys.begin(), d_keys.end()); + + map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); + + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); + + REQUIRE(map.get_size() == 0); + } +} diff --git 
a/tests/dynamic_map/unique_sequence_test.cu b/tests/dynamic_map/unique_sequence_test.cu index de26bb3dc..aa01ca51a 100644 --- a/tests/dynamic_map/unique_sequence_test.cu +++ b/tests/dynamic_map/unique_sequence_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ #include #include -#include +#include TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", @@ -38,8 +38,9 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", (int64_t, int64_t)) { constexpr std::size_t num_keys{50'000'000}; + cuco::dynamic_map map{ - 30'000'000, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + 30'000'000, cuco::empty_key{-1}, cuco::empty_value{-1}}; thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); @@ -47,9 +48,9 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); thrust::sequence(thrust::device, d_values.begin(), d_values.end()); - auto pairs_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto pairs_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); thrust::device_vector d_results(num_keys); thrust::device_vector d_contained(num_keys); diff --git a/tests/static_map/custom_type_test.cu b/tests/static_map/custom_type_test.cu index e587613d4..e23216ca3 100644 --- a/tests/static_map/custom_type_test.cu +++ b/tests/static_map/custom_type_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ #include #include -#include +#include #include @@ -113,9 +113,8 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", constexpr std::size_t num = 100; constexpr std::size_t capacity = num * 2; - cuco::static_map map{capacity, - cuco::sentinel::empty_key{sentinel_key}, - cuco::sentinel::empty_value{sentinel_value}}; + cuco::static_map map{ + capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; thrust::device_vector insert_keys(num); thrust::device_vector insert_values(num); @@ -132,9 +131,9 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", insert_values.begin(), [] __device__(auto i) { return Value{i}; }); - auto insert_pairs = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto insert_pairs = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); SECTION("All inserted keys-value pairs should be correctly recovered during find") { @@ -213,7 +212,7 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", map.insert(insert_pairs, insert_pairs + num, hash_custom_key{}, custom_key_equals{}); auto view = map.get_device_view(); REQUIRE(cuco::test::all_of( - insert_pairs, insert_pairs + num, [view] __device__(cuco::pair_type const& pair) { + insert_pairs, insert_pairs + num, [view] __device__(cuco::pair const& pair) { return view.contains(pair.first, hash_custom_key{}, custom_key_equals{}); })); } @@ -221,12 +220,11 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", SECTION("Inserting unique keys should return insert success.") { auto m_view = map.get_device_mutable_view(); - REQUIRE( - cuco::test::all_of(insert_pairs, - insert_pairs + num, - [m_view] __device__(cuco::pair_type const& pair) mutable { - return 
m_view.insert(pair, hash_custom_key{}, custom_key_equals{}); - })); + REQUIRE(cuco::test::all_of(insert_pairs, + insert_pairs + num, + [m_view] __device__(cuco::pair const& pair) mutable { + return m_view.insert(pair, hash_custom_key{}, custom_key_equals{}); + })); } SECTION("Cannot find any key in an empty hash map") @@ -237,7 +235,7 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", REQUIRE(cuco::test::all_of( insert_pairs, insert_pairs + num, - [view] __device__(cuco::pair_type const& pair) mutable { + [view] __device__(cuco::pair const& pair) mutable { return view.find(pair.first, hash_custom_key{}, custom_key_equals{}) == view.end(); })); } @@ -246,9 +244,7 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", { auto const view = map.get_device_view(); REQUIRE(cuco::test::all_of( - insert_pairs, - insert_pairs + num, - [view] __device__(cuco::pair_type const& pair) { + insert_pairs, insert_pairs + num, [view] __device__(cuco::pair const& pair) { return view.find(pair.first, hash_custom_key{}, custom_key_equals{}) == view.end(); })); } diff --git a/tests/static_map/duplicate_keys_test.cu b/tests/static_map/duplicate_keys_test.cu index 34a315a1c..5620fa4e9 100644 --- a/tests/static_map/duplicate_keys_test.cu +++ b/tests/static_map/duplicate_keys_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,7 +27,7 @@ #include #include -#include +#include TEMPLATE_TEST_CASE_SIG("Duplicate keys", "", @@ -39,7 +39,7 @@ TEMPLATE_TEST_CASE_SIG("Duplicate keys", { constexpr std::size_t num_keys{500'000}; cuco::static_map map{ - num_keys * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); @@ -49,7 +49,7 @@ TEMPLATE_TEST_CASE_SIG("Duplicate keys", auto pairs_begin = thrust::make_transform_iterator( thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i / 2, i / 2); }); + [] __device__(auto i) { return cuco::pair(i / 2, i / 2); }); thrust::device_vector d_results(num_keys); thrust::device_vector d_contained(num_keys); diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index b5641539c..26cbd3fd3 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,16 +14,18 @@ * limitations under the License. 
*/ -#include +#include + +#include + #include #include +#include #include #include #include -#include - -#include +#include TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t)) { @@ -33,10 +35,8 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t)) constexpr std::size_t num_keys = 1'000'000; constexpr std::size_t capacity = 1'100'000; - cuco::static_map map{capacity, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}, - cuco::sentinel::erased_key{-2}}; + cuco::static_map map{ + capacity, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; thrust::device_vector d_keys(num_keys); thrust::device_vector d_values(num_keys); @@ -60,9 +60,7 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t)) map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - REQUIRE(cuco::test::none_of(d_keys_exist.begin(), - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); map.insert(pairs_begin, pairs_begin + num_keys); @@ -70,20 +68,16 @@ TEMPLATE_TEST_CASE_SIG("erase key", "", ((typename T), T), (int32_t), (int64_t)) map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - REQUIRE(cuco::test::all_of(d_keys_exist.begin(), - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::all_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); map.erase(d_keys.begin(), d_keys.begin() + num_keys / 2); map.contains(d_keys.begin(), d_keys.end(), d_keys_exist.begin()); - REQUIRE(cuco::test::none_of(d_keys_exist.begin(), - d_keys_exist.begin() + num_keys / 2, - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::none_of( + d_keys_exist.begin(), d_keys_exist.begin() + num_keys / 2, thrust::identity{})); - REQUIRE(cuco::test::all_of(d_keys_exist.begin() 
+ num_keys / 2, - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::all_of( + d_keys_exist.begin() + num_keys / 2, d_keys_exist.end(), thrust::identity{})); map.erase(d_keys.begin() + num_keys / 2, d_keys.end()); REQUIRE(map.get_size() == 0); diff --git a/tests/static_map/heterogeneous_lookup_test.cu b/tests/static_map/heterogeneous_lookup_test.cu index 766fa9e1f..e842612b1 100644 --- a/tests/static_map/heterogeneous_lookup_test.cu +++ b/tests/static_map/heterogeneous_lookup_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ #include #include -#include +#include #include @@ -96,13 +96,12 @@ TEMPLATE_TEST_CASE("Heterogeneous lookup", constexpr std::size_t num = 100; constexpr std::size_t capacity = num * 2; - cuco::static_map map{capacity, - cuco::sentinel::empty_key{sentinel_key}, - cuco::sentinel::empty_value{sentinel_value}}; + cuco::static_map map{ + capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; - auto insert_pairs = thrust::make_transform_iterator( - thrust::counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto insert_pairs = + thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); auto probe_keys = thrust::make_transform_iterator(thrust::counting_iterator(0), [] __device__(auto i) { return ProbeKey(i); }); diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index ec3339c4f..5784f786f 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -1,6 +1,6 @@ /* * Copyright (c) 2022, Jonas Hahnfeld, CERN. - * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ #include #include -#include +#include static constexpr int Iters = 10'000; @@ -59,14 +59,14 @@ TEMPLATE_TEST_CASE_SIG("Parallel insert-or-update", (int64_t, int32_t), (int64_t, int64_t)) { - cuco::sentinel::empty_key empty_key_sentinel{-1}; - cuco::sentinel::empty_value empty_value_sentinel{-1}; + cuco::empty_key empty_key_sentinel{-1}; + cuco::empty_value empty_value_sentinel{-1}; cuco::static_map m(10 * Iters, empty_key_sentinel, empty_value_sentinel); static constexpr int Blocks = 1024; static constexpr int Threads = 128; parallel_sum<<>>(m.get_device_mutable_view()); - cudaDeviceSynchronize(); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); thrust::device_vector d_keys(Iters); thrust::device_vector d_values(Iters); diff --git a/tests/static_map/insert_or_assign_test.cu b/tests/static_map/insert_or_assign_test.cu new file mode 100644 index 000000000..90c6553ce --- /dev/null +++ b/tests/static_map/insert_or_assign_test.cu @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +using size_type = std::size_t; + +template +__inline__ void test_insert_or_assign(Map& map, size_type num_keys) +{ + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + + // Insert pairs + auto pairs_begin = + thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); + + auto const initial_size = map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(initial_size == num_keys); // all keys should be inserted + + // Query pairs have the same keys but different payloads + auto query_pairs_begin = thrust::make_transform_iterator( + thrust::counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i * 2); }); + + map.insert_or_assign(query_pairs_begin, query_pairs_begin + num_keys); + + auto const updated_size = map.size(); + // all keys are present in the map so the size shouldn't change + REQUIRE(updated_size == initial_size); + + thrust::device_vector d_keys(num_keys); + thrust::device_vector d_values(num_keys); + map.retrieve_all(d_keys.begin(), d_values.begin()); + + auto gold_values_begin = thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return i * 2; }); + + thrust::sort(thrust::device, d_values.begin(), d_values.end()); + REQUIRE(cuco::test::equal( + d_values.begin(), d_values.end(), gold_values_begin, thrust::equal_to{})); +} + +TEMPLATE_TEST_CASE_SIG( + "Insert or assign", + "", + ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize), + Key, + Value, + Probe, + CGSize), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int32_t, 
cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr size_type num_keys{400}; + + using probe = + std::conditional_t>, + cuco::experimental::double_hashing, + cuco::murmurhash3_32>>; + + auto map = cuco::experimental::static_map, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::experimental::storage<2>>{ + num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + test_insert_or_assign(map, num_keys); +} diff --git a/tests/static_map/key_sentinel_test.cu b/tests/static_map/key_sentinel_test.cu index e52c1405e..74a1badd1 100644 --- a/tests/static_map/key_sentinel_test.cu +++ b/tests/static_map/key_sentinel_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,7 +22,7 @@ #include #include -#include +#include #define SIZE 10 __device__ int A[SIZE]; @@ -40,7 +40,7 @@ TEMPLATE_TEST_CASE_SIG( constexpr std::size_t num_keys{SIZE}; cuco::static_map map{ - SIZE * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + SIZE * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; auto m_view = map.get_device_mutable_view(); auto view = map.get_device_view(); @@ -49,21 +49,21 @@ TEMPLATE_TEST_CASE_SIG( for (int i = 0; i < SIZE; i++) { h_A[i] = i; } - cudaMemcpyToSymbol(A, h_A, SIZE * sizeof(int)); + CUCO_CUDA_TRY(cudaMemcpyToSymbol(A, h_A, SIZE * sizeof(int))); - auto pairs_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto pairs_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); SECTION( "Tests of non-CG insert: The custom `key_equal` can never be used to compare against sentinel") { - REQUIRE(cuco::test::all_of( - pairs_begin, - pairs_begin + num_keys, - [m_view] __device__(cuco::pair_type const& pair) mutable { - return m_view.insert(pair, cuco::detail::MurmurHash3_32{}, custom_equals{}); - })); + REQUIRE(cuco::test::all_of(pairs_begin, + pairs_begin + num_keys, + [m_view] __device__(cuco::pair const& pair) mutable { + return m_view.insert( + pair, cuco::default_hash_function{}, custom_equals{}); + })); } SECTION( @@ -71,16 +71,14 @@ TEMPLATE_TEST_CASE_SIG( { map.insert(pairs_begin, pairs_begin + num_keys, - cuco::detail::MurmurHash3_32{}, + cuco::default_hash_function{}, custom_equals{}); // All keys inserted via custom `key_equal` should be found - REQUIRE(cuco::test::all_of(pairs_begin, - pairs_begin + num_keys, - [view] __device__(cuco::pair_type const& pair) { - auto const found = view.find(pair.first); - return (found != view.end()) and - (found->first.load() == pair.first and - found->second.load() == pair.second); - })); + 
REQUIRE(cuco::test::all_of( + pairs_begin, pairs_begin + num_keys, [view] __device__(cuco::pair const& pair) { + auto const found = view.find(pair.first); + return (found != view.end()) and + (found->first.load() == pair.first and found->second.load() == pair.second); + })); } } diff --git a/tests/static_map/shared_memory_test.cu b/tests/static_map/shared_memory_test.cu index 67ae88d88..444f1c7e7 100644 --- a/tests/static_map/shared_memory_test.cu +++ b/tests/static_map/shared_memory_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ #include #include -#include +#include #include @@ -95,7 +95,7 @@ TEMPLATE_TEST_CASE_SIG("Shared memory static map", std::vector> maps; for (std::size_t map_id = 0; map_id < number_of_maps; ++map_id) { maps.push_back(std::make_unique( - map_capacity, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1})); + map_capacity, cuco::empty_key{-1}, cuco::empty_value{-1})); } thrust::device_vector d_keys_exist(number_of_maps * elements_in_map); @@ -148,9 +148,7 @@ TEMPLATE_TEST_CASE_SIG("Shared memory static map", d_keys_exist.data().get(), d_keys_and_values_correct.data().get()); - REQUIRE(cuco::test::none_of(d_keys_exist.begin(), - d_keys_exist.end(), - [] __device__(const bool key_found) { return key_found; })); + REQUIRE(cuco::test::none_of(d_keys_exist.begin(), d_keys_exist.end(), thrust::identity{})); } } @@ -161,11 +159,8 @@ __global__ void shared_memory_hash_table_kernel(bool* key_found) using map_type = typename cuco::static_map::device_mutable_view; using find_map_type = typename cuco::static_map::device_view; __shared__ typename map_type::slot_type slots[N]; - auto map = map_type::make_from_uninitialized_slots(cg::this_thread_block(), - &slots[0], - N, - cuco::sentinel::empty_key{-1}, - 
cuco::sentinel::empty_value{-1}); + auto map = map_type::make_from_uninitialized_slots( + cg::this_thread_block(), &slots[0], N, cuco::empty_key{-1}, cuco::empty_value{-1}); auto g = cg::this_thread_block(); std::size_t index = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/tests/static_map/stream_test.cu b/tests/static_map/stream_test.cu index 5f816410e..6121cbd62 100644 --- a/tests/static_map/stream_test.cu +++ b/tests/static_map/stream_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ #include #include -#include +#include TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", "", @@ -38,12 +38,12 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", (int64_t, int64_t)) { cudaStream_t stream; - cudaStreamCreate(&stream); + CUCO_CUDA_TRY(cudaStreamCreate(&stream)); constexpr std::size_t num_keys{500'000}; cuco::static_map map{1'000'000, - cuco::sentinel::empty_key{-1}, - cuco::sentinel::empty_value{-1}, + cuco::empty_key{-1}, + cuco::empty_value{-1}, cuco::cuda_allocator{}, stream}; @@ -53,11 +53,11 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); thrust::sequence(thrust::device, d_values.begin(), d_values.end()); - auto pairs_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto pairs_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); - auto hash_fn = cuco::detail::MurmurHash3_32{}; + auto hash_fn = cuco::default_hash_function{}; auto equal_fn = thrust::equal_to{}; // bulk function test cases @@ -67,7 +67,6 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys 
on given stream", map.insert(pairs_begin, pairs_begin + num_keys, hash_fn, equal_fn, stream); map.find(d_keys.begin(), d_keys.end(), d_results.begin(), hash_fn, equal_fn, stream); - // cudaStreamSynchronize(stream); auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), d_values.begin())); REQUIRE(cuco::test::all_of( @@ -87,5 +86,5 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys on given stream", REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{}, stream)); } - cudaStreamDestroy(stream); + CUCO_CUDA_TRY(cudaStreamDestroy(stream)); } diff --git a/tests/static_map/unique_sequence_test.cu b/tests/static_map/unique_sequence_test.cu index 75bb67d61..6a0165cc2 100644 --- a/tests/static_map/unique_sequence_test.cu +++ b/tests/static_map/unique_sequence_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -26,9 +26,10 @@ #include #include #include +#include #include -#include +#include TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", "", @@ -40,7 +41,7 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", { constexpr std::size_t num_keys{500'000}; cuco::static_map map{ - 1'000'000, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; + 1'000'000, cuco::empty_key{-1}, cuco::empty_value{-1}}; auto m_view = map.get_device_mutable_view(); auto view = map.get_device_view(); @@ -51,9 +52,9 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); thrust::sequence(thrust::device, d_values.begin(), d_values.end()); - auto pairs_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto pairs_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); thrust::device_vector d_results(num_keys); thrust::device_vector d_contained(num_keys); @@ -87,68 +88,217 @@ TEMPLATE_TEST_CASE_SIG("Unique sequence of keys", SECTION("Inserting unique keys should return insert success.") { - REQUIRE( - cuco::test::all_of(pairs_begin, - pairs_begin + num_keys, - [m_view] __device__(cuco::pair_type const& pair) mutable { - return m_view.insert(pair); - })); + REQUIRE(cuco::test::all_of(pairs_begin, + pairs_begin + num_keys, + [m_view] __device__(cuco::pair const& pair) mutable { + return m_view.insert(pair); + })); } SECTION("Cannot find any key in an empty hash map with non-const view") { SECTION("non-const view") - { - REQUIRE( - cuco::test::all_of(pairs_begin, - pairs_begin + num_keys, - [view] __device__(cuco::pair_type const& pair) mutable { - return view.find(pair.first) == view.end(); - })); - } - SECTION("const view") { REQUIRE(cuco::test::all_of(pairs_begin, pairs_begin + num_keys, - [view] __device__(cuco::pair_type const& pair) { + [view] 
__device__(cuco::pair const& pair) mutable { return view.find(pair.first) == view.end(); })); } + SECTION("const view") + { + REQUIRE(cuco::test::all_of( + pairs_begin, pairs_begin + num_keys, [view] __device__(cuco::pair const& pair) { + return view.find(pair.first) == view.end(); + })); + } } SECTION("Keys are all found after inserting many keys.") { // Bulk insert keys - thrust::for_each(thrust::device, - pairs_begin, - pairs_begin + num_keys, - [m_view] __device__(cuco::pair_type const& pair) mutable { - m_view.insert(pair); - }); + thrust::for_each( + thrust::device, + pairs_begin, + pairs_begin + num_keys, + [m_view] __device__(cuco::pair const& pair) mutable { m_view.insert(pair); }); SECTION("non-const view") - { - // All keys should be found - REQUIRE(cuco::test::all_of( - pairs_begin, - pairs_begin + num_keys, - [view] __device__(cuco::pair_type const& pair) mutable { - auto const found = view.find(pair.first); - return (found != view.end()) and - (found->first.load() == pair.first and found->second.load() == pair.second); - })); - } - SECTION("const view") { // All keys should be found REQUIRE(cuco::test::all_of(pairs_begin, pairs_begin + num_keys, - [view] __device__(cuco::pair_type const& pair) { + [view] __device__(cuco::pair const& pair) mutable { auto const found = view.find(pair.first); return (found != view.end()) and (found->first.load() == pair.first and found->second.load() == pair.second); })); } + SECTION("const view") + { + // All keys should be found + REQUIRE(cuco::test::all_of( + pairs_begin, pairs_begin + num_keys, [view] __device__(cuco::pair const& pair) { + auto const found = view.find(pair.first); + return (found != view.end()) and + (found->first.load() == pair.first and found->second.load() == pair.second); + })); + } + } +} + +using size_type = int32_t; + +template +__inline__ void test_unique_sequence(Map& map, size_type num_keys) +{ + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; + + 
thrust::device_vector d_keys(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + + auto keys_begin = d_keys.begin(); + auto pairs_begin = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); + thrust::device_vector d_contained(num_keys); + + auto zip_equal = [] __device__(auto const& p) { return thrust::get<0>(p) == thrust::get<1>(p); }; + auto is_even = [] __device__(auto const& i) { return i % 2 == 0; }; + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(map.size() == 0); + + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } + + SECTION("Non-inserted keys have no matches") + { + thrust::device_vector d_results(num_keys); + + map.find(keys_begin, keys_begin + num_keys, d_results.begin()); + auto zip = thrust::make_zip_iterator(thrust::make_tuple( + d_results.begin(), thrust::constant_iterator{map.empty_key_sentinel()})); + + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } + + SECTION("All conditionally inserted keys should be contained") + { + auto const inserted = map.insert_if( + pairs_begin, pairs_begin + num_keys, thrust::counting_iterator(0), is_even); + REQUIRE(inserted == num_keys / 2); + REQUIRE(map.size() == num_keys / 2); + + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::equal(d_contained.begin(), + d_contained.end(), + thrust::counting_iterator(0), + [] __device__(auto const& idx_contained, auto const& idx) { + return ((idx % 2) == 0) == idx_contained; + })); + } + + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + + SECTION("All inserted keys should be contained.") + { + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), 
thrust::identity{})); + } + + SECTION("Conditional contains should return true on even inputs.") + { + map.contains_if(keys_begin, + keys_begin + num_keys, + thrust::counting_iterator(0), + is_even, + d_contained.begin()); + auto gold_iter = + thrust::make_transform_iterator(thrust::counting_iterator(0), is_even); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_contained.begin(), gold_iter)); + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } + + SECTION("All inserted keys should be correctly recovered during find") + { + thrust::device_vector d_results(num_keys); + + map.find(keys_begin, keys_begin + num_keys, d_results.begin()); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), keys_begin)); + + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); } + + SECTION("All inserted key-values should be properly retrieved") + { + thrust::device_vector d_values(num_keys); + + auto const [keys_end, values_end] = map.retrieve_all(keys_begin, d_values.begin()); + REQUIRE(std::distance(keys_begin, keys_end) == num_keys); + REQUIRE(std::distance(d_values.begin(), values_end) == num_keys); + + thrust::sort(thrust::device, d_values.begin(), values_end); + REQUIRE(cuco::test::equal(d_values.begin(), + values_end, + thrust::make_counting_iterator(0), + thrust::equal_to{})); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Unique sequence", + "", + ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize), + Key, + Value, + Probe, + CGSize), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 
2), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr size_type num_keys{400}; + constexpr size_type gold_capacity = CGSize == 1 ? 422 // 211 x 1 x 2 + : 412; // 103 x 2 x 2 + + using probe = + std::conditional_t>, + cuco::experimental::double_hashing, + cuco::murmurhash3_32>>; + + auto map = cuco::experimental::static_map, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::experimental::storage<2>>{ + num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + REQUIRE(map.capacity() == gold_capacity); + + test_unique_sequence(map, num_keys); } diff --git a/tests/static_multimap/custom_pair_retrieve_test.cu b/tests/static_multimap/custom_pair_retrieve_test.cu index 5d0329382..7856b9e20 100644 --- a/tests/static_multimap/custom_pair_retrieve_test.cu +++ b/tests/static_multimap/custom_pair_retrieve_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -28,15 +28,15 @@ #include #include -#include +#include #include // Custom pair equal template struct pair_equal { - __device__ bool operator()(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) const + __device__ bool operator()(const cuco::pair& lhs, + const cuco::pair& rhs) const { return lhs.first == rhs.first; } @@ -86,7 +86,7 @@ void test_non_shmem_pair_retrieve(Map& map, std::size_t const num_pairs) using Key = typename Map::key_type; using Value = typename Map::mapped_type; - thrust::device_vector> d_pairs(num_pairs); + thrust::device_vector> d_pairs(num_pairs); // pair multiplicity = 2 thrust::transform(thrust::device, @@ -94,7 +94,7 @@ void test_non_shmem_pair_retrieve(Map& map, std::size_t const num_pairs) thrust::counting_iterator(num_pairs), d_pairs.begin(), [] __device__(auto i) { - return cuco::pair_type{i / 2, i}; + return cuco::pair{i / 2, i}; }); auto pair_begin = d_pairs.begin(); @@ -107,7 +107,7 @@ void test_non_shmem_pair_retrieve(Map& map, std::size_t const num_pairs) thrust::counting_iterator(num_pairs), pair_begin, [] __device__(auto i) { - return cuco::pair_type{i, i}; + return cuco::pair{i, i}; }); // create an array of prefix sum @@ -196,19 +196,11 @@ TEMPLATE_TEST_CASE_SIG( { constexpr std::size_t num_pairs{200}; - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{ - num_pairs * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_non_shmem_pair_retrieve(map, num_pairs); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{ - num_pairs * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_non_shmem_pair_retrieve(map, num_pairs); - } + using probe = std::conditional_t>, + cuco::double_hashing<8, cuco::default_hash_function>>; + + cuco::static_multimap, probe> + map{num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; + 
test_non_shmem_pair_retrieve(map, num_pairs); } diff --git a/tests/static_multimap/custom_type_test.cu b/tests/static_multimap/custom_type_test.cu index 40bdbe8ba..f53719205 100644 --- a/tests/static_multimap/custom_type_test.cu +++ b/tests/static_multimap/custom_type_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ #include #include -#include +#include #include @@ -39,7 +39,10 @@ struct key_pair { }; struct hash_key_pair { - __device__ uint32_t operator()(key_pair k) const { return k.a; }; + __host__ __device__ hash_key_pair() : hash_key_pair{0} {} + __host__ __device__ hash_key_pair(uint32_t offset) : offset_(offset) {} + __device__ uint32_t operator()(key_pair k) const { return k.a + offset_; }; + uint32_t offset_; }; struct key_pair_equals { @@ -95,7 +98,7 @@ __inline__ void test_custom_key_value_type(Map& map, std::size_t num_pairs) auto count = map.count(key_begin, key_begin + num_pairs, stream, key_pair_equals{}); REQUIRE(count == num_pairs); - thrust::device_vector> found_pairs(num_pairs); + thrust::device_vector> found_pairs(num_pairs); auto output_end = map.retrieve( key_begin, key_begin + num_pairs, found_pairs.begin(), stream, key_pair_equals{}); std::size_t const size = std::distance(found_pairs.begin(), output_end); @@ -107,16 +110,17 @@ __inline__ void test_custom_key_value_type(Map& map, std::size_t num_pairs) thrust::device, found_pairs.begin(), found_pairs.end(), - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { return lhs.first.a < rhs.first.a; }); - - REQUIRE(cuco::test::equal( - pair_begin, - pair_begin + num_pairs, - found_pairs.begin(), - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first.a == rhs.first.a; - })); + [] __device__(const cuco::pair& lhs, const 
cuco::pair& rhs) { + return lhs.first.a < rhs.first.a; + }); + + REQUIRE( + cuco::test::equal(pair_begin, + pair_begin + num_pairs, + found_pairs.begin(), + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first.a == rhs.first.a; + })); } SECTION("Non-matches are not included in the output") @@ -138,7 +142,7 @@ __inline__ void test_custom_key_value_type(Map& map, std::size_t num_pairs) auto count = map.count(query_key_begin, query_key_begin + num, stream, key_pair_equals{}); REQUIRE(count == num_pairs); - thrust::device_vector> found_pairs(num_pairs); + thrust::device_vector> found_pairs(num_pairs); auto output_end = map.retrieve( query_key_begin, query_key_begin + num, found_pairs.begin(), stream, key_pair_equals{}); std::size_t const size = std::distance(found_pairs.begin(), output_end); @@ -150,15 +154,16 @@ __inline__ void test_custom_key_value_type(Map& map, std::size_t num_pairs) thrust::device, found_pairs.begin(), found_pairs.end(), - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { return lhs.first.a < rhs.first.a; }); - REQUIRE(cuco::test::equal( - pair_begin, - pair_begin + num_pairs, - found_pairs.begin(), - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first.a == rhs.first.a; - })); + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + return lhs.first.a < rhs.first.a; + }); + REQUIRE( + cuco::test::equal(pair_begin, + pair_begin + num_pairs, + found_pairs.begin(), + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first.a == rhs.first.a; + })); } SECTION("Outer functions include non-matches in the output") @@ -180,7 +185,7 @@ __inline__ void test_custom_key_value_type(Map& map, std::size_t num_pairs) map.count_outer(query_key_begin, query_key_begin + num, stream, key_pair_equals{}); REQUIRE(count_outer == num); - thrust::device_vector> found_pairs(num); + thrust::device_vector> found_pairs(num); auto output_end = map.retrieve_outer( query_key_begin, query_key_begin + 
num, found_pairs.begin(), stream, key_pair_equals{}); std::size_t const size_outer = std::distance(found_pairs.begin(), output_end); @@ -228,21 +233,11 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type", constexpr std::size_t num_pairs = 100; constexpr std::size_t capacity = num_pairs * 2; - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, hash_key_pair>> - map{capacity, - cuco::sentinel::empty_key{sentinel_key}, - cuco::sentinel::empty_value{sentinel_value}}; - test_custom_key_value_type(map, num_pairs); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{capacity, - cuco::sentinel::empty_key{sentinel_key}, - cuco::sentinel::empty_value{sentinel_value}}; - test_custom_key_value_type(map, num_pairs); - } + using probe = std::conditional_t, + cuco::double_hashing<8, hash_key_pair, hash_key_pair>>; + + cuco::static_multimap, probe> + map{capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; + test_custom_key_value_type(map, num_pairs); } diff --git a/tests/static_multimap/heterogeneous_lookup_test.cu b/tests/static_multimap/heterogeneous_lookup_test.cu index dca3de826..5a5b8b242 100644 --- a/tests/static_multimap/heterogeneous_lookup_test.cu +++ b/tests/static_multimap/heterogeneous_lookup_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,7 +25,7 @@ #include #include -#include +#include #include @@ -101,13 +101,11 @@ TEMPLATE_TEST_CASE("Heterogeneous lookup", cuda::thread_scope_device, cuco::cuda_allocator, cuco::linear_probing<1, custom_hasher>> - map{capacity, - cuco::sentinel::empty_key{sentinel_key}, - cuco::sentinel::empty_value{sentinel_value}}; + map{capacity, cuco::empty_key{sentinel_key}, cuco::empty_value{sentinel_value}}; - auto insert_pairs = thrust::make_transform_iterator( - thrust::counting_iterator(0), - [] __device__(auto i) { return cuco::pair_type(i, i); }); + auto insert_pairs = + thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return cuco::pair(i, i); }); auto probe_keys = thrust::make_transform_iterator(thrust::counting_iterator(0), [] __device__(auto i) { return ProbeKey(i); }); diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu index 506563502..5d5648e71 100644 --- a/tests/static_multimap/insert_if_test.cu +++ b/tests/static_multimap/insert_if_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,7 +24,7 @@ #include #include -#include +#include template __inline__ void test_insert_if(Map& map, PairIt pair_begin, KeyIt key_begin, std::size_t size) @@ -55,7 +55,7 @@ TEMPLATE_TEST_CASE_SIG( constexpr std::size_t num_keys{1'000}; thrust::device_vector d_keys(num_keys); - thrust::device_vector> d_pairs(num_keys); + thrust::device_vector> d_pairs(num_keys); thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); // multiplicity = 1 @@ -64,21 +64,14 @@ TEMPLATE_TEST_CASE_SIG( thrust::counting_iterator(num_keys), d_pairs.begin(), [] __device__(auto i) { - return cuco::pair_type{i, i}; + return cuco::pair{i, i}; }); - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{num_keys * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{ - num_keys * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); - } + using probe = std::conditional_t>, + cuco::double_hashing<8, cuco::default_hash_function>>; + + cuco::static_multimap, probe> + map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; + test_insert_if(map, d_pairs.begin(), d_keys.begin(), num_keys); } diff --git a/tests/static_multimap/multiplicity_test.cu b/tests/static_multimap/multiplicity_test.cu index 3f5581b03..5de83a042 100644 --- a/tests/static_multimap/multiplicity_test.cu +++ b/tests/static_multimap/multiplicity_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,7 +27,7 @@ #include #include -#include +#include template __inline__ void test_multiplicity_two(Map& map, std::size_t num_items) @@ -36,7 +36,7 @@ __inline__ void test_multiplicity_two(Map& map, std::size_t num_items) using Value = typename Map::mapped_type; thrust::device_vector d_keys(num_items / 2); - thrust::device_vector> d_pairs(num_items); + thrust::device_vector> d_pairs(num_items); thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); // multiplicity = 2 @@ -45,10 +45,10 @@ __inline__ void test_multiplicity_two(Map& map, std::size_t num_items) thrust::counting_iterator(num_items), d_pairs.begin(), [] __device__(auto i) { - return cuco::pair_type{i / 2, i}; + return cuco::pair{i / 2, i}; }); - thrust::device_vector> d_results(num_items); + thrust::device_vector> d_results(num_items); auto key_begin = d_keys.begin(); auto pair_begin = d_pairs.begin(); @@ -91,22 +91,22 @@ __inline__ void test_multiplicity_two(Map& map, std::size_t num_items) REQUIRE(size == num_items); // sort before compare - thrust::sort(thrust::device, - d_results.begin(), - d_results.end(), - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { - if (lhs.first != rhs.first) { return lhs.first < rhs.first; } - return lhs.second < rhs.second; - }); - - REQUIRE(cuco::test::equal( - pair_begin, - pair_begin + num_items, - output_begin, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); + thrust::sort( + thrust::device, + d_results.begin(), + d_results.end(), + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + if (lhs.first != rhs.first) { return lhs.first < rhs.first; } + return lhs.second < rhs.second; + }); + + REQUIRE( + cuco::test::equal(pair_begin, + pair_begin + num_items, + output_begin, + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + })); } SECTION("count and count_outer should return the 
same value.") @@ -129,22 +129,22 @@ __inline__ void test_multiplicity_two(Map& map, std::size_t num_items) REQUIRE(size == size_outer); // sort before compare - thrust::sort(thrust::device, - d_results.begin(), - d_results.end(), - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { - if (lhs.first != rhs.first) { return lhs.first < rhs.first; } - return lhs.second < rhs.second; - }); - - REQUIRE(cuco::test::equal( - pair_begin, - pair_begin + num_items, - output_begin, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); + thrust::sort( + thrust::device, + d_results.begin(), + d_results.end(), + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + if (lhs.first != rhs.first) { return lhs.first < rhs.first; } + return lhs.second < rhs.second; + }); + + REQUIRE( + cuco::test::equal(pair_begin, + pair_begin + num_items, + output_begin, + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + })); } } @@ -161,18 +161,11 @@ TEMPLATE_TEST_CASE_SIG( { constexpr std::size_t num_items{4}; - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{5, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_multiplicity_two(map, num_items); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{ - 5, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_multiplicity_two(map, num_items); - } + using probe = std::conditional_t>, + cuco::double_hashing<8, cuco::default_hash_function>>; + + cuco::static_multimap, probe> + map{5, cuco::empty_key{-1}, cuco::empty_value{-1}}; + test_multiplicity_two(map, num_items); } diff --git a/tests/static_multimap/non_match_test.cu b/tests/static_multimap/non_match_test.cu index ef0042012..94023af56 
100644 --- a/tests/static_multimap/non_match_test.cu +++ b/tests/static_multimap/non_match_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ #include #include -#include +#include template __inline__ void test_non_matches(Map& map, PairIt pair_begin, KeyIt key_begin, std::size_t num_keys) @@ -39,77 +39,77 @@ __inline__ void test_non_matches(Map& map, PairIt pair_begin, KeyIt key_begin, s SECTION("Output of count and retrieve should be coherent.") { auto num = map.count(key_begin, key_begin + num_keys); - thrust::device_vector> d_results(num); + thrust::device_vector> d_results(num); REQUIRE(num == num_keys); - auto output_begin = d_results.data().get(); + auto output_begin = d_results.begin(); auto output_end = map.retrieve(key_begin, key_begin + num_keys, output_begin); std::size_t const size = thrust::distance(output_begin, output_end); REQUIRE(size == num_keys); // sort before compare - thrust::sort(thrust::device, - output_begin, - output_end, - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { - if (lhs.first != rhs.first) { return lhs.first < rhs.first; } - return lhs.second < rhs.second; - }); - - REQUIRE(cuco::test::equal( - pair_begin, - pair_begin + num_keys, + thrust::sort( + thrust::device, output_begin, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); + output_end, + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + if (lhs.first != rhs.first) { return lhs.first < rhs.first; } + return lhs.second < rhs.second; + }); + + REQUIRE( + cuco::test::equal(pair_begin, + pair_begin + num_keys, + output_begin, + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + 
})); } SECTION("Output of count_outer and retrieve_outer should be coherent.") { auto num = map.count_outer(key_begin, key_begin + num_keys); - thrust::device_vector> d_results(num); + thrust::device_vector> d_results(num); REQUIRE(num == (num_keys + num_keys / 2)); - auto output_begin = d_results.data().get(); + auto output_begin = d_results.begin(); auto output_end = map.retrieve_outer(key_begin, key_begin + num_keys, output_begin); std::size_t const size = thrust::distance(output_begin, output_end); REQUIRE(size == (num_keys + num_keys / 2)); // sort before compare - thrust::sort(thrust::device, - output_begin, - output_end, - [] __device__(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) { - if (lhs.first != rhs.first) { return lhs.first < rhs.first; } - return lhs.second < rhs.second; - }); + thrust::sort( + thrust::device, + output_begin, + output_end, + [] __device__(const cuco::pair& lhs, const cuco::pair& rhs) { + if (lhs.first != rhs.first) { return lhs.first < rhs.first; } + return lhs.second < rhs.second; + }); // create gold reference - thrust::device_vector> gold(size); + thrust::device_vector> gold(size); auto gold_begin = gold.begin(); thrust::transform(thrust::device, thrust::counting_iterator(0), thrust::counting_iterator(size), gold_begin, [num_keys] __device__(auto i) { - if (i < num_keys) { return cuco::pair_type{i / 2, i}; } - return cuco::pair_type{i - num_keys / 2, -1}; + if (i < num_keys) { return cuco::pair{i / 2, i}; } + return cuco::pair{i - num_keys / 2, -1}; }); - REQUIRE(cuco::test::equal( - gold_begin, - gold_begin + size, - output_begin, - [] __device__(cuco::pair_type lhs, cuco::pair_type rhs) { - return lhs.first == rhs.first and lhs.second == rhs.second; - })); + REQUIRE( + cuco::test::equal(gold_begin, + gold_begin + size, + output_begin, + [] __device__(cuco::pair lhs, cuco::pair rhs) { + return lhs.first == rhs.first and lhs.second == rhs.second; + })); } } @@ -127,7 +127,7 @@ TEMPLATE_TEST_CASE_SIG( constexpr 
std::size_t num_keys{1'000}; thrust::device_vector d_keys(num_keys); - thrust::device_vector> d_pairs(num_keys); + thrust::device_vector> d_pairs(num_keys); thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); // multiplicity = 2 @@ -136,21 +136,18 @@ TEMPLATE_TEST_CASE_SIG( thrust::counting_iterator(num_keys), d_pairs.begin(), [] __device__(auto i) { - return cuco::pair_type{i / 2, i}; + return cuco::pair{i / 2, i}; }); - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{num_keys * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{ - num_keys * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); - } + using probe = std::conditional_t>, + cuco::double_hashing<8, cuco::default_hash_function>>; + + cuco::static_multimap, + cuco::linear_probing<1, cuco::default_hash_function>> + map{num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; + test_non_matches(map, d_pairs.begin(), d_keys.begin(), num_keys); } diff --git a/tests/static_multimap/pair_function_test.cu b/tests/static_multimap/pair_function_test.cu index c5442533b..3ef49377d 100644 --- a/tests/static_multimap/pair_function_test.cu +++ b/tests/static_multimap/pair_function_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,13 +27,13 @@ #include #include -#include +#include // Custom pair equal template struct pair_equal { - __device__ bool operator()(const cuco::pair_type& lhs, - const cuco::pair_type& rhs) const + __device__ bool operator()(const cuco::pair& lhs, + const cuco::pair& rhs) const { return lhs.first == rhs.first; } @@ -43,7 +43,7 @@ template __inline__ void test_pair_functions(Map& map, PairIt pair_begin, std::size_t num_pairs) { map.insert(pair_begin, pair_begin + num_pairs); - cudaStreamSynchronize(0); + CUCO_CUDA_TRY(cudaStreamSynchronize(0)); auto res = map.get_size(); REQUIRE(res == num_pairs); @@ -54,7 +54,7 @@ __inline__ void test_pair_functions(Map& map, PairIt pair_begin, std::size_t num thrust::counting_iterator(num_pairs), pair_begin, [] __device__(auto i) { - return cuco::pair_type{i, i}; + return cuco::pair{i, i}; }); SECTION("pair_contains returns true for all inserted pairs and false for non-inserted ones.") @@ -121,7 +121,7 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int64_t, cuco::test::probe_sequence::double_hashing)) { constexpr std::size_t num_pairs{4}; - thrust::device_vector> d_pairs(num_pairs); + thrust::device_vector> d_pairs(num_pairs); // pair multiplicity = 2 thrust::transform(thrust::device, @@ -129,22 +129,14 @@ TEMPLATE_TEST_CASE_SIG( thrust::counting_iterator(num_pairs), d_pairs.begin(), [] __device__(auto i) { - return cuco::pair_type{i / 2, i}; + return cuco::pair{i / 2, i}; }); - if constexpr (Probe == cuco::test::probe_sequence::linear_probing) { - cuco::static_multimap, - cuco::linear_probing<1, cuco::detail::MurmurHash3_32>> - map{ - num_pairs * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_pair_functions(map, d_pairs.begin(), num_pairs); - } - if constexpr (Probe == cuco::test::probe_sequence::double_hashing) { - cuco::static_multimap map{ - num_pairs * 2, cuco::sentinel::empty_key{-1}, cuco::sentinel::empty_value{-1}}; - test_pair_functions(map, d_pairs.begin(), num_pairs); - } + using probe = 
std::conditional_t>, + cuco::double_hashing<8, cuco::default_hash_function>>; + + cuco::static_multimap, probe> + map{num_pairs * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; + test_pair_functions(map, d_pairs.begin(), num_pairs); } diff --git a/tests/static_set/capacity_test.cu b/tests/static_set/capacity_test.cu new file mode 100644 index 000000000..4c66a7ccc --- /dev/null +++ b/tests/static_set/capacity_test.cu @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +TEST_CASE("Static set capacity", "") +{ + using Key = int32_t; + using ProbeT = cuco::experimental::double_hashing<1, cuco::default_hash_function>; + using Equal = thrust::equal_to; + using AllocatorT = cuco::cuda_allocator; + using StorageT = cuco::experimental::storage<2>; + + SECTION("zero capacity is allowed.") + { + auto constexpr gold_capacity = 4; + + using extent_type = cuco::experimental::extent; + cuco::experimental:: + static_set + set{extent_type{}, cuco::empty_key{-1}}; + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } + + SECTION("negative capacity (ikr -_-||) is also allowed.") + { + auto constexpr gold_capacity = 4; + + using extent_type = cuco::experimental::extent; + cuco::experimental:: + static_set + set{extent_type{-10}, cuco::empty_key{-1}}; + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } + + constexpr std::size_t num_keys{400}; + + SECTION("Dynamic extent is evaluated at run time.") + { + auto constexpr gold_capacity = 422; // 211 x 2 + + using extent_type = cuco::experimental::extent; + cuco::experimental:: + static_set + set{num_keys, cuco::empty_key{-1}}; + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } + + SECTION("Dynamic extent is evaluated at run time.") + { + auto constexpr gold_capacity = 412; // 103 x 2 x 2 + + using probe = cuco::experimental::linear_probing<2, cuco::default_hash_function>; + auto set = cuco::experimental::static_set, + cuda::thread_scope_device, + Equal, + probe, + AllocatorT, + 
StorageT>{num_keys, cuco::empty_key{-1}}; + + auto const capacity = set.capacity(); + REQUIRE(capacity == gold_capacity); + + auto ref = set.ref(cuco::experimental::insert); + auto const ref_capacity = ref.capacity(); + REQUIRE(ref_capacity == gold_capacity); + } +} diff --git a/tests/static_set/heterogeneous_lookup_test.cu b/tests/static_set/heterogeneous_lookup_test.cu new file mode 100644 index 000000000..cbc0efac3 --- /dev/null +++ b/tests/static_set/heterogeneous_lookup_test.cu @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +// insert key type +template +struct key_pair { + T a; + T b; + + __host__ __device__ key_pair() {} + __host__ __device__ key_pair(T x) : a{x}, b{x} {} + + // Device equality operator is mandatory due to libcudacxx bug: + // https://github.com/NVIDIA/libcudacxx/issues/223 + __device__ bool operator==(key_pair const& other) const { return a == other.a and b == other.b; } +}; + +// probe key type +template +struct key_triplet { + T a; + T b; + T c; + + __host__ __device__ key_triplet() {} + __host__ __device__ key_triplet(T x) : a{x}, b{x}, c{x} {} + + // Device equality operator is mandatory due to libcudacxx bug: + // https://github.com/NVIDIA/libcudacxx/issues/223 + __device__ bool operator==(key_triplet const& other) const + { + return a == other.a and b == other.b and c == other.c; + } +}; + +// User-defined device hasher +struct custom_hasher { + template + __device__ uint32_t operator()(CustomKey const& k) const + { + return thrust::raw_reference_cast(k).a; + }; +}; + +// User-defined device key equality +struct custom_key_equal { + template + __device__ bool operator()(LHS const& lhs, RHS const& rhs) const + { + return thrust::raw_reference_cast(lhs).a == thrust::raw_reference_cast(rhs).a; + } +}; + +TEMPLATE_TEST_CASE_SIG( + "Heterogeneous lookup", "", ((typename T, int CGSize), T, CGSize), (int32_t, 1), (int32_t, 2)) +{ + using Key = key_pair; + using ProbeKey = key_triplet; + using probe_type = cuco::experimental::double_hashing; + + auto const sentinel_key = Key{-1}; + + constexpr std::size_t num = 100; + constexpr std::size_t capacity = num * 2; + auto const probe = probe_type{custom_hasher{}, custom_hasher{}}; + auto my_set = cuco::experimental::static_set, + cuda::thread_scope_device, + custom_key_equal, + probe_type>{ + capacity, cuco::empty_key{sentinel_key}, custom_key_equal{}, probe}; + + auto insert_pairs = 
thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return Key{i}; }); + auto probe_keys = thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return ProbeKey(i); }); + + SECTION("All inserted keys should be contained") + { + thrust::device_vector contained(num); + my_set.insert(insert_pairs, insert_pairs + num); + my_set.contains(probe_keys, probe_keys + num, contained.begin()); + REQUIRE(cuco::test::all_of(contained.begin(), contained.end(), thrust::identity{})); + } + + SECTION("Non-inserted keys should not be contained") + { + thrust::device_vector contained(num); + my_set.contains(probe_keys, probe_keys + num, contained.begin()); + REQUIRE(cuco::test::none_of(contained.begin(), contained.end(), thrust::identity{})); + } +} diff --git a/tests/static_set/insert_and_find_test.cu b/tests/static_set/insert_and_find_test.cu new file mode 100644 index 000000000..278510e08 --- /dev/null +++ b/tests/static_set/insert_and_find_test.cu @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include + +#include + +template +__inline__ void test_insert_and_find(Set& set, std::size_t num_keys) +{ + using Key = typename Set::key_type; + static auto constexpr cg_size = Set::cg_size; + + auto const keys_begin = [&]() { + if constexpr (cg_size == 1) { + return thrust::counting_iterator(0); + } else { + return thrust::make_transform_iterator(thrust::counting_iterator(0), + [] __device__(auto i) { return i / cg_size; }); + } + }(); + auto const keys_end = [&]() { + if constexpr (cg_size == 1) { + return keys_begin + num_keys; + } else { + return keys_begin + num_keys * cg_size; + } + }(); + + auto ref = set.ref(cuco::experimental::op::insert_and_find); + + REQUIRE(cuco::test::all_of(keys_begin, keys_end, [ref] __device__(Key key) mutable { + auto [iter, inserted] = [&]() { + if constexpr (cg_size == 1) { + return ref.insert_and_find(key); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + return ref.insert_and_find(tile, key); + } + }(); + return inserted == true; + })); + + SECTION("Inserting elements for the second time will always fail.") + { + REQUIRE(cuco::test::all_of(keys_begin, keys_end, [ref] __device__(Key key) mutable { + auto [iter, inserted] = [&]() { + if constexpr (cg_size == 1) { + return ref.insert_and_find(key); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + return ref.insert_and_find(tile, key); + } + }(); + return inserted == false and key == *iter; + })); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Insert and find", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, 
cuco::test::probe_sequence::linear_probing, 1), + (int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr std::size_t num_keys{400}; + + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::experimental::linear_probing>, + cuco::experimental::double_hashing>>; + + auto set = cuco::experimental::static_set, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::experimental::storage<2>>{ + num_keys, cuco::empty_key{-1}}; + test_insert_and_find(set, num_keys); +} diff --git a/tests/static_set/large_input_test.cu b/tests/static_set/large_input_test.cu new file mode 100644 index 000000000..5015ca750 --- /dev/null +++ b/tests/static_set/large_input_test.cu @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +template +__inline__ void test_unique_sequence(Set& set, bool* res_begin, std::size_t num_keys) +{ + using Key = typename Set::key_type; + + auto const keys_begin = thrust::counting_iterator(0); + auto const keys_end = thrust::counting_iterator(num_keys); + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(set.size() == 0); + + set.contains(keys_begin, keys_end, res_begin); + REQUIRE(cuco::test::none_of(res_begin, res_begin + num_keys, thrust::identity{})); + } + + set.insert(keys_begin, keys_end); + REQUIRE(set.size() == num_keys); + + SECTION("All inserted key/value pairs should be contained.") + { + set.contains(keys_begin, keys_end, res_begin); + REQUIRE(cuco::test::all_of(res_begin, res_begin + num_keys, thrust::identity{})); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Large input", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2)) +{ + constexpr std::size_t num_keys{1'200'000'000}; + + using extent_type = cuco::experimental::extent; + using probe = cuco::experimental::double_hashing>; + + try { + auto set = cuco::experimental:: + static_set, probe>{ + num_keys * 2, cuco::empty_key{-1}}; + + thrust::device_vector d_contained(num_keys); + test_unique_sequence(set, d_contained.data().get(), num_keys); + } catch (cuco::cuda_error&) { + SKIP("Out of memory"); + } catch (std::bad_alloc&) { + SKIP("Out of memory"); + } +} diff --git a/tests/static_set/retrieve_all_test.cu b/tests/static_set/retrieve_all_test.cu new file mode 100644 index 000000000..616e35138 --- /dev/null +++ b/tests/static_set/retrieve_all_test.cu @@ -0,0 +1,95 @@ +/* + * Copyright (c) 
2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +template +__inline__ void test_unique_sequence(Set& set, std::size_t num_keys) +{ + using Key = typename Set::key_type; + + thrust::device_vector d_keys(num_keys); + thrust::sequence(d_keys.begin(), d_keys.end()); + auto keys_begin = d_keys.begin(); + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(set.size() == 0); + + auto keys_end = set.retrieve_all(keys_begin); + REQUIRE(std::distance(keys_begin, keys_end) == 0); + } + + set.insert(keys_begin, keys_begin + num_keys); + REQUIRE(set.size() == num_keys); + + SECTION("All inserted key/value pairs should be contained.") + { + thrust::device_vector d_res(num_keys); + auto d_res_end = set.retrieve_all(d_res.begin()); + thrust::sort(d_res.begin(), d_res_end); + REQUIRE(cuco::test::equal( + d_res.begin(), d_res_end, thrust::counting_iterator(0), thrust::equal_to{})); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Retrieve all", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, 
cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr std::size_t num_keys{400}; + auto constexpr gold_capacity = CGSize == 1 ? 409 // 409 x 1 x 1 + : 422 // 211 x 2 x 1 + ; + + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::experimental::linear_probing>, + cuco::experimental::double_hashing>>; + + auto set = cuco::experimental::static_set, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::experimental::storage<1>>{ + num_keys, cuco::empty_key{-1}}; + + REQUIRE(set.capacity() == gold_capacity); + + test_unique_sequence(set, num_keys); +} diff --git a/tests/static_set/size_test.cu b/tests/static_set/size_test.cu new file mode 100644 index 000000000..2e2bfd6c2 --- /dev/null +++ b/tests/static_set/size_test.cu @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include + +TEST_CASE("Size computation", "") +{ + constexpr std::size_t num_keys{400}; + + cuco::experimental::static_set set{cuco::experimental::extent{400}, + cuco::empty_key{-1}}; + + thrust::device_vector d_keys(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + + auto const num_successes = set.insert(d_keys.begin(), d_keys.end()); + + REQUIRE(set.size() == num_keys); + REQUIRE(num_successes == num_keys); + + set.clear(); + + REQUIRE(set.size() == 0); +} diff --git a/tests/static_set/unique_sequence_test.cu b/tests/static_set/unique_sequence_test.cu new file mode 100644 index 000000000..53ede7524 --- /dev/null +++ b/tests/static_set/unique_sequence_test.cu @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using size_type = int32_t; + +template +__inline__ void test_unique_sequence(Set& set, size_type num_keys) +{ + using Key = typename Set::key_type; + + thrust::device_vector d_keys(num_keys); + + thrust::sequence(thrust::device, d_keys.begin(), d_keys.end()); + + auto keys_begin = d_keys.begin(); + thrust::device_vector d_contained(num_keys); + + auto zip_equal = [] __device__(auto const& p) { return thrust::get<0>(p) == thrust::get<1>(p); }; + auto is_even = [] __device__(auto const& i) { return i % 2 == 0; }; + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(set.size() == 0); + + set.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } + + SECTION("Non-inserted keys have no matches") + { + thrust::device_vector d_results(num_keys); + + set.find(keys_begin, keys_begin + num_keys, d_results.begin()); + auto zip = thrust::make_zip_iterator(thrust::make_tuple( + d_results.begin(), thrust::constant_iterator{set.empty_key_sentinel()})); + + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } + + SECTION("All conditionally inserted keys should be contained") + { + auto const inserted = set.insert_if( + keys_begin, keys_begin + num_keys, thrust::counting_iterator(0), is_even); + REQUIRE(inserted == num_keys / 2); + REQUIRE(set.size() == num_keys / 2); + + set.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::equal(d_contained.begin(), + d_contained.end(), + thrust::counting_iterator(0), + [] __device__(auto const& idx_contained, auto const& idx) { + return ((idx % 2) == 0) == idx_contained; + })); + } + + set.insert(keys_begin, keys_begin + num_keys); + REQUIRE(set.size() == num_keys); + + SECTION("All inserted keys should be contained.") + { + set.contains(keys_begin, 
keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), thrust::identity{})); + } + + SECTION("Conditional contains should return true on even inputs.") + { + set.contains_if(keys_begin, + keys_begin + num_keys, + thrust::counting_iterator(0), + is_even, + d_contained.begin()); + auto gold_iter = + thrust::make_transform_iterator(thrust::counting_iterator(0), is_even); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_contained.begin(), gold_iter)); + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } + + SECTION("All inserted keys should be correctly recovered during find") + { + thrust::device_vector d_results(num_keys); + + set.find(keys_begin, keys_begin + num_keys, d_results.begin()); + auto zip = thrust::make_zip_iterator(thrust::make_tuple(d_results.begin(), keys_begin)); + + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } +} + +TEMPLATE_TEST_CASE_SIG( + "Unique sequence", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr size_type num_keys{400}; + constexpr size_type gold_capacity = CGSize == 1 ? 
422 // 211 x 1 x 2 + : 412 // 103 x 2 x 2 + ; + + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::experimental::linear_probing>, + cuco::experimental::double_hashing>>; + + auto set = cuco::experimental::static_set, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::experimental::storage<2>>{ + num_keys, cuco::empty_key{-1}}; + + REQUIRE(set.capacity() == gold_capacity); + + test_unique_sequence(set, num_keys); +} diff --git a/tests/utility/extent_test.cu b/tests/utility/extent_test.cu new file mode 100644 index 000000000..d44e20368 --- /dev/null +++ b/tests/utility/extent_test.cu @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include + +TEMPLATE_TEST_CASE_SIG( + "Extent tests", "", ((typename SizeType), SizeType), (int32_t), (int64_t), (std::size_t)) +{ + SizeType constexpr num = 1234; + SizeType constexpr gold_reference = 314; // 157 x 2 + auto constexpr cg_size = 2; + auto constexpr window_size = 4; + + SECTION("Static extent must be evaluated at compile time.") + { + auto const size = cuco::experimental::extent{}; + STATIC_REQUIRE(num == size); + } + + SECTION("Dynamic extent is evaluated at run time.") + { + auto const size = cuco::experimental::extent(num); + REQUIRE(size == num); + } + + SECTION("Compute static valid extent at compile time.") + { + auto constexpr size = cuco::experimental::extent{}; + auto constexpr res = cuco::experimental::make_window_extent(size); + STATIC_REQUIRE(gold_reference == res.value()); + } + + SECTION("Compute dynamic valid extent at run time.") + { + auto const size = cuco::experimental::extent{num}; + auto const res = cuco::experimental::make_window_extent(size); + REQUIRE(gold_reference == res.value()); + } +} diff --git a/tests/utility/fast_int_test.cu b/tests/utility/fast_int_test.cu new file mode 100644 index 000000000..c780293f9 --- /dev/null +++ b/tests/utility/fast_int_test.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include + +#include +#include + +TEMPLATE_TEST_CASE( + "utility::fast_int tests", "", std::int32_t, std::uint32_t, std::int64_t, std::uint64_t) +{ + TestType value = GENERATE(1, 2, 9, 32, 4123, 8192, 4312456); + TestType lhs = GENERATE(1, 2, 9, 32, 4123, 8192, 4312456); + constexpr auto max_value = std::numeric_limits::max(); + + cuco::utility::fast_int fast_value{value}; + + SECTION("Should be explicitly convertible to the underlying integer type.") + { + REQUIRE(static_cast(fast_value) == value); + } + + SECTION("Fast div/mod should produce correct result.") + { + INFO(lhs << " /% " << value); + REQUIRE(lhs / fast_value == lhs / value); + REQUIRE(lhs % fast_value == lhs % value); + } + + SECTION("Fast div/mod with maximum rhs value should produce correct result.") + { + INFO(lhs << " /% " << max_value); + cuco::utility::fast_int fast_max{max_value}; + REQUIRE(lhs / fast_max == lhs / max_value); + REQUIRE(lhs % fast_max == lhs % max_value); + } + + SECTION("Fast div/mod with maximum lhs value should produce correct result.") + { + INFO(max_value << " /% " << value); + REQUIRE(max_value / fast_value == max_value / value); + REQUIRE(max_value % fast_value == max_value % value); + } +} diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu new file mode 100644 index 000000000..3e8880860 --- /dev/null +++ b/tests/utility/hash_test.cu @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +#include +#include + +#include + +template +struct large_key { + constexpr __host__ __device__ large_key(int32_t value) noexcept + { + for (int32_t i = 0; i < Words; ++i) { + data_[i] = value; + } + } + + private: + int32_t data_[Words]; +}; + +template +__host__ __device__ bool check_hash_result(typename Hash::argument_type const& key, + typename Hash::result_type seed, + typename Hash::result_type expected) noexcept +{ + Hash h(seed); + return (h(key) == expected); +} + +template +__global__ void check_hash_result_kernel_64(OutputIter result) +{ + int i = 0; + + result[i++] = check_hash_result>(0, 0, 16804241149081757544); + result[i++] = check_hash_result>(42, 0, 765293966243412708); + result[i++] = check_hash_result>(0, 42, 9486749600008296231); + + result[i++] = check_hash_result>(0, 0, 4246796580750024372); + result[i++] = check_hash_result>(0, 42, 3614696996920510707); + result[i++] = check_hash_result>(42, 0, 15516826743637085169); + result[i++] = check_hash_result>(123456789, 0, 9462334144942111946); + + result[i++] = check_hash_result>(0, 0, 3803688792395291579); + result[i++] = check_hash_result>(0, 42, 13194218611613725804); + result[i++] = check_hash_result>(42, 0, 13066772586158965587); + result[i++] = check_hash_result>(123456789, 0, 14662639848940634189); + +#if defined(CUCO_HAS_INT128) + result[i++] = check_hash_result>(123456789, 0, 7986913354431084250); +#endif + + result[i++] = + check_hash_result>>(123456789, 0, 2031761887105658523); +} + +TEST_CASE("Test cuco::xxhash_64", "") +{ + // Reference hash values were computed using https://github.com/Cyan4973/xxHash + SECTION("Check if host-generated hash values match the reference implementation.") + { + CHECK(check_hash_result>(0, 0, 16804241149081757544)); + CHECK(check_hash_result>(42, 0, 765293966243412708)); + 
CHECK(check_hash_result>(0, 42, 9486749600008296231)); + + CHECK(check_hash_result>(0, 0, 4246796580750024372)); + CHECK(check_hash_result>(0, 42, 3614696996920510707)); + CHECK(check_hash_result>(42, 0, 15516826743637085169)); + CHECK(check_hash_result>(123456789, 0, 9462334144942111946)); + + CHECK(check_hash_result>(0, 0, 3803688792395291579)); + CHECK(check_hash_result>(0, 42, 13194218611613725804)); + CHECK(check_hash_result>(42, 0, 13066772586158965587)); + CHECK(check_hash_result>(123456789, 0, 14662639848940634189)); + +#if defined(CUCO_HAS_INT128) + CHECK(check_hash_result>(123456789, 0, 7986913354431084250)); +#endif + + // 32*4=128-byte key to test the pipelined outermost hashing loop + CHECK(check_hash_result>>(123456789, 0, 2031761887105658523)); + } + + SECTION("Check if device-generated hash values match the reference implementation.") + { + thrust::device_vector result(10); + + check_hash_result_kernel_64<<<1, 1>>>(result.begin()); + + CHECK(cuco::test::all_of(result.begin(), result.end(), [] __device__(bool v) { return v; })); + } +} + +template +__global__ void check_hash_result_kernel_32(OutputIter result) +{ + int i = 0; + + result[i++] = check_hash_result>(0, 0, 3479547966); + result[i++] = check_hash_result>(42, 0, 3774771295); + result[i++] = check_hash_result>(0, 42, 2099223482); + + result[i++] = check_hash_result>(0, 0, 148298089); + result[i++] = check_hash_result>(0, 42, 2132181312); + result[i++] = check_hash_result>(42, 0, 1161967057); + result[i++] = check_hash_result>(123456789, 0, 2987034094); + + result[i++] = check_hash_result>(0, 0, 3736311059); + result[i++] = check_hash_result>(0, 42, 1076387279); + result[i++] = check_hash_result>(42, 0, 2332451213); + result[i++] = check_hash_result>(123456789, 0, 1561711919); + +#if defined(CUCO_HAS_INT128) + result[i++] = check_hash_result>(123456789, 0, 1846633701); +#endif + + result[i++] = check_hash_result>>(123456789, 0, 3715432378); +} + +TEST_CASE("Test cuco::xxhash_32", "") +{ + // 
Reference hash values were computed using https://github.com/Cyan4973/xxHash + SECTION("Check if host-generated hash values match the reference implementation.") + { + CHECK(check_hash_result>(0, 0, 3479547966)); + CHECK(check_hash_result>(42, 0, 3774771295)); + CHECK(check_hash_result>(0, 42, 2099223482)); + + CHECK(check_hash_result>(0, 0, 148298089)); + CHECK(check_hash_result>(0, 42, 2132181312)); + CHECK(check_hash_result>(42, 0, 1161967057)); + CHECK(check_hash_result>(123456789, 0, 2987034094)); + + CHECK(check_hash_result>(0, 0, 3736311059)); + CHECK(check_hash_result>(0, 42, 1076387279)); + CHECK(check_hash_result>(42, 0, 2332451213)); + CHECK(check_hash_result>(123456789, 0, 1561711919)); + +#if defined(CUCO_HAS_INT128) + CHECK(check_hash_result>(123456789, 0, 1846633701)); +#endif + + // 32*4=128-byte key to test the pipelined outermost hashing loop + CHECK(check_hash_result>>(123456789, 0, 3715432378)); + } + + SECTION("Check if device-generated hash values match the reference implementation.") + { + thrust::device_vector result(20, true); + + check_hash_result_kernel_32<<<1, 1>>>(result.begin()); + + CHECK(cuco::test::all_of(result.begin(), result.end(), [] __device__(bool v) { return v; })); + } +} + +TEMPLATE_TEST_CASE_SIG("Static vs. 
dynamic key hash test", + "", + ((typename Hash), Hash), + (cuco::murmurhash3_32), + (cuco::murmurhash3_32), + (cuco::xxhash_32), + (cuco::xxhash_32), + (cuco::xxhash_64), + (cuco::xxhash_64)) +{ + using key_type = typename Hash::argument_type; + + Hash hash; + key_type key = 42; + + SECTION("Identical keys with static and dynamic key size should have the same hash value.") + { + CHECK(hash(key) == + hash.compute_hash(reinterpret_cast(&key), sizeof(key_type))); + } +} \ No newline at end of file diff --git a/tests/utility/storage_test.cu b/tests/utility/storage_test.cu new file mode 100644 index 000000000..b776f628c --- /dev/null +++ b/tests/utility/storage_test.cu @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include + +#include + +TEMPLATE_TEST_CASE_SIG("Storage tests", + "", + ((typename Key, typename Value), Key, Value), + (int32_t, int32_t), + (int32_t, int64_t), + (int64_t, int64_t)) +{ + constexpr std::size_t size{1'000}; + constexpr int window_size{2}; + constexpr std::size_t gold_capacity{2'000}; + + using allocator_type = cuco::cuda_allocator; + auto allocator = allocator_type{}; + + SECTION("Allocate array of pairs with AoS storage.") + { + auto s = + cuco::experimental::aow_storage, + window_size, + cuco::experimental::extent, + allocator_type>(cuco::experimental::extent{size}, allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + REQUIRE(num_windows == size); + REQUIRE(capacity == gold_capacity); + } + + SECTION("Allocate array of pairs with AoS storage with static extent.") + { + using extent_type = cuco::experimental::extent; + auto s = cuco::experimental:: + aow_storage, window_size, extent_type, allocator_type>(extent_type{}, + allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + STATIC_REQUIRE(num_windows == size); + STATIC_REQUIRE(capacity == gold_capacity); + } + + SECTION("Allocate array of keys with AoS storage.") + { + auto s = cuco::experimental:: + aow_storage, allocator_type>( + cuco::experimental::extent{size}, allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + REQUIRE(num_windows == size); + REQUIRE(capacity == gold_capacity); + } + + SECTION("Allocate array of keys with AoS storage with static extent.") + { + using extent_type = cuco::experimental::extent; + auto s = cuco::experimental::aow_storage( + extent_type{}, allocator); + auto const num_windows = s.num_windows(); + auto const capacity = s.capacity(); + + STATIC_REQUIRE(num_windows == size); + STATIC_REQUIRE(capacity == gold_capacity); + } +} diff --git a/tests/utils.hpp b/tests/utils.hpp index 
dd2f6545f..3325027a9 100644 --- a/tests/utils.hpp +++ b/tests/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,10 +18,14 @@ #include +#include + #include #include +#include + namespace cuco { namespace test { @@ -35,23 +39,23 @@ enum class probe_sequence { linear_probing, double_hashing }; template int count_if(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0) { - auto const size = end - begin; + auto const size = std::distance(begin, end); auto const grid_size = (size + block_size - 1) / block_size; int* count; - cudaMallocManaged(&count, sizeof(int)); + CUCO_CUDA_TRY(cudaMallocManaged(&count, sizeof(int))); *count = 0; int device_id; - cudaGetDevice(&device_id); - cudaMemPrefetchAsync(count, sizeof(int), device_id, stream); + CUCO_CUDA_TRY(cudaGetDevice(&device_id)); + CUCO_CUDA_TRY(cudaMemPrefetchAsync(count, sizeof(int), device_id, stream)); detail::count_if<<>>(begin, end, count, p); - cudaStreamSynchronize(stream); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - auto res = *count; + auto const res = *count; - cudaFree(count); + CUCO_CUDA_TRY(cudaFree(count)); return res; } @@ -59,7 +63,7 @@ int count_if(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0) template bool all_of(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0) { - auto const size = end - begin; + auto const size = std::distance(begin, end); auto const count = count_if(begin, end, p, stream); return size == count; @@ -81,23 +85,23 @@ bool none_of(Iterator begin, Iterator end, Predicate p, cudaStream_t stream = 0) template bool equal(Iterator1 begin1, Iterator1 end1, Iterator2 begin2, Predicate p, cudaStream_t stream = 0) { - auto const size = end1 - begin1; + auto const size = std::distance(begin1, end1); auto const grid_size = 
(size + block_size - 1) / block_size; int* count; - cudaMallocManaged(&count, sizeof(int)); + CUCO_CUDA_TRY(cudaMallocManaged(&count, sizeof(int))); *count = 0; int device_id; - cudaGetDevice(&device_id); - cudaMemPrefetchAsync(count, sizeof(int), device_id, stream); + CUCO_CUDA_TRY(cudaGetDevice(&device_id)); + CUCO_CUDA_TRY(cudaMemPrefetchAsync(count, sizeof(int), device_id, stream)); detail::count_if<<>>(begin1, end1, begin2, count, p); - cudaStreamSynchronize(stream); + CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); - auto res = *count; + auto const res = *count; - cudaFree(count); + CUCO_CUDA_TRY(cudaFree(count)); return res == size; }