diff --git a/review/pr-201/.buildinfo b/review/pr-201/.buildinfo new file mode 100644 index 000000000..4b3662696 --- /dev/null +++ b/review/pr-201/.buildinfo @@ -0,0 +1,4 @@ +# Sphinx build info version 1 +# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. +config: 0bc3eaaa85f3864066eeb413040cb61a +tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/review/pr-201/.nojekyll b/review/pr-201/.nojekyll new file mode 100644 index 000000000..e69de29bb diff --git a/review/pr-201/CONTRIBUTING.html b/review/pr-201/CONTRIBUTING.html new file mode 100644 index 000000000..ee1e51d17 --- /dev/null +++ b/review/pr-201/CONTRIBUTING.html @@ -0,0 +1,182 @@ + + + + + + Contributing — Merlin Key-Value Storage documentation + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + +
  • +
  • +
+
+
+
+
+ +
+

Contributing

+
+

About HierarchicalKV

+

HierarchicalKV is a part of NVIDIA Merlin and provides hierarchical key-value storage to meet RecSys requirements.

+

The key capability of HierarchicalKV is to store key-value pairs (feature-embedding pairs) on the high-bandwidth memory (HBM) of GPUs and in host memory.

+

You can also use the library for generic key-value storage.

+
+
+

Maintainership

+

HierarchicalKV is co-maintained by NVIDIA Merlin Team and NVIDIA product end-users, +and is also open to public contributions, bug fixes, and documentation. This project adheres to NVIDIA’s Code of Conduct.

+
+
+

Contributing

+

We’re grateful for your interest in HierarchicalKV and value your contributions. +We welcome contributions via pull requests (PRs).

+

Before sending out a pull request for a significant change to the end-user API, we recommend that you open an issue and +discuss your proposed change. Some changes may require a design review. +All submissions require review by project reviewers.

+
+

Coding Style

+

Refer to the Style Guide

+
+
+

Additional Requirements

+

In addition to the above requirements, contributions also need to meet the following criteria:

+
    +
  • The change needs to include unit tests and integration tests, if applicable.

  • +
  • Each PR needs to provide the necessary documentation on when and how to use the change.

  • +
+
+
+
+

Community

+ +
+
+

License

+

Apache License 2.0

+
+
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/review/pr-201/README.html b/review/pr-201/README.html new file mode 100644 index 000000000..481985c8c --- /dev/null +++ b/review/pr-201/README.html @@ -0,0 +1,871 @@ + + + + + + NVIDIA HierarchicalKV(Beta) — Merlin Key-Value Storage documentation + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+
    +
  • + +
  • +
  • +
+
+
+
+
+ +
+

NVIDIA HierarchicalKV(Beta)

+

Version +GitHub License +Documentation

+
+

About HierarchicalKV

+

HierarchicalKV is a part of NVIDIA Merlin and provides hierarchical key-value storage to meet RecSys requirements.

+

The key capability of HierarchicalKV is to store key-value pairs (feature-embedding pairs) on the high-bandwidth memory (HBM) of GPUs and in host memory.

+

You can also use the library for generic key-value storage.

+
+
+

Benefits

+

When building large recommender systems, machine learning (ML) engineers face the following challenges:

+
    +
  • GPUs are needed, but HBM on a single GPU is too small for the large DLRMs that scale to several terabytes.

  • +
  • Improving communication performance is getting more difficult in larger and larger CPU clusters.

  • +
  • It is difficult to efficiently control consumption growth of limited HBM with customized strategies.

  • +
  • Most generic key-value libraries provide low HBM and host memory utilization.

  • +
+

HierarchicalKV alleviates these challenges and helps machine learning engineers in RecSys with the following benefits:

+
    +
  • Supports training large RecSys models on HBM and host memory at the same time.

  • +
  • Provides better performance by fully bypassing CPUs and reducing the communication workload.

  • +
  • Implements table-size restraint strategies based on LRU or customized strategies. +The strategies are implemented by CUDA kernels.

  • +
  • Operates at a high working-status load factor that is close to 1.0.

  • +
+
+
+

Key ideas

+
    +
  • Buckets are locally ordered

  • +
  • Store keys and values separately

  • +
  • Store all the keys in HBM

  • +
  • Built-in and customizable eviction strategies

  • +
+

HierarchicalKV makes NVIDIA GPUs more suitable for training large and super-large models of search, recommendations, and advertising. +The library simplifies the common challenges of building, evaluating, and serving sophisticated recommender models.

+
+
+

API Documentation

+

The main classes and structs are below, but reading the comments in the source code is recommended:

+ +

For the regular API documentation, please refer to the API Docs

+
+

API Maturity Matrix

+

industry-validated means the API has been well-tested and verified in at least one real-world scenario. A brief usage sketch for one of these calls follows the table.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Name

Description

Maturity

insert_or_assign

Insert or assign for the specified keys.
Overwrite one key with the minimum score when the bucket is full.

industry-validated

insert_and_evict

Insert new keys, and evict keys with the minimum score when the bucket is full.

industry-validated

find_or_insert

Search for the specified keys, and insert them when missed.

well-tested

assign

Update for each key and bypass when missed.

well-tested

accum_or_assign

Search and update for each key. If found, add value as a delta to the original value.
If missed, update it directly.

well-tested

find_or_insert*

Search for the specified keys and return pointers to the values. Insert the keys first when they are missing.

well-tested

find

Search for the specified keys.

industry-validated

find*

Search and return pointers to the values; thread-unsafe but high performance.

well-tested

export_batch

Exports a certain number of the key-value-score tuples.

industry-validated

export_batch_if

Exports a certain number of the key-value-score tuples which match specific conditions.

industry-validated

warmup

Move the hot key-values from HMEM to HBM

June 15, 2023

+
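As a concrete illustration, the hedged sketch below drives insert_and_evict, whose batched, device-pointer style is shared by most of the APIs above. It assumes a table built as in the "How to use" section later on this page; the buffer names and sizes are illustrative, the member-function signature follows the HashTable declaration in merlin_hashtable.cuh, and the implementation only allows insert_and_evict in pure HBM mode.
// Hedged sketch: all d_* buffers are assumed to be device allocations sized for a
// batch of n keys; the overload used here returns the number of evicted entries.
using K = uint64_t; using V = float; using S = uint64_t;
size_t n = 1024 * 1024, dim = 16;
K *d_keys, *d_evicted_keys;
V *d_values, *d_evicted_values;
S *d_evicted_scores;
cudaMalloc(&d_keys, n * sizeof(K));
cudaMalloc(&d_values, n * dim * sizeof(V));
cudaMalloc(&d_evicted_keys, n * sizeof(K));
cudaMalloc(&d_evicted_values, n * dim * sizeof(V));
cudaMalloc(&d_evicted_scores, n * sizeof(S));
// ... fill d_keys / d_values on the device ...
size_t n_evicted = table->insert_and_evict(
    n, d_keys, d_values, /*scores=*/nullptr,
    d_evicted_keys, d_evicted_values, d_evicted_scores);
// The first n_evicted entries of the evicted_* buffers are valid once the call returns.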
+
+

Evict Strategy

+

The score defines the importance of each key: the larger the score, the more important the key, and the less likely it is to be evicted. Eviction only happens when a bucket is full. +The score_type must be uint64_t. For more detail, please refer to class EvictStrategy. A short usage sketch follows the notes below.

+ + + + + + + + + + + + + + + + + + + + + + + +

Name

Definition of Score

Lru

Device clock in nanoseconds, which could differ slightly from the host clock.

Lfu

Frequency increment provided by the caller via the scores parameter of the insert-like APIs.

EpochLru

The high 32 bits are the global epoch provided via the input parameter global_epoch,
the low 32 bits are equal to (device_clock >> 20) & 0xffffffff, with granularity close to 1 ms.

EpochLfu

The high 32 bits are the global epoch provided via the input parameter global_epoch,
the low 32 bits are the frequency,
the frequency stays constant after reaching the maximum value of 0xffffffff.

Customized

Fully provided by the caller via the scores parameter of the insert-like APIs.

+
    +
  • Note:

    +
      +
    • The insert-like APIs are insert_or_assign, insert_and_evict, find_or_insert, accum_or_assign, and find_or_insert*.

    • +
    • The global_epoch should be maintained by the caller and passed as an input parameter to the insert-like APIs.

    • +
    +
  • +
+
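To make the mapping between strategy and scores concrete, here is a hedged sketch (everything other than the library types is an assumption): the strategy is fixed at compile time through the fourth template argument of nv::merlin::HashTable, and score-driven strategies receive their scores through the scores parameter of the insert-like APIs.
using K = uint64_t; using V = float; using S = uint64_t;

// LRU: the score is derived from the device clock, so callers may pass nullptr.
using LruTable = nv::merlin::HashTable<K, V, S, nv::merlin::EvictStrategy::kLru>;

// Customized: the caller supplies the whole score; larger means less likely to be evicted.
using CustomTable = nv::merlin::HashTable<K, V, S, nv::merlin::EvictStrategy::kCustomized>;

// After init(options), with d_keys / d_values / d_scores resident on the device:
// lru_table->insert_or_assign(n, d_keys, d_values, /*scores=*/nullptr, stream);
// custom_table->insert_or_assign(n, d_keys, d_values, d_scores, stream);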
+
+

Configuration Options

+

It’s recommended to keep the default configuration for the options ending with *. A short sizing sketch follows the table.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Name

Type

Default

Description

init_capacity

size_t

0

The initial capacity of the hash table.

max_capacity

size_t

0

The maximum capacity of the hash table.

max_hbm_for_vectors

size_t

0

The maximum HBM for vectors, in bytes.

dim

size_t

64

The dimension of the value vectors.

max_bucket_size*

size_t

128

The length of each bucket.

max_load_factor*

float

0.5f

The max load factor before rehashing.

block_size*

int

128

The default block size for CUDA kernels.

io_block_size*

int

1024

The block size for IO CUDA kernels.

device_id*

int

-1

The ID of the device. Managed internally when set to -1.

io_by_cpu*

bool

false

The flag indicating if the CPU handles IO.

reserved_key_start_bit

int

0

The start bit offset of the reserved keys within the 64-bit key.

+ +
+
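As a rule of thumb implied by the benchmark configurations later on this page, the total bytes needed by all value vectors is capacity * dim * sizeof(V); setting max_hbm_for_vectors below that total leaves the remainder in host memory (the HBM+HMEM hybrid mode), while a value at or above the total keeps all vectors in HBM. The hedged sketch below reproduces the 128 Million-KV, dim = 64 hybrid configuration from the benchmarks; the numbers are illustrative only.
// Hedged sizing sketch; only the option names come from HashTableOptions.
TableOptions options;
options.init_capacity       = 128 * 1024 * 1024;   // 128 Million-KV
options.max_capacity        = options.init_capacity;
options.dim                 = 64;
// Total vector storage: 128 Mi * 64 * sizeof(float) = 32 GB.
options.max_hbm_for_vectors = nv::merlin::GB(16);  // 16 GB stays in HBM, the rest spills to host memory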

Reserved Keys

+
    +
  • By default, the keys 0xFFFFFFFFFFFFFFFD, 0xFFFFFFFFFFFFFFFE, and 0xFFFFFFFFFFFFFFFF are reserved for internal use. +Change options.reserved_key_start_bit if you want to use the above keys. +reserved_key_start_bit has a valid range from 0 to 62. The default value is 0, which corresponds to the default reserved keys above. When reserved_key_start_bit is set to any value other than 0, the least significant bit (bit 0) is always 0 for any reserved key.

  • +
  • Setting reserved_key_start_bit = 1:

    +
      +
    • This setting reserves bits 1 and 2 (the two lowest bits above bit 0) for the reserved keys.

    • +
    • In binary, the last four bits range from 1000 to 1110. Here, the least significant bit (bit 0) is always 0, and bits from 3 to 63 are set to 1.

    • +
    • The new reserved keys in hexadecimal representation are as follows:

      +
        +
      • 0xFFFFFFFFFFFFFFFE

      • +
      • 0xFFFFFFFFFFFFFFFC

      • +
      • 0xFFFFFFFFFFFFFFF8

      • +
      • 0xFFFFFFFFFFFFFFFA

      • +
      +
    • +
    +
  • +
  • Setting reserved_key_start_bit = 2:

    +
      +
    • This configuration reserves bits 2 and 3 as reserved keys.

    • +
    • The binary representation for the last five bits ranges from 10010 to 11110, with the least significant bit (bit 0) always set to 0, and bits from 4 to 63 are set to 1.

    • +
    +
  • +
  • If you change reserved_key_start_bit, you should use the same value for save/load (see the sketch below). +For more detail, please refer to init_reserved_keys

  • +
+
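A minimal, hedged sketch of relocating the reserved keys before initialization; the option name comes from HashTableOptions, everything else is illustrative:
TableOptions options;
options.reserved_key_start_bit = 1;  // reserved keys become 0xFFFFFFFFFFFFFFFE, ...FC, ...F8, ...FA
// ... set capacity, dim, and the remaining options ...
// table->init(options);
// A table saved with this setting must be loaded with the same reserved_key_start_bit.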
+
+
+

How to use:

+
#include "merlin_hashtable.cuh"
+
+
+using TableOptions = nv::merlin::HashTableOptions;
+using EvictStrategy = nv::merlin::EvictStrategy;
+
+int main(int argc, char *argv[])
+{
+  using K = uint64_t;
+  using V = float;
+  using S = uint64_t;
+  
+  // 1. Define the table and use LRU eviction strategy.
+  using HKVTable = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;
+  std::unique_ptr<HKVTable> table = std::make_unique<HKVTable>();
+  
+  // 2. Define the configuration options.
+  TableOptions options;
+  options.init_capacity = 16 * 1024 * 1024;
+  options.max_capacity = options.init_capacity;
+  options.dim = 16;
+  options.max_hbm_for_vectors = nv::merlin::GB(16);
+  
+  
+  // 3. Initialize the table memory resource.
+  table->init(options);
+  
+  // 4. Use table to do something.
+  
+  return 0;
+}
+
+
+
+
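Continuing the example above at step 4, the hedged sketch below shows one insert-and-lookup round trip. Error checking and key/value generation are omitted, and the device buffers are assumptions; the signatures follow the HashTable declaration in merlin_hashtable.cuh.
// Hedged sketch: one batched insert followed by a lookup of the same keys.
size_t n = 1024;
K* d_keys;      cudaMalloc(&d_keys, n * sizeof(K));
V* d_values;    cudaMalloc(&d_values, n * options.dim * sizeof(V));
bool* d_founds; cudaMalloc(&d_founds, n * sizeof(bool));
// ... fill d_keys / d_values on the device ...

cudaStream_t stream;
cudaStreamCreate(&stream);

// Insert (kLru derives the scores itself, so nullptr is fine here).
table->insert_or_assign(n, d_keys, d_values, /*scores=*/nullptr, stream);

// Look the same keys up again; d_founds[i] tells whether keys[i] was found.
table->find(n, d_keys, d_values, d_founds, /*scores=*/nullptr, stream);
cudaStreamSynchronize(stream);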
+

Usage restrictions

+
    +
  • The key_type must be int64_t or uint64_t.

  • +
  • The score_type must be uint64_t.

  • +
+
+
+
+

Contributors

+

HierarchicalKV is co-maintained by NVIDIA Merlin Team and NVIDIA product end-users, +and is also open to public contributions, bug fixes, and documentation. [Contribute]

+
+
+

How to build

+

Basically, HierarchicalKV is a header-only library; the commands below only create binaries for benchmarking and unit testing.

+

Your environment must meet the following requirements:

+
    +
  • CUDA version >= 11.2

  • +
  • NVIDIA GPU with compute capability 8.0, 8.6, 8.7 or 9.0

  • +
  • GCC supporting the C++17 standard or later.

  • +
  • Bazel version >= 3.7.2 (Bazel compile only)

  • +
+
+

with cmake

+
git clone --recursive https://github.com/NVIDIA-Merlin/HierarchicalKV.git
+cd HierarchicalKV && mkdir -p build && cd build
+cmake -DCMAKE_BUILD_TYPE=Release -Dsm=80 .. && make -j
+
+
+

For Debug:

+
cmake -DCMAKE_BUILD_TYPE=Debug -Dsm=80 .. && make -j
+
+
+

For Benchmark:

+
./merlin_hashtable_benchmark
+
+
+

For Unit Test:

+
./merlin_hashtable_test
+
+
+
+
+

with bazel

+
    +
  • DON’T use the --recursive option for git clone.

  • +
  • Please modify the environment variables in the .bazelrc file in advance if using the customized docker images.

  • +
  • The docker images maintained on nvcr.io/nvidia/tensorflow are highly recommended.

  • +
+

Pull the docker image:

+
docker pull nvcr.io/nvidia/tensorflow:22.09-tf2-py3
+docker run --gpus all -it --rm nvcr.io/nvidia/tensorflow:22.09-tf2-py3
+
+
+

Compile in docker container:

+
git clone https://github.com/NVIDIA-Merlin/HierarchicalKV.git
+cd HierarchicalKV && bash bazel_build.sh
+
+
+

For Benchmark:

+
./benchmark_util
+
+
+
+
+
+

Benchmark & Performance (W.I.P.)

+
    +
  • GPU: 1 x NVIDIA A100 80GB PCIe: 8.0

  • +
  • Key Type = uint64_t

  • +
  • Value Type = float32 * {dim}

  • +
  • Key-Values per OP = 1048576

  • +
  • Evict strategy: LRU

  • +
  • λ: load factor

  • +
  • find* means the find API that directly returns the addresses of values.

  • +
  • find_or_insert* means the find_or_insert API that directly returns the addresses of values.

  • +
  • Throughput Unit: Billion-KV/second

  • +
+
+

On pure HBM mode:

+
    +
  • dim = 8, capacity = 128 Million-KV, HBM = 4 GB, HMEM = 0 GB

  • +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

λ

insert_or_assign

find

find_or_insert

assign

find*

find_or_insert*

insert_and_evict

0.50

1.093

2.470

1.478

1.770

3.726

1.447

1.075

0.75

1.045

2.452

1.335

1.807

3.374

1.309

1.013

1.00

0.655

2.481

0.612

1.815

1.865

0.619

0.511

+ + + + + + + + + + + + + + + + + + + + + + + + + +

λ

export_batch

export_batch_if

contains

0.50

2.087

12.258

3.121

0.75

2.045

12.447

3.094

1.00

1.950

2.657

3.096

+
    +
  • dim = 32, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 0 GB

  • +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

λ

insert_or_assign

find

find_or_insert

assign

find*

find_or_insert*

insert_and_evict

0.50

0.961

2.272

1.278

1.706

3.718

1.435

0.931

0.75

0.930

2.238

1.177

1.693

3.369

1.316

0.866

1.00

0.646

2.321

0.572

1.783

1.873

0.618

0.469

+ + + + + + + + + + + + + + + + + + + + + + + + + +

λ

export_batch

export_batch_if

contains

0.50

0.692

10.784

3.100

0.75

0.569

10.240

3.075

1.00

0.551

0.765

3.096

+
    +
  • dim = 64, capacity = 64 Million-KV, HBM = 16 GB, HMEM = 0 GB

  • +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

λ

insert_or_assign

find

find_or_insert

assign

find*

find_or_insert*

insert_and_evict

0.50

0.834

1.982

1.113

1.499

3.950

1.502

0.805

0.75

0.801

1.951

1.033

1.493

3.545

1.359

0.773

1.00

0.621

2.021

0.608

1.541

1.965

0.613

0.481

+ + + + + + + + + + + + + + + + + + + + + + + + + +

λ

export_batch

export_batch_if

contains

0.50

0.316

8.199

3.239

0.75

0.296

8.549

3.198

1.00

0.288

0.395

3.225

+
+
+

On HBM+HMEM hybrid mode:

+
    +
  • dim = 64, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 16 GB

  • +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

λ

insert_or_assign

find

find_or_insert

assign

find*

find_or_insert*

0.50

0.083

0.124

0.109

0.131

3.705

1.435

0.75

0.083

0.122

0.111

0.129

3.221

1.274

1.00

0.073

0.123

0.095

0.126

1.854

0.617

+ + + + + + + + + + + + + + + + + + + + + + + + + +

λ

export_batch

export_batch_if

contains

0.50

0.318

8.086

3.122

0.75

0.294

5.549

3.111

1.00

0.287

0.393

3.075

+
    +
  • dim = 64, capacity = 512 Million-KV, HBM = 32 GB, HMEM = 96 GB

  • +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

λ

insert_or_assign

find

find_or_insert

assign

find*

find_or_insert*

0.50

0.049

0.069

0.049

0.069

3.484

1.370

0.75

0.049

0.069

0.049

0.069

3.116

1.242

1.00

0.047

0.072

0.047

0.070

1.771

0.607

+ + + + + + + + + + + + + + + + + + + + + + + + + +

λ

export_batch

export_batch_if

contains

0.50

0.316

8.181

3.073

0.75

0.293

8.950

3.052

1.00

0.292

0.394

3.026

+
+
+

Support and Feedback:

+

If you encounter any issues or have questions, go to https://github.com/NVIDIA-Merlin/HierarchicalKV/issues and submit an issue so that we can provide you with the necessary resolutions and answers.

+
+
+

Acknowledgment

+

We are very grateful to external initial contributors @Zhangyafei and @Lifan for their design, coding, and review work.

+
+
+

License

+

Apache License 2.0

+
+
+
+ + +
+
+ +
+
+
+
+ + + + + + + \ No newline at end of file diff --git a/review/pr-201/_sources/CONTRIBUTING.md.txt b/review/pr-201/_sources/CONTRIBUTING.md.txt new file mode 100644 index 000000000..4ea049cdc --- /dev/null +++ b/review/pr-201/_sources/CONTRIBUTING.md.txt @@ -0,0 +1,41 @@ +# Contributing + +## About HierarchicalKV + +HierarchicalKV is a part of NVIDIA Merlin and provides hierarchical key-value storage to meet RecSys requirements. + +The key capability of HierarchicalKV is to store key-value (feature-embedding) on high-bandwidth memory (HBM) of GPUs and in host memory. + +You can also use the library for generic key-value storage. + +## Maintainership + +HierarchicalKV is co-maintianed by [NVIDIA Merlin Team](https://github.com/NVIDIA-Merlin) and NVIDIA product end-users, +and also open for public contributions, bug fixes, and documentation. This project adheres to NVIDIA's Code of Conduct. + +## Contributing + +We’re grateful for your interest in HierarchicalKV and value your contributions. +We welcome contributions via pull requests(PR). + +Before sending out a pull request for significant change on the end-user API, we recommend you open an issue and +discuss your proposed change. Some changes may require a design review. +All submissions require review by project reviewers. + +### Coding Style + +Refer to the [Style Guide](http://github.com/NVIDIA-Merlin/HierarchicalKV/STYLE_GUIDE.md) + +### Additional Requirements + +In addition to the above requirements, contribution also needs to meet the following criteria: +* The change needs to include unit tests and integration tests if any. +* Each PR needs to provide necessary documentation for when and how to use it. + +## Community + +* HierarchicalKV code (https://github.com/NVIDIA-Merlin/HierarchicalKV) + +## Licence +Apache License 2.0 + diff --git a/review/pr-201/_sources/README.md.txt b/review/pr-201/_sources/README.md.txt new file mode 100644 index 000000000..57bb507ef --- /dev/null +++ b/review/pr-201/_sources/README.md.txt @@ -0,0 +1,327 @@ +# [NVIDIA HierarchicalKV(Beta)](https://github.com/NVIDIA-Merlin/HierarchicalKV) + +[![Version](https://img.shields.io/github/v/release/NVIDIA-Merlin/HierarchicalKV?color=orange&include_prereleases)](https://github.com/NVIDIA-Merlin/HierarchicalKV/releases) +[![GitHub License](https://img.shields.io/github/license/NVIDIA-Merlin/HierarchicalKV)](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/LICENSE) +[![Documentation](https://img.shields.io/badge/documentation-blue.svg)](https://nvidia-merlin.github.io/HierarchicalKV/master/README.html) + +## About HierarchicalKV + +HierarchicalKV is a part of NVIDIA Merlin and provides hierarchical key-value storage to meet RecSys requirements. + +The key capability of HierarchicalKV is to store key-value (feature-embedding) on high-bandwidth memory (HBM) of GPUs and in host memory. + +You can also use the library for generic key-value storage. + +## Benefits + +When building large recommender systems, machine learning (ML) engineers face the following challenges: + +- GPUs are needed, but HBM on a single GPU is too small for the large DLRMs that scale to several terabytes. +- Improving communication performance is getting more difficult in larger and larger CPU clusters. +- It is difficult to efficiently control consumption growth of limited HBM with customized strategies. +- Most generic key-value libraries provide low HBM and host memory utilization. 
+ +HierarchicalKV alleviates these challenges and helps the machine learning engineers in RecSys with the following benefits: + +- Supports training large RecSys models on **HBM and host memory** at the same time. +- Provides better performance by **full bypassing CPUs** and reducing the communication workload. +- Implements table-size restraint strategies that are based on **LRU or customized strategies**. + The strategies are implemented by CUDA kernels. +- Operates at a high working-status load factor that is close to 1.0. + + +## Key ideas + +- Buckets are locally ordered +- Store keys and values separately +- Store all the keys in HBM +- Build-in and customizable eviction strategy + +HierarchicalKV makes NVIDIA GPUs more suitable for training large and super-large models of ***search, recommendations, and advertising***. +The library simplifies the common challenges to building, evaluating, and serving sophisticated recommenders models. + +## API Documentation + +The main classes and structs are below, but reading the comments in the source code is recommended: + +- [`class HashTable`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L151) +- [`class EvictStrategy`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L52) +- [`struct HashTableOptions`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L60) + +For regular API doc, please refer to [API Docs](https://nvidia-merlin.github.io/HierarchicalKV/master/api/index.html) + +### API Maturity Matrix + +`industry-validated` means the API has been well-tested and verified in at least one real-world scenario. + +| Name | Description | Function | +|:---------------------|:-------------------------------------------------------------------------------------------------------------------------|:-------------------| +| __insert_or_assign__ | Insert or assign for the specified keys.
Overwrite one key with minimum score when bucket is full. | industry-validated | +| __insert_and_evict__ | Insert new keys, and evict keys with minimum score when bucket is full. | industry-validated | +| __find_or_insert__ | Search for the specified keys, and insert them when missed. | well-tested | +| __assign__ | Update for each key and bypass when missed. | well-tested | +| __accum_or_assign__ | Search and update for each key. If found, add value as a delta to the original value.
If missed, update it directly. | well-tested | +| __find_or_insert\*__ | Search for the specified keys and return the pointers of values. Insert them firstly when missing. | well-tested | +| __find__ | Search for the specified keys. | industry-validated | +| __find\*__ | Search and return the pointers of values, thread-unsafe but with high performance. | well-tested | +| __export_batch__ | Exports a certain number of the key-value-score tuples. | industry-validated | +| __export_batch_if__ | Exports a certain number of the key-value-score tuples which match specific conditions. | industry-validated | +| __warmup__ | Move the hot key-values from HMEM to HBM | June 15, 2023 | + + +### Evict Strategy + +The `score` is introduced to define the importance of each key, the larger, the more important, the less likely they will be evicted. Eviction only happens when a bucket is full. +The `score_type` must be `uint64_t`. For more detail, please refer to [`class EvictStrategy`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L52). + +| Name | Definition of `Score` | +|:---------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| __Lru__ | Device clock in a nanosecond, which could differ slightly from host clock. | +| __Lfu__ | Frequency increment provided by caller via the input parameter of `scores` of `insert-like` APIs as the increment of frequency. | +| __EpochLru__ | The high 32bits is the global epoch provided via the input parameter of `global_epoch`,
the low 32bits is equal to `(device_clock >> 20) & 0xffffffff` with granularity close to 1 ms. | +| __EpochLfu__ | The high 32bits is the global epoch provided via the input parameter of `global_epoch`,
the low 32bits is the frequency,
the frequency will keep constant after reaching the max value of `0xffffffff`. | +| __Customized__ | Fully provided by the caller via the input parameter of `scores` of `insert-like` APIs. | + + +* __Note__: + - The `insert-like` APIs mean the APIs of `insert_or_assign`, `insert_and_evict`, `find_or_insert`, `accum_or_assign`, and `find_or_insert`. + - The `global_epoch` should be maintained by the caller and input as the input parameter of `insert-like` APIs. + +### Configuration Options + +It's recommended to keep the default configuration for the options ending with `*`. + +| Name | Type | Default | Description | +|:---------------------------|:-------|:--------|:------------------------------------------------------| +| __init_capacity__ | size_t | 0 | The initial capacity of the hash table. | +| __max_capacity__ | size_t | 0 | The maximum capacity of the hash table. | +| __max_hbm_for_vectors__ | size_t | 0 | The maximum HBM for vectors, in bytes. | +| __dim__ | size_t | 64 | The dimension of the value vectors. | +| __max_bucket_size*__ | size_t | 128 | The length of each bucket. | +| __max_load_factor*__ | float | 0.5f | The max load factor before rehashing. | +| __block_size*__ | int | 128 | The default block size for CUDA kernels. | +| __io_block_size*__ | int | 1024 | The block size for IO CUDA kernels. | +| __device_id*__ | int | -1 | The ID of device. Managed internally when set to `-1` | +| __io_by_cpu*__ | bool | false | The flag indicating if the CPU handles IO. | +| __reserved_key_start_bit__ | int | 0 | The start bit offset of reserved key in the 64 bit | + +- Fore more details refer to [`struct HashTableOptions`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L60). + +#### Reserved Keys +- By default, the keys of `0xFFFFFFFFFFFFFFFD`, `0xFFFFFFFFFFFFFFFE`, and `0xFFFFFFFFFFFFFFFF` are reserved for internal using. + change `options.reserved_key_start_bit` if you want to use the above keys. + `reserved_key_start_bit` has a valid range from 0 to 62. The default value is 0, which is the above default reserved keys. When `reserved_key_start_bit` is set to any value other than 0, the least significant bit (bit 0) is always `0` for any reserved key. + +- Setting `reserved_key_start_bit = 1`: + - This setting reserves the two least significant bits 1 and 2 for the reserved keys. + - In binary, the last four bits range from `1000` to `1110`. Here, the least significant bit (bit 0) is always `0`, and bits from 3 to 63 are set to `1`. + - The new reserved keys in hexadecimal representation are as follows: + - `0xFFFFFFFFFFFFFFFE` + - `0xFFFFFFFFFFFFFFFC` + - `0xFFFFFFFFFFFFFFF8` + - `0xFFFFFFFFFFFFFFFA` + +- Setting `reserved_key_start_bit = 2`: + - This configuration reserves bits 2 and 3 as reserved keys. + - The binary representation for the last five bits ranges from `10010` to `11110`, with the least significant bit (bit 0) always set to `0`, and bits from 4 to 63 are set to `1`. + +- if you change the reserved_key_start_bit, you should use same value for save/load + For more detail, please refer to [`init_reserved_keys`](https://github.com/search?q=repo%3ANVIDIA-Merlin%2FHierarchicalKV%20init_reserved_keys&type=code) + +### How to use: +```cpp +#include "merlin_hashtable.cuh" + + +using TableOptions = nv::merlin::HashTableOptions; +using EvictStrategy = nv::merlin::EvictStrategy; + +int main(int argc, char *argv[]) +{ + using K = uint64_t; + using V = float; + using S = uint64_t; + + // 1. Define the table and use LRU eviction strategy. 
+ using HKVTable = nv::merlin::HashTable; + std::unique_ptr table = std::make_unique(); + + // 2. Define the configuration options. + TableOptions options; + options.init_capacity = 16 * 1024 * 1024; + options.max_capacity = options.init_capacity; + options.dim = 16; + options.max_hbm_for_vectors = nv::merlin::GB(16); + + + // 3. Initialize the table memory resource. + table->init(options); + + // 4. Use table to do something. + + return 0; +} + +``` + +### Usage restrictions + +- The `key_type` must be `int64_t` or `uint64_t`. +- The `score_type` must be `uint64_t`. +## Contributors + +HierarchicalKV is co-maintianed by [NVIDIA Merlin Team](https://github.com/NVIDIA-Merlin) and NVIDIA product end-users, +and also open for public contributions, bug fixes, and documentation. [[Contribute](CONTRIBUTING.md)] + +## How to build + +Basically, HierarchicalKV is a headers only library, the commands below only create binaries for benchmark and unit testing. + +Your environment must meet the following requirements: + +- CUDA version >= 11.2 +- NVIDIA GPU with compute capability 8.0, 8.6, 8.7 or 9.0 +- GCC supports `C++17' standard or later. +- Bazel version >= 3.7.2 (Bazel compile only) + +### with cmake +```shell +git clone --recursive https://github.com/NVIDIA-Merlin/HierarchicalKV.git +cd HierarchicalKV && mkdir -p build && cd build +cmake -DCMAKE_BUILD_TYPE=Release -Dsm=80 .. && make -j +``` + +For Debug: +```shell +cmake -DCMAKE_BUILD_TYPE=Debug -Dsm=80 .. && make -j +``` + +For Benchmark: +```shell +./merlin_hashtable_benchmark +``` + +For Unit Test: +```shell +./merlin_hashtable_test +``` + +### with bazel + +- DON'T use the option of `--recursive` for `git clone`. +- Please modify the environment variables in the `.bazelrc` file in advance if using the customized docker images. +- The docker images maintained on `nvcr.io/nvidia/tensorflow` are highly recommended. + +Pull the docker image: +```shell +docker pull nvcr.io/nvidia/tensorflow:22.09-tf2-py3 +docker run --gpus all -it --rm nvcr.io/nvidia/tensorflow:22.09-tf2-py3 +``` + +Compile in docker container: +```shell +git clone https://github.com/NVIDIA-Merlin/HierarchicalKV.git +cd HierarchicalKV && bash bazel_build.sh +``` + +For Benchmark: +```shell +./benchmark_util +``` + + +## Benchmark & Performance(W.I.P) + +* GPU: 1 x NVIDIA A100 80GB PCIe: 8.0 +* Key Type = uint64_t +* Value Type = float32 * {dim} +* Key-Values per OP = 1048576 +* Evict strategy: LRU +* `λ`: load factor +* `find*` means the `find` API that directly returns the addresses of values. +* `find_or_insert*` means the `find_or_insert` API that directly returns the addresses of values. 
+* ***Throughput Unit: Billion-KV/second*** + +### On pure HBM mode: + +* dim = 8, capacity = 128 Million-KV, HBM = 4 GB, HMEM = 0 GB + +| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | insert_and_evict | +|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:| +| 0.50 | 1.093 | 2.470 | 1.478 | 1.770 | 3.726 | 1.447 | 1.075 | +| 0.75 | 1.045 | 2.452 | 1.335 | 1.807 | 3.374 | 1.309 | 1.013 | +| 1.00 | 0.655 | 2.481 | 0.612 | 1.815 | 1.865 | 0.619 | 0.511 | + +| λ | export_batch | export_batch_if | contains | +|-----:|-------------:|----------------:|---------:| +| 0.50 | 2.087 | 12.258 | 3.121 | +| 0.75 | 2.045 | 12.447 | 3.094 | +| 1.00 | 1.950 | 2.657 | 3.096 | + +* dim = 32, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 0 GB + +| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | insert_and_evict | +|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:| +| 0.50 | 0.961 | 2.272 | 1.278 | 1.706 | 3.718 | 1.435 | 0.931 | +| 0.75 | 0.930 | 2.238 | 1.177 | 1.693 | 3.369 | 1.316 | 0.866 | +| 1.00 | 0.646 | 2.321 | 0.572 | 1.783 | 1.873 | 0.618 | 0.469 | + +| λ | export_batch | export_batch_if | contains | +|-----:|-------------:|----------------:|---------:| +| 0.50 | 0.692 | 10.784 | 3.100 | +| 0.75 | 0.569 | 10.240 | 3.075 | +| 1.00 | 0.551 | 0.765 | 3.096 | + +* dim = 64, capacity = 64 Million-KV, HBM = 16 GB, HMEM = 0 GB + +| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | insert_and_evict | +|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:| +| 0.50 | 0.834 | 1.982 | 1.113 | 1.499 | 3.950 | 1.502 | 0.805 | +| 0.75 | 0.801 | 1.951 | 1.033 | 1.493 | 3.545 | 1.359 | 0.773 | +| 1.00 | 0.621 | 2.021 | 0.608 | 1.541 | 1.965 | 0.613 | 0.481 | + +| λ | export_batch | export_batch_if | contains | +|-----:|-------------:|----------------:|---------:| +| 0.50 | 0.316 | 8.199 | 3.239 | +| 0.75 | 0.296 | 8.549 | 3.198 | +| 1.00 | 0.288 | 0.395 | 3.225 | + +### On HBM+HMEM hybrid mode: + +* dim = 64, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 16 GB + +| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | +|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:| +| 0.50 | 0.083 | 0.124 | 0.109 | 0.131 | 3.705 | 1.435 | +| 0.75 | 0.083 | 0.122 | 0.111 | 0.129 | 3.221 | 1.274 | +| 1.00 | 0.073 | 0.123 | 0.095 | 0.126 | 1.854 | 0.617 | + +| λ | export_batch | export_batch_if | contains | +|-----:|-------------:|----------------:|---------:| +| 0.50 | 0.318 | 8.086 | 3.122 | +| 0.75 | 0.294 | 5.549 | 3.111 | +| 1.00 | 0.287 | 0.393 | 3.075 | + +* dim = 64, capacity = 512 Million-KV, HBM = 32 GB, HMEM = 96 GB + +| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | +|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:| +| 0.50 | 0.049 | 0.069 | 0.049 | 0.069 | 3.484 | 1.370 | +| 0.75 | 0.049 | 0.069 | 0.049 | 0.069 | 3.116 | 1.242 | +| 1.00 | 0.047 | 0.072 | 0.047 | 0.070 | 1.771 | 0.607 | + +| λ | export_batch | export_batch_if | contains | +|-----:|-------------:|----------------:|---------:| +| 0.50 | 0.316 | 8.181 | 3.073 | +| 0.75 | 0.293 | 8.950 | 3.052 | +| 1.00 | 0.292 | 0.394 | 3.026 | + +### Support and Feedback: + +If you encounter any issues or have questions, go to 
[https://github.com/NVIDIA-Merlin/HierarchicalKV/issues](https://github.com/NVIDIA-Merlin/HierarchicalKV/issues) and submit an issue so that we can provide you with the necessary resolutions and answers. + +### Acknowledgment +We are very grateful to external initial contributors [@Zhangyafei](https://github.com/zhangyafeikimi) and [@Lifan](https://github.com/Lifann) for their design, coding, and review work. + +### License +Apache License 2.0 diff --git a/review/pr-201/_sources/api/classnv_1_1merlin_1_1HashTable.rst.txt b/review/pr-201/_sources/api/classnv_1_1merlin_1_1HashTable.rst.txt new file mode 100644 index 000000000..1733d0f77 --- /dev/null +++ b/review/pr-201/_sources/api/classnv_1_1merlin_1_1HashTable.rst.txt @@ -0,0 +1,26 @@ +.. _exhale_class_classnv_1_1merlin_1_1HashTable: + +Template Class HashTable +======================== + +- Defined in :ref:`file_merlin_hashtable.cuh` + + +Inheritance Relationships +------------------------- + +Base Type +********* + +- ``public nv::merlin::HashTableBase< K, V, uint64_t >`` + + +Class Documentation +------------------- + + +.. doxygenclass:: nv::merlin::HashTable + :project: HierarchicalKV + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/review/pr-201/_sources/api/classnv_1_1merlin_1_1HashTableBase.rst.txt b/review/pr-201/_sources/api/classnv_1_1merlin_1_1HashTableBase.rst.txt new file mode 100644 index 000000000..4169f927c --- /dev/null +++ b/review/pr-201/_sources/api/classnv_1_1merlin_1_1HashTableBase.rst.txt @@ -0,0 +1,26 @@ +.. _exhale_class_classnv_1_1merlin_1_1HashTableBase: + +Template Class HashTableBase +============================ + +- Defined in :ref:`file_merlin_hashtable.cuh` + + +Inheritance Relationships +------------------------- + +Derived Type +************ + +- ``public nv::merlin::HashTable< K, V, S, Strategy, ArchTag >`` (:ref:`exhale_class_classnv_1_1merlin_1_1HashTable`) + + +Class Documentation +------------------- + + +.. doxygenclass:: nv::merlin::HashTableBase + :project: HierarchicalKV + :members: + :protected-members: + :undoc-members: \ No newline at end of file diff --git a/review/pr-201/_sources/api/file_merlin_hashtable.cuh.rst.txt b/review/pr-201/_sources/api/file_merlin_hashtable.cuh.rst.txt new file mode 100644 index 000000000..4be280b1a --- /dev/null +++ b/review/pr-201/_sources/api/file_merlin_hashtable.cuh.rst.txt @@ -0,0 +1,103 @@ + +.. _file_merlin_hashtable.cuh: + +File merlin_hashtable.cuh +========================= + +.. contents:: Contents + :local: + :backlinks: none + +Definition (``merlin_hashtable.cuh``) +------------------------------------- + + +.. 
toctree:: + :maxdepth: 1 + + program_listing_file_merlin_hashtable.cuh.rst + + + + + +Includes +-------- + + +- ``atomic`` + +- ``cstdint`` + +- ``limits`` + +- ``memory`` + +- ``merlin/allocator.cuh`` + +- ``merlin/array_kernels.cuh`` + +- ``merlin/core_kernels.cuh`` + +- ``merlin/flexible_buffer.cuh`` + +- ``merlin/group_lock.cuh`` + +- ``merlin/memory_pool.cuh`` + +- ``merlin/types.cuh`` + +- ``merlin/utils.cuh`` + +- ``mutex`` + +- ``shared_mutex`` + +- ``thrust/device_vector.h`` + +- ``thrust/execution_policy.h`` + +- ``thrust/sort.h`` + +- ``type_traits`` + + + + + + +Namespaces +---------- + + +- :ref:`namespace_nv` + +- :ref:`namespace_nv__merlin` + + +Classes +------- + + +- :ref:`exhale_struct_structnv_1_1merlin_1_1EvictStrategy` + +- :ref:`exhale_struct_structnv_1_1merlin_1_1HashTableOptions` + +- :ref:`exhale_class_classnv_1_1merlin_1_1HashTable` + +- :ref:`exhale_class_classnv_1_1merlin_1_1HashTableBase` + + +Typedefs +-------- + + +- :ref:`exhale_typedef_merlin__hashtable_8cuh_1a5001706db6e977358e7f76ad6773703a` + + +Variables +--------- + + +- :ref:`exhale_variable_merlin__hashtable_8cuh_1a359fe56354918308560f46cb3136a3da` + diff --git a/review/pr-201/_sources/api/index.rst.txt b/review/pr-201/_sources/api/index.rst.txt new file mode 100644 index 000000000..0cda6b556 --- /dev/null +++ b/review/pr-201/_sources/api/index.rst.txt @@ -0,0 +1,10 @@ +==================================== +HierarchicalKV C++ API Documentation +==================================== + +.. include:: class_view_hierarchy.rst.include + +.. include:: file_view_hierarchy.rst.include + +.. include:: unabridged_api.rst.include + diff --git a/review/pr-201/_sources/api/namespace_nv.rst.txt b/review/pr-201/_sources/api/namespace_nv.rst.txt new file mode 100644 index 000000000..aeed04381 --- /dev/null +++ b/review/pr-201/_sources/api/namespace_nv.rst.txt @@ -0,0 +1,20 @@ + +.. _namespace_nv: + +Namespace nv +============ + + +.. contents:: Contents + :local: + :backlinks: none + + + + + +Namespaces +---------- + + +- :ref:`namespace_nv__merlin` diff --git a/review/pr-201/_sources/api/namespace_nv__merlin.rst.txt b/review/pr-201/_sources/api/namespace_nv__merlin.rst.txt new file mode 100644 index 000000000..7d7b08ba3 --- /dev/null +++ b/review/pr-201/_sources/api/namespace_nv__merlin.rst.txt @@ -0,0 +1,40 @@ + +.. _namespace_nv__merlin: + +Namespace nv::merlin +==================== + + +.. contents:: Contents + :local: + :backlinks: none + + + + + +Classes +------- + + +- :ref:`exhale_struct_structnv_1_1merlin_1_1EvictStrategy` + +- :ref:`exhale_struct_structnv_1_1merlin_1_1HashTableOptions` + +- :ref:`exhale_class_classnv_1_1merlin_1_1HashTable` + +- :ref:`exhale_class_classnv_1_1merlin_1_1HashTableBase` + + +Typedefs +-------- + + +- :ref:`exhale_typedef_merlin__hashtable_8cuh_1a5001706db6e977358e7f76ad6773703a` + + +Variables +--------- + + +- :ref:`exhale_variable_merlin__hashtable_8cuh_1a359fe56354918308560f46cb3136a3da` diff --git a/review/pr-201/_sources/api/program_listing_file_merlin_hashtable.cuh.rst.txt b/review/pr-201/_sources/api/program_listing_file_merlin_hashtable.cuh.rst.txt new file mode 100644 index 000000000..55480d175 --- /dev/null +++ b/review/pr-201/_sources/api/program_listing_file_merlin_hashtable.cuh.rst.txt @@ -0,0 +1,1955 @@ + +.. _program_listing_file_merlin_hashtable.cuh: + +Program Listing for File merlin_hashtable.cuh +============================================= + +|exhale_lsh| :ref:`Return to documentation for file ` (``merlin_hashtable.cuh``) + +.. 
|exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS + +.. code-block:: cpp + + /* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + #pragma once + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include "merlin/allocator.cuh" + #include "merlin/array_kernels.cuh" + #include "merlin/core_kernels.cuh" + #include "merlin/flexible_buffer.cuh" + #include "merlin/group_lock.cuh" + #include "merlin/memory_pool.cuh" + #include "merlin/types.cuh" + #include "merlin/utils.cuh" + + namespace nv { + namespace merlin { + + struct EvictStrategy { + enum EvictStrategyEnum { + kLru = 0, + kLfu = 1, + kEpochLru = 2, + kEpochLfu = 3, + kCustomized = 4, + }; + }; + + struct HashTableOptions { + size_t init_capacity = 0; + size_t max_capacity = 0; + size_t max_hbm_for_vectors = 0; + size_t max_bucket_size = 128; + size_t dim = 64; + float max_load_factor = 0.5f; + int block_size = 128; + int io_block_size = 1024; + int device_id = -1; + bool io_by_cpu = false; + bool use_constant_memory = false; + /* + * reserved_key_start_bit = 0, is the default behavior, HKV reserves + * `0xFFFFFFFFFFFFFFFD`, `0xFFFFFFFFFFFFFFFE`, and `0xFFFFFFFFFFFFFFFF` for + * internal using. if the default one conflicted with your keys, change the + * reserved_key_start_bit value to a numbers between 1 and 62, + * reserved_key_start_bit = 1 means using the insignificant bits index 1 and 2 + * as the keys as the reserved keys and the index 0 bit is 0 and all the other + * bits are 1, the new reserved keys are `FFFFFFFFFFFFFFFE`, + * `0xFFFFFFFFFFFFFFFC`, `0xFFFFFFFFFFFFFFF8`, and `0xFFFFFFFFFFFFFFFA` the + * console log prints the reserved keys during the table initialization. 
+ */ + int reserved_key_start_bit = 0; + size_t num_of_buckets_per_alloc = 1; + MemoryPoolOptions + device_memory_pool; + MemoryPoolOptions + host_memory_pool; + }; + + template + using EraseIfPredict = bool (*)( + const K& key, + S& score, + const K& pattern, + const S& threshold + ); + + #if THRUST_VERSION >= 101600 + static constexpr auto& thrust_par = thrust::cuda::par_nosync; + #else + static constexpr auto& thrust_par = thrust::cuda::par; + #endif + + template + class HashTableBase { + public: + using size_type = size_t; + using key_type = K; + using value_type = V; + using score_type = S; + using allocator_type = BaseAllocator; + + public: + virtual ~HashTableBase() {} + + virtual void init(const HashTableOptions& options, + allocator_type* allocator = nullptr) = 0; + + virtual void insert_or_assign(const size_type n, + const key_type* keys, // (n) + const value_type* values, // (n, DIM) + const score_type* scores = nullptr, // (n) + cudaStream_t stream = 0, bool unique_key = true, + bool ignore_evict_strategy = false) = 0; + + virtual void insert_and_evict(const size_type n, + const key_type* keys, // (n) + const value_type* values, // (n, DIM) + const score_type* scores, // (n) + key_type* evicted_keys, // (n) + value_type* evicted_values, // (n, DIM) + score_type* evicted_scores, // (n) + size_type* d_evicted_counter, // (1) + cudaStream_t stream = 0, bool unique_key = true, + bool ignore_evict_strategy = false) = 0; + + virtual size_type insert_and_evict(const size_type n, + const key_type* keys, // (n) + const value_type* values, // (n, DIM) + const score_type* scores, // (n) + key_type* evicted_keys, // (n) + value_type* evicted_values, // (n, DIM) + score_type* evicted_scores, // (n) + cudaStream_t stream = 0, + bool unique_key = true, + bool ignore_evict_strategy = false) = 0; + + virtual void accum_or_assign(const size_type n, + const key_type* keys, // (n) + const value_type* value_or_deltas, // (n, DIM) + const bool* accum_or_assigns, // (n) + const score_type* scores = nullptr, // (n) + cudaStream_t stream = 0, + bool ignore_evict_strategy = false) = 0; + + virtual void find_or_insert(const size_type n, const key_type* keys, // (n) + value_type* values, // (n * DIM) + score_type* scores = nullptr, // (n) + cudaStream_t stream = 0, bool unique_key = true, + bool ignore_evict_strategy = false) = 0; + + virtual void find_or_insert(const size_type n, const key_type* keys, // (n) + value_type** values, // (n) + bool* founds, // (n) + score_type* scores = nullptr, // (n) + cudaStream_t stream = 0, bool unique_key = true, + bool ignore_evict_strategy = false) = 0; + + virtual void assign(const size_type n, + const key_type* keys, // (n) + const value_type* values, // (n, DIM) + const score_type* scores = nullptr, // (n) + cudaStream_t stream = 0, bool unique_key = true) = 0; + + virtual void assign_scores(const size_type n, + const key_type* keys, // (n) + const score_type* scores = nullptr, // (n) + cudaStream_t stream = 0, + bool unique_key = true) = 0; + + virtual void assign(const size_type n, + const key_type* keys, // (n) + const score_type* scores = nullptr, // (n) + cudaStream_t stream = 0, bool unique_key = true) = 0; + + virtual void assign_values(const size_type n, + const key_type* keys, // (n) + const value_type* values, // (n, DIM) + cudaStream_t stream = 0, + bool unique_key = true) = 0; + virtual void find(const size_type n, const key_type* keys, // (n) + value_type* values, // (n, DIM) + bool* founds, // (n) + score_type* scores = nullptr, // (n) + cudaStream_t 
stream = 0) const = 0; + + virtual void find(const size_type n, const key_type* keys, // (n) + value_type* values, // (n, DIM) + key_type* missed_keys, // (n) + int* missed_indices, // (n) + int* missed_size, // scalar + score_type* scores = nullptr, // (n) + cudaStream_t stream = 0) const = 0; + + virtual void find(const size_type n, const key_type* keys, // (n) + value_type** values, // (n) + bool* founds, // (n) + score_type* scores = nullptr, // (n) + cudaStream_t stream = 0, bool unique_key = true) const = 0; + + virtual void contains(const size_type n, const key_type* keys, // (n) + bool* founds, // (n) + cudaStream_t stream = 0) const = 0; + + virtual void erase(const size_type n, const key_type* keys, + cudaStream_t stream = 0) = 0; + + virtual void clear(cudaStream_t stream = 0) = 0; + + virtual void export_batch(size_type n, const size_type offset, + size_type* d_counter, // (1) + key_type* keys, // (n) + value_type* values, // (n, DIM) + score_type* scores = nullptr, // (n) + cudaStream_t stream = 0) const = 0; + + virtual size_type export_batch(const size_type n, const size_type offset, + key_type* keys, // (n) + value_type* values, // (n, DIM) + score_type* scores = nullptr, // (n) + cudaStream_t stream = 0) const = 0; + + virtual bool empty(cudaStream_t stream = 0) const = 0; + + virtual size_type size(cudaStream_t stream = 0) const = 0; + + virtual size_type capacity() const = 0; + + virtual void reserve(const size_type new_capacity, + cudaStream_t stream = 0) = 0; + + virtual float load_factor(cudaStream_t stream = 0) const = 0; + + virtual void set_max_capacity(size_type new_max_capacity) = 0; + + virtual size_type dim() const noexcept = 0; + + virtual size_type max_bucket_size() const noexcept = 0; + + virtual size_type bucket_count() const noexcept = 0; + + virtual size_type save(BaseKVFile* file, + const size_t max_workspace_size = 1L * 1024 * 1024, + cudaStream_t stream = 0) const = 0; + + virtual size_type load(BaseKVFile* file, + const size_t max_workspace_size = 1L * 1024 * 1024, + cudaStream_t stream = 0) = 0; + + virtual void set_global_epoch(const uint64_t epoch) = 0; + }; + + template + class HashTable : public HashTableBase { + public: + using size_type = size_t; + using key_type = K; + using value_type = V; + using score_type = S; + static constexpr int evict_strategy = Strategy; + + using Pred = EraseIfPredict; + using allocator_type = BaseAllocator; + + private: + using TableCore = nv::merlin::Table; + static constexpr unsigned int TILE_SIZE = 4; + + using DeviceMemoryPool = MemoryPool>; + using HostMemoryPool = MemoryPool>; + + public: + HashTable() { + static_assert((std::is_same::value || + std::is_same::value), + "The key_type must be int64_t or uint64_t."); + + static_assert(std::is_same::value, + "The key_type must be uint64_t."); + }; + + ~HashTable() { + if (initialized_) { + CUDA_CHECK(cudaDeviceSynchronize()); + + initialized_ = false; + destroy_table(&table_, allocator_); + allocator_->free(MemoryType::Device, d_table_); + dev_mem_pool_.reset(); + host_mem_pool_.reset(); + + CUDA_CHECK(cudaDeviceSynchronize()); + if (default_allocator_ && allocator_ != nullptr) { + delete allocator_; + } + } + } + + private: + HashTable(const HashTable&) = delete; + HashTable& operator=(const HashTable&) = delete; + HashTable(HashTable&&) = delete; + HashTable& operator=(HashTable&&) = delete; + + public: + void init(const HashTableOptions& options, + allocator_type* allocator = nullptr) { + if (initialized_) { + return; + } + options_ = options; + 
MERLIN_CHECK(options.reserved_key_start_bit >= 0 && + options.reserved_key_start_bit <= MAX_RESERVED_KEY_BIT, + "options.reserved_key_start_bit should >= 0 and <= 62."); + CUDA_CHECK(init_reserved_keys(options.reserved_key_start_bit)); + + default_allocator_ = (allocator == nullptr); + allocator_ = (allocator == nullptr) ? (new DefaultAllocator()) : allocator; + + thrust_allocator_.set_allocator(allocator_); + + if (options_.device_id >= 0) { + CUDA_CHECK(cudaSetDevice(options_.device_id)); + } else { + CUDA_CHECK(cudaGetDevice(&(options_.device_id))); + } + + MERLIN_CHECK(ispow2(static_cast(options_.max_bucket_size)), + "Bucket size should be the pow of 2"); + MERLIN_CHECK( + ispow2(static_cast(options_.num_of_buckets_per_alloc)), + "Then `num_of_buckets_per_alloc` should be the pow of 2"); + MERLIN_CHECK(options_.init_capacity >= options_.num_of_buckets_per_alloc * + options_.max_bucket_size, + "Then `num_of_buckets_per_alloc` must be equal or less than " + "initial required buckets number"); + + options_.block_size = SAFE_GET_BLOCK_SIZE(options_.block_size); + + MERLIN_CHECK( + (((options_.max_bucket_size * (sizeof(key_type) + sizeof(score_type))) % + 128) == 0), + "Storage size of keys and scores in one bucket should be the mutiple " + "of cache line size"); + + // Construct table. + cudaDeviceProp deviceProp; + CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, options_.device_id)); + shared_mem_size_ = deviceProp.sharedMemPerBlock; + create_table( + &table_, allocator_, options_.dim, options_.init_capacity, + options_.max_capacity, options_.max_hbm_for_vectors, + options_.max_bucket_size, options_.num_of_buckets_per_alloc); + options_.block_size = SAFE_GET_BLOCK_SIZE(options_.block_size); + reach_max_capacity_ = (options_.init_capacity * 2 > options_.max_capacity); + MERLIN_CHECK((!(options_.io_by_cpu && options_.max_hbm_for_vectors != 0)), + "[HierarchicalKV] `io_by_cpu` should not be true when " + "`max_hbm_for_vectors` is not 0!"); + allocator_->alloc(MemoryType::Device, (void**)&(d_table_), + sizeof(TableCore)); + + sync_table_configuration(); + + // Create memory pools. 
+ dev_mem_pool_ = std::make_unique>>( + options_.device_memory_pool, allocator_); + host_mem_pool_ = std::make_unique>>( + options_.host_memory_pool, allocator_); + + CUDA_CHECK(cudaDeviceSynchronize()); + + initialized_ = true; + CudaCheckError(); + } + + void insert_or_assign(const size_type n, + const key_type* keys, // (n) + const value_type* values, // (n, DIM) + const score_type* scores = nullptr, // (n) + cudaStream_t stream = 0, bool unique_key = true, + bool ignore_evict_strategy = false) { + if (n == 0) { + return; + } + + while (!reach_max_capacity_ && + fast_load_factor(n, stream) > options_.max_load_factor) { + reserve(capacity() * 2, stream); + } + + if (!ignore_evict_strategy) { + check_evict_strategy(scores); + } + + insert_unique_lock lock(mutex_, stream); + + if (is_fast_mode()) { + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + + using Selector = KernelSelector_Upsert; + if (Selector::callable(unique_key, + static_cast(options_.max_bucket_size), + static_cast(options_.dim))) { + typename Selector::Params kernelParams( + load_factor, table_->buckets, table_->buckets_size, + table_->buckets_num, + static_cast(options_.max_bucket_size), + static_cast(options_.dim), keys, values, scores, n, + global_epoch_); + Selector::select_kernel(kernelParams, stream); + } else { + using Selector = SelectUpsertKernelWithIO; + Selector::execute_kernel( + load_factor, options_.block_size, options_.max_bucket_size, + table_->buckets_num, options_.dim, stream, n, d_table_, + table_->buckets, keys, reinterpret_cast(values), + scores, global_epoch_); + } + } else { + const size_type dev_ws_size{ + n * (sizeof(value_type*) + sizeof(int) + sizeof(key_type*))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto d_dst{dev_ws.get(0)}; + auto keys_ptr{reinterpret_cast(d_dst + n)}; + auto d_src_offset{reinterpret_cast(keys_ptr + n)}; + + CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream)); + + constexpr uint32_t MinBucketCapacityFilter = + sizeof(VecD_Load) / sizeof(D); + + bool filter_condition = + unique_key && options_.max_bucket_size >= MinBucketCapacityFilter && + !options_.io_by_cpu; + + if (filter_condition) { + constexpr uint32_t BLOCK_SIZE = 128; + + upsert_kernel_lock_key_hybrid + <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( + table_->buckets, table_->buckets_size, table_->buckets_num, + options_.max_bucket_size, options_.dim, keys, d_dst, scores, + keys_ptr, d_src_offset, n, global_epoch_); + + } else { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + upsert_kernel<<>>( + d_table_, table_->buckets, options_.max_bucket_size, + table_->buckets_num, options_.dim, keys, d_dst, scores, + d_src_offset, global_epoch_, N); + } + + { + thrust::device_ptr d_dst_ptr( + reinterpret_cast(d_dst)); + thrust::device_ptr d_src_offset_ptr(d_src_offset); + + thrust::sort_by_key(thrust_par(thrust_allocator_).on(stream), d_dst_ptr, + d_dst_ptr + n, d_src_offset_ptr, + thrust::less()); + } + + if (filter_condition) { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + write_kernel_unlock_key + <<>>(values, d_dst, d_src_offset, + dim(), keys, keys_ptr, N); + + } else if (options_.io_by_cpu) { + const size_type 
host_ws_size{dev_ws_size + + n * sizeof(value_type) * dim()}; + auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)}; + auto h_dst{host_ws.get(0)}; + auto h_src_offset{reinterpret_cast(h_dst + n)}; + auto h_values{reinterpret_cast(h_src_offset + n)}; + + CUDA_CHECK(cudaMemcpyAsync(h_dst, d_dst, dev_ws_size, + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(h_values, values, host_ws_size - dev_ws_size, + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + write_by_cpu(h_dst, h_values, h_src_offset, dim(), n); + } else { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + write_kernel + <<>>(values, d_dst, d_src_offset, + dim(), N); + } + } + + CudaCheckError(); + } + + void insert_and_evict(const size_type n, + const key_type* keys, // (n) + const value_type* values, // (n, DIM) + const score_type* scores, // (n) + key_type* evicted_keys, // (n) + value_type* evicted_values, // (n, DIM) + score_type* evicted_scores, // (n) + size_type* d_evicted_counter, // (1) + cudaStream_t stream = 0, bool unique_key = true, + bool ignore_evict_strategy = false) { + if (n == 0) { + return; + } + + while (!reach_max_capacity_ && + fast_load_factor(n, stream) > options_.max_load_factor) { + reserve(capacity() * 2, stream); + } + + if (!ignore_evict_strategy) { + check_evict_strategy(scores); + } + + insert_unique_lock lock(mutex_, stream); + + // TODO: Currently only need eviction when using HashTable as HBM cache. + if (!is_fast_mode()) { + throw std::runtime_error("Only allow insert_and_evict in pure HBM mode."); + } + + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + + using Selector = + KernelSelector_UpsertAndEvict; + if (Selector::callable(unique_key, + static_cast(options_.max_bucket_size), + static_cast(options_.dim))) { + typename Selector::Params kernelParams( + load_factor, table_->buckets, table_->buckets_size, + table_->buckets_num, static_cast(options_.max_bucket_size), + static_cast(options_.dim), keys, values, scores, + evicted_keys, evicted_values, evicted_scores, n, d_evicted_counter, + global_epoch_); + Selector::select_kernel(kernelParams, stream); + } else { + // always use max tile to avoid data-deps as possible. 
+ const int TILE_SIZE = 32; + size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE; + const size_type dev_ws_size = + n_offsets * sizeof(int64_t) + n * sizeof(bool) + sizeof(size_type); + + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto d_offsets{dev_ws.get(0)}; + auto d_masks = reinterpret_cast(d_offsets + n_offsets); + + CUDA_CHECK( + cudaMemsetAsync(d_offsets, 0, n_offsets * sizeof(int64_t), stream)); + CUDA_CHECK(cudaMemsetAsync(d_masks, 0, n * sizeof(bool), stream)); + + size_type block_size = options_.block_size; + size_type grid_size = SAFE_GET_GRID_SIZE(n, block_size); + CUDA_CHECK(memset64Async(evicted_keys, EMPTY_KEY_CPU, n, stream)); + using Selector = + SelectUpsertAndEvictKernelWithIO; + + Selector::execute_kernel( + load_factor, options_.block_size, options_.max_bucket_size, + table_->buckets_num, options_.dim, stream, n, d_table_, + table_->buckets, keys, values, scores, evicted_keys, evicted_values, + evicted_scores, global_epoch_); + + keys_not_empty + <<>>(evicted_keys, d_masks, n); + gpu_boolean_mask( + grid_size, block_size, d_masks, n, d_evicted_counter, d_offsets, + evicted_keys, evicted_values, evicted_scores, dim(), stream); + } + return; + } + + size_type insert_and_evict(const size_type n, + const key_type* keys, // (n) + const value_type* values, // (n, DIM) + const score_type* scores, // (n) + key_type* evicted_keys, // (n) + value_type* evicted_values, // (n, DIM) + score_type* evicted_scores, // (n) + cudaStream_t stream = 0, bool unique_key = true, + bool ignore_evict_strategy = false) { + if (n == 0) { + return 0; + } + auto dev_ws{dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)}; + size_type* d_evicted_counter{dev_ws.get(0)}; + + CUDA_CHECK( + cudaMemsetAsync(d_evicted_counter, 0, sizeof(size_type), stream)); + insert_and_evict(n, keys, values, scores, evicted_keys, evicted_values, + evicted_scores, d_evicted_counter, stream, unique_key, + ignore_evict_strategy); + + size_type h_evicted_counter = 0; + CUDA_CHECK(cudaMemcpyAsync(&h_evicted_counter, d_evicted_counter, + sizeof(size_type), cudaMemcpyDeviceToHost, + stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CudaCheckError(); + return h_evicted_counter; + } + + void accum_or_assign(const size_type n, + const key_type* keys, // (n) + const value_type* value_or_deltas, // (n, DIM) + const bool* accum_or_assigns, // (n) + const score_type* scores = nullptr, // (n) + cudaStream_t stream = 0, + bool ignore_evict_strategy = false) { + if (n == 0) { + return; + } + + while (!reach_max_capacity_ && + fast_load_factor(n, stream) > options_.max_load_factor) { + reserve(capacity() * 2, stream); + } + + if (!ignore_evict_strategy) { + check_evict_strategy(scores); + } + + insert_unique_lock lock(mutex_, stream); + + if (is_fast_mode()) { + using Selector = + SelectAccumOrAssignKernelWithIO; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + Selector::execute_kernel( + load_factor, options_.block_size, options_.max_bucket_size, + table_->buckets_num, dim(), stream, n, d_table_, keys, + value_or_deltas, scores, accum_or_assigns, global_epoch_); + + } else { + const size_type dev_ws_size{ + n * (sizeof(value_type*) + sizeof(int) + sizeof(bool))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto dst{dev_ws.get(0)}; + auto src_offset{reinterpret_cast(dst + n)}; + auto 
founds{reinterpret_cast(src_offset + n)}; + + CUDA_CHECK(cudaMemsetAsync(dst, 0, dev_ws_size, stream)); + + { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + accum_or_assign_kernel<<>>( + d_table_, options_.max_bucket_size, table_->buckets_num, dim(), + keys, dst, scores, accum_or_assigns, src_offset, founds, + global_epoch_, N); + } + + { + thrust::device_ptr dst_ptr( + reinterpret_cast(dst)); + thrust::device_ptr src_offset_ptr(src_offset); + + thrust::sort_by_key(thrust_par(thrust_allocator_).on(stream), dst_ptr, + dst_ptr + n, src_offset_ptr, + thrust::less()); + } + + { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + write_with_accum_kernel + <<>>(value_or_deltas, dst, + accum_or_assigns, founds, + src_offset, dim(), N); + } + } + CudaCheckError(); + } + + void find_or_insert(const size_type n, const key_type* keys, // (n) + value_type* values, // (n * DIM) + score_type* scores = nullptr, // (n) + cudaStream_t stream = 0, bool unique_key = true, + bool ignore_evict_strategy = false) { + if (n == 0) { + return; + } + + while (!reach_max_capacity_ && + fast_load_factor(n, stream) > options_.max_load_factor) { + reserve(capacity() * 2, stream); + } + + if (!ignore_evict_strategy) { + check_evict_strategy(scores); + } + + insert_unique_lock lock(mutex_, stream); + + if (is_fast_mode()) { + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + + using Selector = + KernelSelector_FindOrInsert; + if (Selector::callable(unique_key, + static_cast(options_.max_bucket_size), + static_cast(options_.dim))) { + typename Selector::Params kernelParams( + load_factor, table_->buckets, table_->buckets_size, + table_->buckets_num, + static_cast(options_.max_bucket_size), + static_cast(options_.dim), keys, values, scores, n, + global_epoch_); + Selector::select_kernel(kernelParams, stream); + } else { + using Selector = + SelectFindOrInsertKernelWithIO; + Selector::execute_kernel( + load_factor, options_.block_size, options_.max_bucket_size, + table_->buckets_num, options_.dim, stream, n, d_table_, + table_->buckets, keys, values, scores, global_epoch_); + } + } else { + const size_type dev_ws_size{n * (sizeof(value_type*) + sizeof(int) + + sizeof(bool) + sizeof(key_type*))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto d_table_value_addrs{dev_ws.get(0)}; + auto keys_ptr{reinterpret_cast(d_table_value_addrs + n)}; + auto param_key_index{reinterpret_cast(keys_ptr + n)}; + auto founds{reinterpret_cast(param_key_index + n)}; + + CUDA_CHECK(cudaMemsetAsync(d_table_value_addrs, 0, dev_ws_size, stream)); + + constexpr uint32_t MinBucketCapacityFilter = + sizeof(VecD_Load) / sizeof(D); + + bool filter_condition = + unique_key && options_.max_bucket_size >= MinBucketCapacityFilter && + !options_.io_by_cpu; + + if (filter_condition) { + constexpr uint32_t BLOCK_SIZE = 128; + + find_or_insert_kernel_lock_key_hybrid + <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( + table_->buckets, table_->buckets_size, table_->buckets_num, + options_.max_bucket_size, options_.dim, keys, + d_table_value_addrs, scores, keys_ptr, param_key_index, founds, + n, global_epoch_); + + } else { + const size_t block_size = options_.block_size; + 
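+ // Fallback path: each key is probed by a cooperative tile of TILE_SIZE
+ // threads; the kernel records the in-table value address and the key's
+ // source index, which are sorted together below before the value
+ // read/write pass.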
const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + find_or_insert_kernel<<>>( + d_table_, table_->buckets, options_.max_bucket_size, + table_->buckets_num, options_.dim, keys, d_table_value_addrs, + scores, founds, param_key_index, global_epoch_, N); + } + + { + thrust::device_ptr table_value_ptr( + reinterpret_cast(d_table_value_addrs)); + thrust::device_ptr param_key_index_ptr(param_key_index); + + thrust::sort_by_key(thrust_par(thrust_allocator_).on(stream), + table_value_ptr, table_value_ptr + n, + param_key_index_ptr, thrust::less()); + } + + if (filter_condition) { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + read_or_write_kernel_unlock_key + <<>>(d_table_value_addrs, values, + founds, param_key_index, + keys_ptr, keys, dim(), N); + + } else if (options_.io_by_cpu) { + const size_type host_ws_size{ + dev_ws_size + n * (sizeof(bool) + sizeof(value_type) * dim())}; + auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)}; + auto h_table_value_addrs{host_ws.get(0)}; + auto h_param_key_index{reinterpret_cast(h_table_value_addrs + n)}; + auto h_founds{reinterpret_cast(h_param_key_index + n)}; + auto h_param_values{reinterpret_cast(h_founds + n)}; + + CUDA_CHECK(cudaMemcpyAsync(h_table_value_addrs, d_table_value_addrs, + dev_ws_size, cudaMemcpyDeviceToHost, + stream)); + CUDA_CHECK(cudaMemcpyAsync(h_founds, founds, n * sizeof(bool), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(h_param_values, values, + n * sizeof(value_type) * dim(), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + read_or_write_by_cpu(h_table_value_addrs, h_param_values, + h_param_key_index, h_founds, dim(), n); + CUDA_CHECK(cudaMemcpyAsync(values, h_param_values, + n * sizeof(value_type) * dim(), + cudaMemcpyHostToDevice, stream)); + } else { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + read_or_write_kernel + <<>>( + d_table_value_addrs, values, founds, param_key_index, dim(), N); + } + } + + CudaCheckError(); + } + + void find_or_insert(const size_type n, const key_type* keys, // (n) + value_type** values, // (n) + bool* founds, // (n) + score_type* scores = nullptr, // (n) + cudaStream_t stream = 0, bool unique_key = true, + bool ignore_evict_strategy = false) { + if (n == 0) { + return; + } + + while (!reach_max_capacity_ && + fast_load_factor(n, stream) > options_.max_load_factor) { + reserve(capacity() * 2, stream); + } + + if (!ignore_evict_strategy) { + check_evict_strategy(scores); + } + + insert_unique_lock lock(mutex_, stream); + + constexpr uint32_t MinBucketCapacityFilter = sizeof(VecD_Load) / sizeof(D); + + if (unique_key && options_.max_bucket_size >= MinBucketCapacityFilter) { + constexpr uint32_t BLOCK_SIZE = 128U; + + const size_type dev_ws_size{n * sizeof(key_type**)}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto keys_ptr{dev_ws.get(0)}; + CUDA_CHECK(cudaMemsetAsync(keys_ptr, 0, dev_ws_size, stream)); + + find_or_insert_ptr_kernel_lock_key + <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( + table_->buckets, table_->buckets_size, table_->buckets_num, + options_.max_bucket_size, options_.dim, keys, values, scores, + keys_ptr, n, founds, global_epoch_); + + find_or_insert_ptr_kernel_unlock_key + <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, 
BLOCK_SIZE, 0, stream>>>( + keys, keys_ptr, n); + } else { + using Selector = SelectFindOrInsertPtrKernel; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + Selector::execute_kernel( + load_factor, options_.block_size, options_.max_bucket_size, + table_->buckets_num, options_.dim, stream, n, d_table_, + table_->buckets, keys, values, scores, founds, global_epoch_); + } + + CudaCheckError(); + } + void assign(const size_type n, + const key_type* keys, // (n) + const value_type* values, // (n, DIM) + const score_type* scores = nullptr, // (n) + cudaStream_t stream = 0, bool unique_key = true) { + if (n == 0) { + return; + } + + check_evict_strategy(scores); + + update_shared_lock lock(mutex_, stream); + + if (is_fast_mode()) { + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + using Selector = KernelSelector_Update; + if (Selector::callable(unique_key, + static_cast(options_.max_bucket_size), + static_cast(options_.dim))) { + typename Selector::Params kernelParams( + load_factor, table_->buckets, table_->buckets_num, + static_cast(options_.max_bucket_size), + static_cast(options_.dim), keys, values, scores, n, + global_epoch_); + Selector::select_kernel(kernelParams, stream); + } else { + using Selector = SelectUpdateKernelWithIO; + Selector::execute_kernel( + load_factor, options_.block_size, options_.max_bucket_size, + table_->buckets_num, options_.dim, stream, n, d_table_, + table_->buckets, keys, values, scores, global_epoch_); + } + } else { + const size_type dev_ws_size{ + n * (sizeof(value_type*) + sizeof(key_type) + sizeof(int))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto d_dst{dev_ws.get(0)}; + auto keys_ptr{reinterpret_cast(d_dst + n)}; + auto d_src_offset{reinterpret_cast(keys_ptr + n)}; + + CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream)); + + constexpr uint32_t MinBucketCapacityFilter = + sizeof(VecD_Load) / sizeof(D); + + bool filter_condition = + options_.max_bucket_size >= MinBucketCapacityFilter && + !options_.io_by_cpu && unique_key; + + if (filter_condition) { + constexpr uint32_t BLOCK_SIZE = 128U; + + tlp_update_kernel_hybrid + <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( + table_->buckets, table_->buckets_num, options_.max_bucket_size, + options_.dim, keys, d_dst, scores, keys_ptr, d_src_offset, + global_epoch_, n); + + } else { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + update_kernel<<>>( + d_table_, table_->buckets, options_.max_bucket_size, + table_->buckets_num, options_.dim, keys, d_dst, scores, + d_src_offset, global_epoch_, N); + } + + { + thrust::device_ptr d_dst_ptr( + reinterpret_cast(d_dst)); + thrust::device_ptr d_src_offset_ptr(d_src_offset); + + thrust::sort_by_key(thrust_par(thrust_allocator_).on(stream), d_dst_ptr, + d_dst_ptr + n, d_src_offset_ptr, + thrust::less()); + } + + if (filter_condition) { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + write_kernel_unlock_key + <<>>(values, d_dst, d_src_offset, + dim(), keys, keys_ptr, N); + + } else if (options_.io_by_cpu) { + const 
size_type host_ws_size{dev_ws_size + + n * sizeof(value_type) * dim()}; + auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)}; + auto h_dst{host_ws.get(0)}; + auto h_src_offset{reinterpret_cast(h_dst + n)}; + auto h_values{reinterpret_cast(h_src_offset + n)}; + + CUDA_CHECK(cudaMemcpyAsync(h_dst, d_dst, dev_ws_size, + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(h_values, values, host_ws_size - dev_ws_size, + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + write_by_cpu(h_dst, h_values, h_src_offset, dim(), n); + } else { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + write_kernel + <<>>(values, d_dst, d_src_offset, + dim(), N); + } + } + + CudaCheckError(); + } + + void assign_scores(const size_type n, + const key_type* keys, // (n) + const score_type* scores = nullptr, // (n) + cudaStream_t stream = 0, bool unique_key = true) { + if (n == 0) { + return; + } + + check_evict_strategy(scores); + + { + update_shared_lock lock(mutex_, stream); + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + using Selector = KernelSelector_UpdateScore; + if (Selector::callable(unique_key, + static_cast(options_.max_bucket_size))) { + typename Selector::Params kernelParams( + load_factor, table_->buckets, table_->buckets_num, + static_cast(options_.max_bucket_size), keys, scores, n, + global_epoch_); + Selector::select_kernel(kernelParams, stream); + } else { + using Selector = SelectUpdateScoreKernel; + Selector::execute_kernel(load_factor, options_.block_size, + options_.max_bucket_size, table_->buckets_num, + stream, n, d_table_, table_->buckets, keys, + scores, global_epoch_); + } + } + + CudaCheckError(); + } + + void assign(const size_type n, + const key_type* keys, // (n) + const score_type* scores = nullptr, // (n) + cudaStream_t stream = 0, bool unique_key = true) { + assign_scores(n, keys, scores, stream, unique_key); + } + + void assign_values(const size_type n, + const key_type* keys, // (n) + const value_type* values, // (n, DIM) + cudaStream_t stream = 0, bool unique_key = true) { + if (n == 0) { + return; + } + + update_shared_lock lock(mutex_, stream); + + if (is_fast_mode()) { + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + using Selector = KernelSelector_UpdateValues; + if (Selector::callable(unique_key, + static_cast(options_.max_bucket_size), + static_cast(options_.dim))) { + typename Selector::Params kernelParams( + load_factor, table_->buckets, table_->buckets_num, + static_cast(options_.max_bucket_size), + static_cast(options_.dim), keys, values, n); + Selector::select_kernel(kernelParams, stream); + } else { + using Selector = + SelectUpdateValuesKernelWithIO; + Selector::execute_kernel(load_factor, options_.block_size, + options_.max_bucket_size, table_->buckets_num, + options_.dim, stream, n, d_table_, + table_->buckets, keys, values); + } + } else { + const size_type dev_ws_size{ + n * (sizeof(value_type*) + sizeof(key_type) + sizeof(int))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto d_dst{dev_ws.get(0)}; + auto keys_ptr{reinterpret_cast(d_dst + n)}; + auto 
d_src_offset{reinterpret_cast(keys_ptr + n)}; + + CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream)); + + constexpr uint32_t MinBucketCapacityFilter = + sizeof(VecD_Load) / sizeof(D); + + bool filter_condition = + options_.max_bucket_size >= MinBucketCapacityFilter && + !options_.io_by_cpu && unique_key; + + if (filter_condition) { + constexpr uint32_t BLOCK_SIZE = 128U; + + tlp_update_values_kernel_hybrid + <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( + table_->buckets, table_->buckets_num, options_.max_bucket_size, + options_.dim, keys, d_dst, keys_ptr, d_src_offset, n); + + } else { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + update_values_kernel + <<>>( + d_table_, table_->buckets, options_.max_bucket_size, + table_->buckets_num, options_.dim, keys, d_dst, d_src_offset, + N); + } + + { + thrust::device_ptr d_dst_ptr( + reinterpret_cast(d_dst)); + thrust::device_ptr d_src_offset_ptr(d_src_offset); + + thrust::sort_by_key(thrust_par(thrust_allocator_).on(stream), d_dst_ptr, + d_dst_ptr + n, d_src_offset_ptr, + thrust::less()); + } + + if (filter_condition) { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + write_kernel_unlock_key + <<>>(values, d_dst, d_src_offset, + dim(), keys, keys_ptr, N); + + } else if (options_.io_by_cpu) { + const size_type host_ws_size{dev_ws_size + + n * sizeof(value_type) * dim()}; + auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)}; + auto h_dst{host_ws.get(0)}; + auto h_src_offset{reinterpret_cast(h_dst + n)}; + auto h_values{reinterpret_cast(h_src_offset + n)}; + + CUDA_CHECK(cudaMemcpyAsync(h_dst, d_dst, dev_ws_size, + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaMemcpyAsync(h_values, values, host_ws_size - dev_ws_size, + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + write_by_cpu(h_dst, h_values, h_src_offset, dim(), n); + } else { + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + write_kernel + <<>>(values, d_dst, d_src_offset, + dim(), N); + } + } + + CudaCheckError(); + } + + void find(const size_type n, const key_type* keys, // (n) + value_type* values, // (n, DIM) + bool* founds, // (n) + score_type* scores = nullptr, // (n) + cudaStream_t stream = 0) const { + if (n == 0) { + return; + } + + CUDA_CHECK(cudaMemsetAsync(founds, 0, n * sizeof(bool), stream)); + + read_shared_lock lock(mutex_, stream); + + const uint32_t value_size = dim() * sizeof(V); + + if (is_fast_mode()) { + using Selector = SelectPipelineLookupKernelWithIO; + const uint32_t pipeline_max_size = Selector::max_value_size(); + // Pipeline lookup kernel only supports "bucket_size = 128". 
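+ // Prefer the pipelined lookup kernel only when the bucket size matches (128)
+ // and the value payload fits within the pipeline's maximum value size;
+ // otherwise fall back to the generic lookup kernel, whose variant is chosen
+ // from a periodically refreshed load-factor estimate.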
+ if (options_.max_bucket_size == 128 && value_size <= pipeline_max_size) { + LookupKernelParams lookupParams( + table_->buckets, table_->buckets_num, static_cast(dim()), + keys, values, scores, founds, n); + Selector::select_kernel(lookupParams, stream); + } else { + using Selector = + SelectLookupKernelWithIO; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + Selector::execute_kernel(load_factor, options_.block_size, + options_.max_bucket_size, table_->buckets_num, + options_.dim, stream, n, d_table_, + table_->buckets, keys, values, scores, founds); + } + } else { + const size_type dev_ws_size{n * (sizeof(value_type*) + sizeof(int))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto src{dev_ws.get(0)}; + auto dst_offset{reinterpret_cast(src + n)}; + + CUDA_CHECK(cudaMemsetAsync(src, 0, dev_ws_size, stream)); + + constexpr uint32_t MinBucketCapacityFilter = + sizeof(VecD_Load) / sizeof(D); + + bool filter_condition = + options_.max_bucket_size >= MinBucketCapacityFilter; + + if (filter_condition) { + constexpr uint32_t BLOCK_SIZE = 128U; + + tlp_lookup_kernel_hybrid + <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( + table_->buckets, table_->buckets_num, options_.max_bucket_size, + options_.dim, keys, src, scores, dst_offset, founds, n); + } else { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + lookup_kernel + <<>>( + d_table_, table_->buckets, options_.max_bucket_size, + table_->buckets_num, options_.dim, keys, src, scores, founds, + dst_offset, N); + } + + if (values != nullptr) { + thrust::device_ptr src_ptr( + reinterpret_cast(src)); + thrust::device_ptr dst_offset_ptr(dst_offset); + + thrust::sort_by_key(thrust_par(thrust_allocator_).on(stream), src_ptr, + src_ptr + n, dst_offset_ptr, + thrust::less()); + + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + read_kernel + <<>>(src, values, founds, + dst_offset, dim(), N); + } + } + + CudaCheckError(); + } + + void find(const size_type n, const key_type* keys, // (n) + value_type* values, // (n, DIM) + key_type* missed_keys, // (n) + int* missed_indices, // (n) + int* missed_size, // scalar + score_type* scores = nullptr, // (n) + cudaStream_t stream = 0) const { + if (n == 0) { + return; + } + + CUDA_CHECK(cudaMemsetAsync(missed_size, 0, sizeof(*missed_size), stream)); + + read_shared_lock lock(mutex_, stream); + + const uint32_t value_size = options_.dim * sizeof(V); + + if (is_fast_mode()) { + using Selector = SelectPipelineLookupKernelWithIO; + const uint32_t pipeline_max_size = Selector::max_value_size(); + // Pipeline lookup kernel only supports "bucket_size = 128". 
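+ // Same kernel dispatch as the overload above, but this variant reports
+ // misses compactly through missed_keys, missed_indices, and missed_size
+ // instead of a per-key boolean mask.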
+ if (options_.max_bucket_size == 128 && value_size <= pipeline_max_size) { + LookupKernelParamsV2 lookupParams( + table_->buckets, table_->buckets_num, static_cast(dim()), + keys, values, scores, missed_keys, missed_indices, missed_size, n); + Selector::select_kernel(lookupParams, stream); + } else { + using Selector = + SelectLookupKernelWithIOV2; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + Selector::execute_kernel(load_factor, options_.block_size, + options_.max_bucket_size, table_->buckets_num, + options_.dim, stream, n, d_table_, + table_->buckets, keys, values, scores, + missed_keys, missed_indices, missed_size); + } + } else { + const size_type dev_ws_size{n * (sizeof(value_type*) + sizeof(int))}; + auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)}; + auto src{dev_ws.get(0)}; + auto dst_offset{reinterpret_cast(src + n)}; + + CUDA_CHECK(cudaMemsetAsync(src, 0, dev_ws_size, stream)); + + constexpr uint32_t MinBucketCapacityFilter = + sizeof(VecD_Load) / sizeof(D); + + bool filter_condition = + options_.max_bucket_size >= MinBucketCapacityFilter; + + if (filter_condition) { + constexpr uint32_t BLOCK_SIZE = 128U; + + tlp_lookup_kernel_hybrid + <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( + table_->buckets, table_->buckets_num, options_.max_bucket_size, + options_.dim, keys, src, scores, dst_offset, missed_keys, + missed_indices, missed_size, n); + } else { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + lookup_kernel + <<>>( + d_table_, table_->buckets, options_.max_bucket_size, + table_->buckets_num, options_.dim, keys, src, scores, + missed_keys, missed_indices, missed_size, dst_offset, N); + } + + if (values != nullptr) { + thrust::device_ptr src_ptr( + reinterpret_cast(src)); + thrust::device_ptr dst_offset_ptr(dst_offset); + + thrust::sort_by_key(thrust_par(thrust_allocator_).on(stream), src_ptr, + src_ptr + n, dst_offset_ptr, + thrust::less()); + + const size_t block_size = options_.io_block_size; + const size_t N = n * dim(); + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + read_kernel + <<>>(src, values, dst_offset, + dim(), N); + } + } + + CudaCheckError(); + } + + void find(const size_type n, const key_type* keys, // (n) + value_type** values, // (n) + bool* founds, // (n) + score_type* scores = nullptr, // (n) + cudaStream_t stream = 0, bool unique_key = true) const { + if (n == 0) { + return; + } + + CUDA_CHECK(cudaMemsetAsync(founds, 0, n * sizeof(bool), stream)); + + read_shared_lock lock(mutex_, stream); + + constexpr uint32_t MinBucketCapacityFilter = sizeof(VecD_Load) / sizeof(D); + if (unique_key && options_.max_bucket_size >= MinBucketCapacityFilter) { + constexpr uint32_t BLOCK_SIZE = 128U; + tlp_lookup_ptr_kernel_with_filter + <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>( + table_->buckets, table_->buckets_num, options_.max_bucket_size, + options_.dim, keys, values, scores, founds, n); + } else { + using Selector = SelectLookupPtrKernel; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + + Selector::execute_kernel(load_factor, options_.block_size, + options_.max_bucket_size, 
table_->buckets_num, + options_.dim, stream, n, d_table_, + table_->buckets, keys, values, scores, founds); + } + + CudaCheckError(); + } + + void contains(const size_type n, const key_type* keys, // (n) + bool* founds, // (n) + cudaStream_t stream = 0) const { + if (n == 0) { + return; + } + + read_shared_lock lock(mutex_, stream); + + if (options_.max_bucket_size == 128) { + // Pipeline lookup kernel only supports "bucket_size = 128". + using Selector = SelectPipelineContainsKernel; + ContainsKernelParams containsParams( + table_->buckets, table_->buckets_num, static_cast(dim()), + keys, founds, n); + Selector::select_kernel(containsParams, stream); + } else { + using Selector = SelectContainsKernel; + static thread_local int step_counter = 0; + static thread_local float load_factor = 0.0; + + if (((step_counter++) % kernel_select_interval_) == 0) { + load_factor = fast_load_factor(0, stream, false); + } + Selector::execute_kernel(load_factor, options_.block_size, + options_.max_bucket_size, table_->buckets_num, + options_.dim, stream, n, d_table_, + table_->buckets, keys, founds); + } + CudaCheckError(); + } + + void erase(const size_type n, const key_type* keys, cudaStream_t stream = 0) { + if (n == 0) { + return; + } + + update_read_lock lock(mutex_, stream); + + { + const size_t block_size = options_.block_size; + const size_t N = n * TILE_SIZE; + const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size); + + remove_kernel + <<>>( + d_table_, keys, table_->buckets, table_->buckets_size, + table_->bucket_max_size, table_->buckets_num, N); + } + + CudaCheckError(); + return; + } + + template