diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc
index 89945abf7f0..07cd1a1f6f2 100644
--- a/cache/cache_bench_tool.cc
+++ b/cache/cache_bench_tool.cc
@@ -48,6 +48,10 @@ DEFINE_uint64(cache_size, 1 * GiB,
               "Number of bytes to use as a cache of uncompressed data.");
 DEFINE_int32(num_shard_bits, -1,
              "ShardedCacheOptions::shard_bits. Default = auto");
+DEFINE_int32(
+    eviction_effort_cap,
+    ROCKSDB_NAMESPACE::HyperClockCacheOptions(1, 1).eviction_effort_cap,
+    "HyperClockCacheOptions::eviction_effort_cap");
 DEFINE_double(resident_ratio, 0.25,
               "Ratio of keys fitting in cache to keyspace.");
 
@@ -391,6 +395,7 @@ class CacheBench {
         FLAGS_cache_size, /*estimated_entry_charge=*/0, FLAGS_num_shard_bits);
     opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
     opts.memory_allocator = allocator;
+    opts.eviction_effort_cap = FLAGS_eviction_effort_cap;
     if (FLAGS_cache_type == "fixed_hyper_clock_cache" ||
         FLAGS_cache_type == "hyper_clock_cache") {
       opts.estimated_entry_charge = FLAGS_value_bytes_estimate > 0
diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc
index fd330d90d83..017f6a222f3 100644
--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
@@ -93,7 +93,8 @@ inline void Unref(const ClockHandle& h, uint64_t count = 1) {
   (void)old_meta;
 }
 
-inline bool ClockUpdate(ClockHandle& h, bool* purgeable = nullptr) {
+inline bool ClockUpdate(ClockHandle& h, BaseClockTable::EvictionData* data,
+                        bool* purgeable = nullptr) {
   uint64_t meta;
   if (purgeable) {
     assert(*purgeable == false);
@@ -125,6 +126,7 @@ inline bool ClockUpdate(ClockHandle& h, bool* purgeable = nullptr) {
       (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask;
   if (acquire_count != release_count) {
     // Only clock update entries with no outstanding refs
+    data->seen_pinned_count++;
     return false;
   }
   if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) &&
@@ -148,6 +150,8 @@ inline bool ClockUpdate(ClockHandle& h, bool* purgeable = nullptr) {
                << ClockHandle::kStateShift) |
               (meta & ClockHandle::kHitBitMask))) {
     // Took ownership.
+    data->freed_charge += h.GetTotalCharge();
+    data->freed_count += 1;
     return true;
   } else {
     // Compare-exchange failing probably
@@ -529,11 +533,7 @@ inline bool BaseClockTable::ChargeUsageMaybeEvictNonStrict(
   return true;
 }
 
-void BaseClockTable::TrackAndReleaseEvictedEntry(
-    ClockHandle* h, BaseClockTable::EvictionData* data) {
-  data->freed_charge += h->GetTotalCharge();
-  data->freed_count += 1;
-
+void BaseClockTable::TrackAndReleaseEvictedEntry(ClockHandle* h) {
   bool took_value_ownership = false;
   if (eviction_callback_) {
     // For key reconstructed from hash
@@ -550,6 +550,14 @@ void BaseClockTable::TrackAndReleaseEvictedEntry(
   MarkEmpty(*h);
 }
 
+bool BaseClockTable::IsEvictionEffortExceeded(const EvictionData& data) const {
+  // Basically checks whether the ratio of useful effort to wasted effort is
+  // too low, with a start-up allowance for wasted effort before any useful
+  // effort.
+  return (data.freed_count + 1) * eviction_effort_cap_ <=
+         data.seen_pinned_count;
+}
+
 template <class Table>
 Status BaseClockTable::Insert(const ClockHandleBasicData& proto,
                               typename Table::HandleImpl** handle,
@@ -692,7 +700,7 @@ FixedHyperClockTable::FixedHyperClockTable(
     MemoryAllocator* allocator,
     const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed,
     const Opts& opts)
-    : BaseClockTable(metadata_charge_policy, allocator, eviction_callback,
+    : BaseClockTable(opts, metadata_charge_policy, allocator, eviction_callback,
                      hash_seed),
       length_bits_(CalcHashBits(capacity, opts.estimated_value_size,
                                 metadata_charge_policy)),
@@ -1104,10 +1112,10 @@ inline void FixedHyperClockTable::Evict(size_t requested_charge, InsertState&,
   for (;;) {
     for (size_t i = 0; i < step_size; i++) {
       HandleImpl& h = array_[ModTableSize(Lower32of64(old_clock_pointer + i))];
-      bool evicting = ClockUpdate(h);
+      bool evicting = ClockUpdate(h, data);
       if (evicting) {
         Rollback(h.hashed_key, &h);
-        TrackAndReleaseEvictedEntry(&h, data);
+        TrackAndReleaseEvictedEntry(&h);
       }
     }
 
@@ -1118,6 +1126,9 @@ inline void FixedHyperClockTable::Evict(size_t requested_charge, InsertState&,
     if (old_clock_pointer >= max_clock_pointer) {
       return;
     }
+    if (IsEvictionEffortExceeded(*data)) {
+      return;
+    }
 
     // Advance clock pointer (concurrently)
     old_clock_pointer = clock_pointer_.FetchAddRelaxed(step_size);
@@ -1912,7 +1923,7 @@ AutoHyperClockTable::AutoHyperClockTable(
     MemoryAllocator* allocator,
     const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed,
     const Opts& opts)
-    : BaseClockTable(metadata_charge_policy, allocator, eviction_callback,
+    : BaseClockTable(opts, metadata_charge_policy, allocator, eviction_callback,
                      hash_seed),
       array_(MemMapping::AllocateLazyZeroed(
           sizeof(HandleImpl) * CalcMaxUsableLength(capacity,
@@ -2589,7 +2600,8 @@ using ClockUpdateChainLockedOpData =
 template <class OpData>
 void AutoHyperClockTable::PurgeImplLocked(OpData* op_data,
                                           ChainRewriteLock& rewrite_lock,
-                                          size_t home) {
+                                          size_t home,
+                                          BaseClockTable::EvictionData* data) {
   constexpr bool kIsPurge = std::is_same_v<OpData, PurgeLockedOpData>;
   constexpr bool kIsClockUpdateChain =
       std::is_same_v<OpData, ClockUpdateChainLockedOpData>;
@@ -2631,7 +2643,7 @@ void AutoHyperClockTable::PurgeImplLocked(OpData* op_data,
       assert(home == BottomNBits(h->hashed_key[1], home_shift));
       if constexpr (kIsClockUpdateChain) {
         // Clock update and/or check for purgeable (under (de)construction)
-        if (ClockUpdate(*h, &purgeable)) {
+        if (ClockUpdate(*h, data, &purgeable)) {
           // Remember for finishing eviction
           op_data->push_back(h);
           // Entries for eviction become purgeable
@@ -2718,7 +2730,8 @@ using PurgeOpData = const UniqueId64x2;
 using ClockUpdateChainOpData = ClockUpdateChainLockedOpData;
 
 template <class OpData>
-void AutoHyperClockTable::PurgeImpl(OpData* op_data, size_t home) {
+void AutoHyperClockTable::PurgeImpl(OpData* op_data, size_t home,
+                                    BaseClockTable::EvictionData* data) {
   // Early efforts to make AutoHCC fully wait-free ran into too many problems
   // that needed obscure and potentially inefficient work-arounds to have a
   // chance at working.
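
To make the IsEvictionEffortExceeded() arithmetic above concrete, here is a
standalone sketch of the same check (EffortExceeded, its parameter names, and
the sample numbers are illustrative stand-ins for the members of
BaseClockTable::EvictionData and eviction_effort_cap_, not part of this patch):

// Sketch of the effort-cap check, assuming cap >= 1 (BaseOpts clamps it).
#include <cassert>
#include <cstddef>

bool EffortExceeded(size_t freed_count, size_t seen_pinned_count, int cap) {
  // Abort eviction once wasted effort (pinned entries scanned) outweighs
  // useful effort (entries freed) by more than `cap`; the "+ 1" grants a
  // start-up allowance of `cap` pinned entries before anything is freed.
  return (freed_count + 1) * static_cast<size_t>(cap) <= seen_pinned_count;
}

int main() {
  // With the default cap of 30: give up after scanning 30 pinned entries
  // with nothing freed...
  assert(EffortExceeded(0, 30, 30));
  // ...but after freeing one entry, tolerate up to 59 pinned entries,
  // since (1 + 1) * 30 = 60 > 59; abort again at 60.
  assert(!EffortExceeded(1, 59, 30));
  assert(EffortExceeded(1, 60, 30));
  return 0;
}
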
@@ -2799,9 +2812,9 @@ void AutoHyperClockTable::PurgeImpl(OpData* op_data, size_t home) {
   if (!rewrite_lock.IsEnd()) {
     if constexpr (kIsPurge) {
       PurgeLockedOpData* locked_op_data{};
-      PurgeImplLocked(locked_op_data, rewrite_lock, home);
+      PurgeImplLocked(locked_op_data, rewrite_lock, home, data);
     } else {
-      PurgeImplLocked(op_data, rewrite_lock, home);
+      PurgeImplLocked(op_data, rewrite_lock, home, data);
     }
   }
 }
@@ -3462,12 +3475,12 @@ void AutoHyperClockTable::Evict(size_t requested_charge, InsertState& state,
       if (home >= used_length) {
         break;
       }
-      PurgeImpl(&to_finish_eviction, home);
+      PurgeImpl(&to_finish_eviction, home, data);
     }
   }
 
   for (HandleImpl* h : to_finish_eviction) {
-    TrackAndReleaseEvictedEntry(h, data);
+    TrackAndReleaseEvictedEntry(h);
     // NOTE: setting likely_empty_slot here can cause us to reduce the
     // portion of "at home" entries, probably because an evicted entry
     // is more likely to come back than a random new entry and would be
@@ -3495,6 +3508,10 @@ void AutoHyperClockTable::Evict(size_t requested_charge, InsertState& state,
     if (old_clock_pointer + step_size >= max_clock_pointer) {
       return;
     }
+
+    if (IsEvictionEffortExceeded(*data)) {
+      return;
+    }
   }
 }
diff --git a/cache/clock_cache.h b/cache/clock_cache.h
index 3086e7e972f..8e4f23edafb 100644
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@@ -374,13 +374,25 @@ struct ClockHandle : public ClockHandleBasicData {
 
 class BaseClockTable {
  public:
-  BaseClockTable(CacheMetadataChargePolicy metadata_charge_policy,
+  struct BaseOpts {
+    explicit BaseOpts(int _eviction_effort_cap)
+        : eviction_effort_cap(_eviction_effort_cap) {
+      eviction_effort_cap = std::max(int{1}, _eviction_effort_cap);
+    }
+    explicit BaseOpts(const HyperClockCacheOptions& opts)
+        : BaseOpts(opts.eviction_effort_cap) {}
+    int eviction_effort_cap;
+  };
+
+  BaseClockTable(const BaseOpts& opts,
+                 CacheMetadataChargePolicy metadata_charge_policy,
                  MemoryAllocator* allocator,
                  const Cache::EvictionCallback* eviction_callback,
                  const uint32_t* hash_seed)
       : metadata_charge_policy_(metadata_charge_policy),
         allocator_(allocator),
         eviction_callback_(*eviction_callback),
+        eviction_effort_cap_(opts.eviction_effort_cap),
         hash_seed_(*hash_seed) {}
 
   template <class Table>
@@ -409,9 +421,12 @@ class BaseClockTable {
   struct EvictionData {
     size_t freed_charge = 0;
     size_t freed_count = 0;
+    size_t seen_pinned_count = 0;
   };
 
-  void TrackAndReleaseEvictedEntry(ClockHandle* h, EvictionData* data);
+  void TrackAndReleaseEvictedEntry(ClockHandle* h);
+
+  bool IsEvictionEffortExceeded(const EvictionData& data) const;
 
 #ifndef NDEBUG
   // Acquire N references
@@ -450,7 +465,6 @@ class BaseClockTable {
   bool ChargeUsageMaybeEvictNonStrict(size_t total_charge, size_t capacity,
                                       bool need_evict_for_occupancy,
                                       typename Table::InsertState& state);
-
  protected:  // data
   // We partition the following members into different cache lines
   // to avoid false sharing among Lookup, Release, Erase and Insert
@@ -484,6 +498,9 @@ class BaseClockTable {
   // A reference to Cache::eviction_callback_
   const Cache::EvictionCallback& eviction_callback_;
 
+  // See HyperClockCacheOptions::eviction_effort_cap
+  int eviction_effort_cap_;
+
   // A reference to ShardedCacheBase::hash_seed_
   const uint32_t& hash_seed_;
 };
@@ -517,10 +534,12 @@ class FixedHyperClockTable : public BaseClockTable {
     inline void SetStandalone() { standalone = true; }
   };  // struct HandleImpl
 
-  struct Opts {
-    explicit Opts(size_t _estimated_value_size)
-        : estimated_value_size(_estimated_value_size) {}
-    explicit Opts(const HyperClockCacheOptions& opts) {
+  struct Opts : public BaseOpts {
+    explicit Opts(size_t _estimated_value_size, int _eviction_effort_cap)
+        : BaseOpts(_eviction_effort_cap),
+          estimated_value_size(_estimated_value_size) {}
+    explicit Opts(const HyperClockCacheOptions& opts)
+        : BaseOpts(opts.eviction_effort_cap) {
       assert(opts.estimated_entry_charge > 0);
       estimated_value_size = opts.estimated_entry_charge;
     }
@@ -803,11 +822,13 @@ class AutoHyperClockTable : public BaseClockTable {
     }
   };  // struct HandleImpl
 
-  struct Opts {
-    explicit Opts(size_t _min_avg_value_size)
-        : min_avg_value_size(_min_avg_value_size) {}
+  struct Opts : public BaseOpts {
+    explicit Opts(size_t _min_avg_value_size, int _eviction_effort_cap)
+        : BaseOpts(_eviction_effort_cap),
+          min_avg_value_size(_min_avg_value_size) {}
 
-    explicit Opts(const HyperClockCacheOptions& opts) {
+    explicit Opts(const HyperClockCacheOptions& opts)
+        : BaseOpts(opts.eviction_effort_cap) {
       assert(opts.estimated_entry_charge == 0);
       min_avg_value_size = opts.min_avg_entry_charge;
     }
@@ -906,7 +927,8 @@ class AutoHyperClockTable : public BaseClockTable {
   // with proper handling to ensure all existing data is seen even in the
   // presence of concurrent insertions, etc. (See implementation.)
   template <class OpData>
-  void PurgeImpl(OpData* op_data, size_t home = SIZE_MAX);
+  void PurgeImpl(OpData* op_data, size_t home = SIZE_MAX,
+                 EvictionData* data = nullptr);
 
   // An RAII wrapper for locking a chain of entries for removals. See
   // implementation.
@@ -916,7 +938,7 @@ class AutoHyperClockTable : public BaseClockTable {
   // implementation.
   template <class OpData>
   void PurgeImplLocked(OpData* op_data, ChainRewriteLock& rewrite_lock,
-                       size_t home);
+                       size_t home, EvictionData* data);
 
   // Update length_info_ as much as possible without waiting, given a known
   // usable (ready for inserts and lookups) grow_home. (Previous grow_homes
diff --git a/cache/compressed_secondary_cache_test.cc b/cache/compressed_secondary_cache_test.cc
index d72680b845e..79f40868a7e 100644
--- a/cache/compressed_secondary_cache_test.cc
+++ b/cache/compressed_secondary_cache_test.cc
@@ -992,6 +992,8 @@ class CompressedSecCacheTestWithTiered
         /*_capacity=*/0,
         /*_estimated_entry_charge=*/256 << 10,
         /*_num_shard_bits=*/0);
+    // eviction_effort_cap setting simply to avoid churn in existing test
+    hcc_opts.eviction_effort_cap = 100;
     TieredCacheOptions opts;
     lru_opts.capacity = 0;
     lru_opts.num_shard_bits = 0;
diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc
index 27fd5cc854b..6233f2388f1 100644
--- a/cache/lru_cache_test.cc
+++ b/cache/lru_cache_test.cc
@@ -389,12 +389,13 @@ class ClockCacheTest : public testing::Test {
     }
   }
 
-  void NewShard(size_t capacity, bool strict_capacity_limit = true) {
+  void NewShard(size_t capacity, bool strict_capacity_limit = true,
+                int eviction_effort_cap = 30) {
     DeleteShard();
     shard_ = reinterpret_cast<Shard*>(
        port::cacheline_aligned_alloc(sizeof(Shard)));
 
-    TableOpts opts{1 /*value_size*/};
+    TableOpts opts{1 /*value_size*/, eviction_effort_cap};
     new (shard_)
         Shard(capacity, strict_capacity_limit, kDontChargeCacheMetadata,
               /*allocator*/ nullptr, &eviction_callback_, &hash_seed_, opts);
@@ -445,12 +446,20 @@ class ClockCacheTest : public testing::Test {
     return Slice(reinterpret_cast<const char*>(&hashed_key), 16U);
   }
 
+  // A bad hash function for testing / stressing collision handling
   static inline UniqueId64x2 TestHashedKey(char key) {
     // For testing hash near-collision behavior, put the variance in
     // hashed_key in bits that are unlikely to be used as hash bits.
     return {(static_cast<uint64_t>(key) << 56) + 1234U, 5678U};
   }
 
+  // A reasonable hash function, for testing "typical behavior" etc.
+  template <typename T>
+  static inline UniqueId64x2 CheapHash(T i) {
+    return {static_cast<uint64_t>(i) * uint64_t{0x85EBCA77C2B2AE63},
+            static_cast<uint64_t>(i) * uint64_t{0xC2B2AE3D27D4EB4F}};
+  }
+
   Shard* shard_ = nullptr;
 
  private:
@@ -683,6 +692,43 @@ TYPED_TEST(ClockCacheTest, ClockEvictionTest) {
   }
 }
 
+TYPED_TEST(ClockCacheTest, ClockEvictionEffortCapTest) {
+  using HandleImpl = typename ClockCacheTest<TypeParam>::Shard::HandleImpl;
+  for (int eec : {-42, 0, 1, 10, 100, 1000}) {
+    SCOPED_TRACE("eviction_effort_cap = " + std::to_string(eec));
+    constexpr size_t kCapacity = 1000;
+    // Start with much larger capacity to ensure that we can go way over
+    // capacity without reaching table occupancy limit.
+    this->NewShard(3 * kCapacity, /*strict_capacity_limit=*/false, eec);
+    auto& shard = *this->shard_;
+    shard.SetCapacity(kCapacity);
+
+    // Nearly fill the cache with pinned entries, then add a bunch of
+    // non-pinned entries. eviction_effort_cap should affect how many
+    // entries are present beyond the cache capacity, despite being
+    // evictable.
+    constexpr size_t kCount = kCapacity - 1;
+    std::unique_ptr<HandleImpl*[]> ha{new HandleImpl* [kCount] {}};
+    for (size_t i = 0; i < 2 * kCount; ++i) {
+      UniqueId64x2 hkey = this->CheapHash(i);
+      ASSERT_OK(shard.Insert(
+          this->TestKey(hkey), hkey, nullptr /*value*/, &kNoopCacheItemHelper,
+          1 /*charge*/, i < kCount ? &ha[i] : nullptr, Cache::Priority::LOW));
+    }
+
+    // Rough inverse relationship between cap and possible memory
+    // explosion, which shows up as increased table occupancy count.
+    int effective_eec = std::max(int{1}, eec) + 1;
+    EXPECT_NEAR(shard.GetOccupancyCount() * 1.0,
+                kCount * (1 + 1.4 / effective_eec),
+                kCount * (0.6 / effective_eec) + 1.0);
+
+    for (size_t i = 0; i < kCount; ++i) {
+      shard.Release(ha[i]);
+    }
+  }
+}
+
 namespace {
 struct DeleteCounter {
   int deleted = 0;
diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h
index 43a422049dc..2a358504e88 100644
--- a/include/rocksdb/cache.h
+++ b/include/rocksdb/cache.h
@@ -380,9 +380,6 @@ inline std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
 //   to find the appropriate balance automatically.
 // * Cache priorities are less aggressively enforced, which could cause
 //   cache dilution from long range scans (unless they use fill_cache=false).
-// * Can be worse for small caches, because if almost all of a cache shard is
-//   pinned (more likely with non-partitioned filters), then CLOCK eviction
-//   becomes very CPU intensive.
 //
 // See internal cache/clock_cache.h for full description.
 struct HyperClockCacheOptions : public ShardedCacheOptions {
@@ -441,6 +438,43 @@ struct HyperClockCacheOptions : public ShardedCacheOptions {
   // load factor for efficient Lookup, Insert, etc.
   size_t min_avg_entry_charge = 450;
 
+  // A tuning parameter to cap eviction CPU usage in a "thrashing" situation
+  // by allowing the memory capacity to be exceeded slightly as needed. The
+  // default setting should offer balanced protection against excessive CPU
+  // and memory usage under extreme stress conditions, with no effect on
+  // normal operation. Such stress conditions are proportionally more likely
+  // with small caches (10s of MB or less) vs. large caches (GB-scale).
+  // (NOTE: With the unusual setting of strict_capacity_limit=true, this
+  // parameter is ignored.)
+  //
+  // BACKGROUND: Without some kind of limiter, inserting into a CLOCK-based
+  // cache with no evictable entries (all "pinned") requires scanning the
+  // entire cache to determine that nothing can be evicted. (By contrast,
+  // LRU caches can determine no entries are evictable in O(1) time, but
+  // require more synchronization/coordination on that eviction metadata.)
+  // This aspect of a CLOCK cache can make a stressed situation worse by
+  // bogging down the CPU with repeated scans of the cache. And with
+  // strict_capacity_limit=false (normal setting), finding something evictable
+  // doesn't change the outcome of insertion: the entry is inserted anyway
+  // and the cache is allowed to exceed its target capacity if necessary.
+  //
+  // SOLUTION: Eviction is aborted upon seeing some number of pinned
+  // entries before evicting anything, or if the ratio of pinned to evicted
+  // is too high. This setting `eviction_effort_cap` essentially controls both
+  // that allowed initial number of pinned entries and the maximum allowed
+  // ratio. As the pinned size approaches the target cache capacity, roughly
+  // 1/eviction_effort_cap additional portion of the capacity might be kept
+  // in memory and evictable in order to keep CLOCK eviction reasonably
+  // performant. Under the default setting and high stress conditions, this
+  // memory overhead is around 3-5%. Under normal or even moderate stress
+  // conditions, the memory overhead is negligible to zero.
+  //
+  // A large value like 1000 offers some protection with essentially no
+  // memory overhead, while the minimum value of 1 could be useful for a
+  // small cache where roughly doubling in size under stress could be OK to
+  // keep operations very fast.
+  int eviction_effort_cap = 30;
+
   HyperClockCacheOptions(
       size_t _capacity, size_t _estimated_entry_charge,
       int _num_shard_bits = -1, bool _strict_capacity_limit = false,
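
As a usage sketch of the new option (the capacity, cap value, and helper name
below are illustrative assumptions, not part of this patch):

#include <memory>

#include "rocksdb/cache.h"

// Hypothetical helper: a small cache that trades a little memory headroom
// under stress for a tighter bound on eviction CPU.
std::shared_ptr<ROCKSDB_NAMESPACE::Cache> MakeSmallStressTolerantCache() {
  // 32 MiB capacity; estimated_entry_charge=0 selects the automatic
  // (AutoHyperClockTable) sizing variant.
  ROCKSDB_NAMESPACE::HyperClockCacheOptions opts(
      /*_capacity=*/size_t{32} << 20, /*_estimated_entry_charge=*/0);
  // Below the default of 30: abort eviction scans sooner, accepting a
  // somewhat larger possible overshoot of the target capacity.
  opts.eviction_effort_cap = 10;
  return opts.MakeSharedCache();
}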