From c4be62edc86052610e7e301c025c9581d858d0b3 Mon Sep 17 00:00:00 2001 From: Borys Date: Sun, 29 Dec 2024 09:37:08 +0200 Subject: [PATCH 01/20] fix: return value for DflyMigrateAck (#4379) --- src/server/cluster/cluster_family.cc | 2 +- src/server/cluster/outgoing_slot_migration.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/server/cluster/cluster_family.cc b/src/server/cluster/cluster_family.cc index db6d41bd5c03..b76cb5ab404c 100644 --- a/src/server/cluster/cluster_family.cc +++ b/src/server/cluster/cluster_family.cc @@ -1012,7 +1012,7 @@ void ClusterFamily::DflyMigrateAck(CmdArgList args, SinkReplyBuilder* builder) { [source_id = source_id](const auto& m) { return m.node_info.id == source_id; }); if (m_it == in_migrations.end()) { LOG(WARNING) << "migration isn't in config"; - return builder->SendLong(OutgoingMigration::kInvalidAttempt); + return builder->SendError(OutgoingMigration::kUnknownMigration); } auto migration = GetIncomingMigration(source_id); diff --git a/src/server/cluster/outgoing_slot_migration.h b/src/server/cluster/outgoing_slot_migration.h index d1ce69f15143..eda8ef441927 100644 --- a/src/server/cluster/outgoing_slot_migration.h +++ b/src/server/cluster/outgoing_slot_migration.h @@ -59,7 +59,6 @@ class OutgoingMigration : private ProtocolClient { size_t GetKeyCount() const ABSL_LOCKS_EXCLUDED(state_mu_); - static constexpr long kInvalidAttempt = -1; static constexpr std::string_view kUnknownMigration = "UNKNOWN_MIGRATION"; private: From 22994cf3b7b9134adcd384a44c6dd712d67f1b73 Mon Sep 17 00:00:00 2001 From: adiholden Date: Mon, 30 Dec 2024 11:52:58 +0200 Subject: [PATCH 02/20] chore(server): cleanup replication shard sync execution (#4374) chore server: cleanup replication shard sync execution Signed-off-by: adi_holden --- src/server/cluster/incoming_slot_migration.cc | 5 ++--- src/server/journal/tx_executor.cc | 1 - src/server/journal/tx_executor.h | 1 - src/server/journal/types.h | 3 ++- src/server/replica.cc | 3 ++- src/server/string_family.cc | 13 ++++--------- src/server/transaction.cc | 18 +++--------------- src/server/transaction.h | 2 +- 8 files changed, 14 insertions(+), 32 deletions(-) diff --git a/src/server/cluster/incoming_slot_migration.cc b/src/server/cluster/incoming_slot_migration.cc index 0ed07768a14d..0ca02598b6a5 100644 --- a/src/server/cluster/incoming_slot_migration.cc +++ b/src/server/cluster/incoming_slot_migration.cc @@ -94,7 +94,7 @@ class ClusterShardMigration { if (tx_data->opcode == journal::Op::PING) { // TODO check about ping logic } else { - ExecuteTxWithNoShardSync(std::move(*tx_data), cntx); + ExecuteTx(std::move(*tx_data), cntx); } } @@ -125,11 +125,10 @@ class ClusterShardMigration { } private: - void ExecuteTxWithNoShardSync(TransactionData&& tx_data, Context* cntx) { + void ExecuteTx(TransactionData&& tx_data, Context* cntx) { if (cntx->IsCancelled()) { return; } - CHECK(tx_data.shard_cnt <= 1); // we don't support sync for multishard execution if (!tx_data.IsGlobalCmd()) { executor_.Execute(tx_data.dbid, tx_data.command); } else { diff --git a/src/server/journal/tx_executor.cc b/src/server/journal/tx_executor.cc index 734a59637c61..f163f69736bd 100644 --- a/src/server/journal/tx_executor.cc +++ b/src/server/journal/tx_executor.cc @@ -60,7 +60,6 @@ void TransactionData::AddEntry(journal::ParsedEntry&& entry) { case journal::Op::EXPIRED: case journal::Op::COMMAND: command = std::move(entry.cmd); - shard_cnt = entry.shard_cnt; dbid = entry.dbid; txid = entry.txid; return; diff --git 
a/src/server/journal/tx_executor.h b/src/server/journal/tx_executor.h index d4c401a37312..4ed9d2ec9777 100644 --- a/src/server/journal/tx_executor.h +++ b/src/server/journal/tx_executor.h @@ -47,7 +47,6 @@ struct TransactionData { TxId txid{0}; DbIndex dbid{0}; - uint32_t shard_cnt{0}; journal::ParsedEntry::CmdData command; journal::Op opcode = journal::Op::NOOP; diff --git a/src/server/journal/types.h b/src/server/journal/types.h index f03af1409fe0..b460cfeb542b 100644 --- a/src/server/journal/types.h +++ b/src/server/journal/types.h @@ -28,7 +28,8 @@ struct EntryBase { TxId txid; Op opcode; DbIndex dbid; - uint32_t shard_cnt; + uint32_t shard_cnt; // This field is no longer used by the replica, but we continue to serialize + // and deserialize it to maintain backward compatibility. std::optional slot; LSN lsn{0}; }; diff --git a/src/server/replica.cc b/src/server/replica.cc index 928ad610406e..977ec0f885d4 100644 --- a/src/server/replica.cc +++ b/src/server/replica.cc @@ -966,7 +966,8 @@ void DflyShardReplica::ExecuteTx(TransactionData&& tx_data, Context* cntx) { return; } - bool inserted_by_me = multi_shard_exe_->InsertTxToSharedMap(tx_data.txid, tx_data.shard_cnt); + bool inserted_by_me = + multi_shard_exe_->InsertTxToSharedMap(tx_data.txid, master_context_.num_flows); auto& multi_shard_data = multi_shard_exe_->Find(tx_data.txid); diff --git a/src/server/string_family.cc b/src/server/string_family.cc index 537611c5f39f..ac4679d5f321 100644 --- a/src/server/string_family.cc +++ b/src/server/string_family.cc @@ -399,17 +399,12 @@ OpStatus OpMSet(const OpArgs& op_args, const ShardArgs& args) { if (stored * 2 == args.Size()) { RecordJournal(op_args, "MSET", args, op_args.tx->GetUniqueShardCnt()); DCHECK_EQ(result, OpStatus::OK); - return result; + } else if (stored > 0) { + vector store_args(args.begin(), args.end()); + store_args.resize(stored * 2); + RecordJournal(op_args, "MSET", store_args, op_args.tx->GetUniqueShardCnt()); } - - // Even without changes, we have to send a dummy command like PING for the - // replica to ack - string_view cmd = stored == 0 ? "PING" : "MSET"; - vector store_args(args.begin(), args.end()); - store_args.resize(stored * 2); - RecordJournal(op_args, cmd, store_args, op_args.tx->GetUniqueShardCnt()); } - return result; } diff --git a/src/server/transaction.cc b/src/server/transaction.cc index 9abf8b346557..3e180f871a49 100644 --- a/src/server/transaction.cc +++ b/src/server/transaction.cc @@ -704,7 +704,7 @@ void Transaction::RunCallback(EngineShard* shard) { // Log to journal only once the command finished running if ((coordinator_state_ & COORD_CONCLUDING) || (multi_ && multi_->concluding)) { - LogAutoJournalOnShard(shard, result); + LogAutoJournalOnShard(shard); MaybeInvokeTrackingCb(); } } @@ -1346,7 +1346,7 @@ OpStatus Transaction::RunSquashedMultiCb(RunnableType cb) { auto result = cb(this, shard); db_slice.OnCbFinish(); - LogAutoJournalOnShard(shard, result); + LogAutoJournalOnShard(shard); MaybeInvokeTrackingCb(); DCHECK_EQ(result.flags, 0); // if it's sophisticated, we shouldn't squash it @@ -1438,7 +1438,7 @@ optional Transaction::GetWakeKey(ShardId sid) const { return ArgS(full_args_, sd.wake_key_pos); } -void Transaction::LogAutoJournalOnShard(EngineShard* shard, RunnableResult result) { +void Transaction::LogAutoJournalOnShard(EngineShard* shard) { // TODO: For now, we ignore non shard coordination. 
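// A null shard means the hop did not run on a concrete shard, so there is nothing to journal.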
if (shard == nullptr) return; @@ -1455,20 +1455,8 @@ void Transaction::LogAutoJournalOnShard(EngineShard* shard, RunnableResult resul if (journal == nullptr) return; - if (result.status != OpStatus::OK) { - // We log NOOP even for NO_AUTOJOURNAL commands because the non-success status could have been - // due to OOM in a single shard, while other shards succeeded - journal->RecordEntry(txid_, journal::Op::NOOP, db_index_, unique_shard_cnt_, - unique_slot_checker_.GetUniqueSlotId(), journal::Entry::Payload{}, true); - return; - } - // If autojournaling was disabled and not re-enabled the callback is writing to journal. - // We do not allow preemption in callbacks and therefor the call to RecordJournal from - // from callbacks does not allow await. - // To make sure we flush the changes to sync we call TriggerJournalWriteToSink here. if ((cid_->opt_mask() & CO::NO_AUTOJOURNAL) && !re_enabled_auto_journal_) { - TriggerJournalWriteToSink(); return; } diff --git a/src/server/transaction.h b/src/server/transaction.h index 5e209eb96d98..c0eb37ff8042 100644 --- a/src/server/transaction.h +++ b/src/server/transaction.h @@ -559,7 +559,7 @@ class Transaction { // Log command in shard's journal, if this is a write command with auto-journaling enabled. // Should be called immediately after the last hop. - void LogAutoJournalOnShard(EngineShard* shard, RunnableResult shard_result); + void LogAutoJournalOnShard(EngineShard* shard); // Whether the callback can be run directly on this thread without dispatching on the shard queue bool CanRunInlined() const; From b7532538fee6746511c9f61e9798e219bdfaa45d Mon Sep 17 00:00:00 2001 From: Borys Date: Mon, 30 Dec 2024 14:02:12 +0200 Subject: [PATCH 03/20] test: fix test_network_disconnect_during_migration (#4378) --- tests/dragonfly/cluster_test.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/dragonfly/cluster_test.py b/tests/dragonfly/cluster_test.py index 230dc710d490..b90d486650a2 100644 --- a/tests/dragonfly/cluster_test.py +++ b/tests/dragonfly/cluster_test.py @@ -1433,8 +1433,7 @@ async def test_migration_with_key_ttl(df_factory): assert await nodes[1].client.execute_command("stick k_sticky") == 0 -@pytest.mark.skip("Flaky test") -@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"}) +@dfly_args({"proactor_threads": 4, "cluster_mode": "yes", "migration_finalization_timeout_ms": 5}) async def test_network_disconnect_during_migration(df_factory): instances = [ df_factory.create( @@ -1473,7 +1472,7 @@ async def test_network_disconnect_during_migration(df_factory): await nodes[0].admin_client.execute_command("DFLYCLUSTER", "SLOT-MIGRATION-STATUS") ) - await wait_for_status(nodes[0].admin_client, nodes[1].id, "SYNC") + await wait_for_status(nodes[0].admin_client, nodes[1].id, "SYNC", 20) finally: await proxy.close(task) From 5e7acbb396080bf26914b8077ab93faba47c3706 Mon Sep 17 00:00:00 2001 From: Stepan Bagritsevich <43710058+BagritsevichStepan@users.noreply.github.com> Date: Mon, 30 Dec 2024 17:38:57 +0400 Subject: [PATCH 04/20] fix(snapshot_test): Fix test_big_value_serialization_memory_limit after adding streams support (#4383) * fix(snapshot_test): Fix test_big_value_serialization_memory_limit after adding streams support Signed-off-by: Stepan Bagritsevich * feat: Temporary run test 30 times Signed-off-by: Stepan Bagritsevich * fix: Remove repeat Signed-off-by: Stepan Bagritsevich * Revert "fix: Remove repeat" This reverts commit b7f1ed90b21c313d0018d7fc16bcfde5609aae03. 
* Revert "feat: Temporary run test 30 times" This reverts commit 78bfb0fc2607bf364700c74ad1584470e1bcdae3. * Revert "fix(snapshot_test): Fix test_big_value_serialization_memory_limit after adding streams support" This reverts commit f33036db2ac9186ab900f9fa48e4343524eacc9c. * refactor: address comments Signed-off-by: Stepan Bagritsevich --------- Signed-off-by: Stepan Bagritsevich --- tests/dragonfly/snapshot_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/dragonfly/snapshot_test.py b/tests/dragonfly/snapshot_test.py index cb3f658fc369..81fd7f7fc00a 100644 --- a/tests/dragonfly/snapshot_test.py +++ b/tests/dragonfly/snapshot_test.py @@ -574,7 +574,6 @@ async def test_tiered_entries_throttle(async_client: aioredis.Redis): ("SET"), ("ZSET"), ("LIST"), - ("STREAM"), ], ) @pytest.mark.slow From 2abe2c0ac2d921f539e1dfe52b9c739818350bf6 Mon Sep 17 00:00:00 2001 From: Roman Gershman Date: Tue, 31 Dec 2024 18:41:14 +0200 Subject: [PATCH 05/20] fix: ExternalAllocator::Free with large sizes (#4388) ExternalAllocator allocates large sizes directly from the extent tree bypassing segment data structure. Unfortunately, we forgot to align Free() the same way. This PR: 1. Make sure that we add back the allocated range to the extent tree in Free. 2. Rewrite and simplify ExtentTree::Add implementation. Signed-off-by: Roman Gershman --- src/core/extent_tree.cc | 50 ++++++++++------------- src/core/extent_tree_test.cc | 12 +++--- src/server/tiering/external_alloc.cc | 6 +++ src/server/tiering/external_alloc_test.cc | 8 ++++ 4 files changed, 41 insertions(+), 35 deletions(-) diff --git a/src/core/extent_tree.cc b/src/core/extent_tree.cc index 555dddf654fe..f05bb84750f6 100644 --- a/src/core/extent_tree.cc +++ b/src/core/extent_tree.cc @@ -15,47 +15,39 @@ void ExtentTree::Add(size_t start, size_t len) { DCHECK_GT(len, 0u); DCHECK_EQ(len_extents_.size(), extents_.size()); - size_t end = start + len; - - if (extents_.empty()) { - extents_.emplace(start, end); - len_extents_.emplace(len, start); - - return; - } - auto it = extents_.lower_bound(start); - bool merged = false; + optional::iterator> prev_extent; if (it != extents_.begin()) { auto prev = it; --prev; DCHECK_LE(prev->second, start); - if (prev->second == start) { // [first, second = start, end) - merged = true; - len_extents_.erase(pair{prev->second - prev->first, prev->first}); - - // check if we join: prev, [start, end), [it->first, it->second] - if (it != extents_.end() && end == it->first) { // [first, end = it->first, it->second) - prev->second = it->second; - len_extents_.erase(pair{it->second - it->first, it->first}); - extents_.erase(it); - } else { - prev->second = end; // just extend prev - } - len_extents_.emplace(prev->second - prev->first, prev->first); + if (prev->second == start) { // combine with the previous extent + size_t prev_len = prev->second - prev->first; + CHECK_EQ(1u, len_extents_.erase(pair{prev_len, prev->first})); + prev->second += len; + start = prev->first; + len += prev_len; + prev_extent = prev; } } - if (!merged) { - if (end == it->first) { // [start, end), [it->first, it->second] - len_extents_.erase(pair{it->second - it->first, it->first}); - end = it->second; + if (it != extents_.end()) { + DCHECK_GE(it->first, start + len); + if (start + len == it->first) { // merge with the next extent + size_t it_len = it->second - it->first; + CHECK_EQ(1u, len_extents_.erase(pair{it_len, it->first})); extents_.erase(it); + len += it_len; } - extents_.emplace(start, end); - len_extents_.emplace(end - start, start); + 
} + + len_extents_.emplace(len, start); + if (prev_extent) { + (*prev_extent)->second = start + len; + } else { + extents_.emplace(start, start + len); } } diff --git a/src/core/extent_tree_test.cc b/src/core/extent_tree_test.cc index 74f682a73f46..6a4ccc652659 100644 --- a/src/core/extent_tree_test.cc +++ b/src/core/extent_tree_test.cc @@ -28,28 +28,28 @@ TEST_F(ExtentTreeTest, Basic) { tree_.Add(0, 256); auto op = tree_.GetRange(64, 16); EXPECT_TRUE(op); - EXPECT_THAT(*op, testing::Pair(0, 64)); + EXPECT_THAT(*op, testing::Pair(0, 64)); // [64, 256) tree_.Add(56, 8); op = tree_.GetRange(64, 16); EXPECT_TRUE(op); - EXPECT_THAT(*op, testing::Pair(64, 128)); + EXPECT_THAT(*op, testing::Pair(64, 128)); // {[56, 64), [128, 256)} op = tree_.GetRange(18, 2); EXPECT_TRUE(op); - EXPECT_THAT(*op, testing::Pair(128, 146)); + EXPECT_THAT(*op, testing::Pair(128, 146)); // {[56, 64), [146, 256)} op = tree_.GetRange(80, 16); EXPECT_TRUE(op); - EXPECT_THAT(*op, testing::Pair(160, 240)); + EXPECT_THAT(*op, testing::Pair(160, 240)); // {[56, 64), [146, 160), [240, 256)} op = tree_.GetRange(4, 1); EXPECT_TRUE(op); - EXPECT_THAT(*op, testing::Pair(56, 60)); + EXPECT_THAT(*op, testing::Pair(56, 60)); // {[60, 64), [146, 160), [240, 256)} op = tree_.GetRange(32, 1); EXPECT_FALSE(op); - tree_.Add(64, 240 - 64); + tree_.Add(64, 146 - 64); op = tree_.GetRange(32, 4); EXPECT_TRUE(op); EXPECT_THAT(*op, testing::Pair(60, 92)); diff --git a/src/server/tiering/external_alloc.cc b/src/server/tiering/external_alloc.cc index fd8b3c8e0951..37f016c72a85 100644 --- a/src/server/tiering/external_alloc.cc +++ b/src/server/tiering/external_alloc.cc @@ -367,6 +367,12 @@ int64_t ExternalAllocator::Malloc(size_t sz) { } void ExternalAllocator::Free(size_t offset, size_t sz) { + if (sz > kMediumObjMaxSize) { + size_t align_sz = alignup(sz, 4_KB); + extent_tree_.Add(offset, align_sz); + return; + } + size_t idx = offset / 256_MB; size_t delta = offset % 256_MB; CHECK_LT(idx, segments_.size()); diff --git a/src/server/tiering/external_alloc_test.cc b/src/server/tiering/external_alloc_test.cc index dccd44140b7b..76db86dc0b02 100644 --- a/src/server/tiering/external_alloc_test.cc +++ b/src/server/tiering/external_alloc_test.cc @@ -146,4 +146,12 @@ TEST_F(ExternalAllocatorTest, EmptyFull) { EXPECT_GT(ext_alloc_.Malloc(kAllocSize), 0u); } +TEST_F(ExternalAllocatorTest, AllocLarge) { + ext_alloc_.AddStorage(0, kSegSize); + + off_t offs = ext_alloc_.Malloc(2_MB - 1); + EXPECT_EQ(offs, 0); + ext_alloc_.Free(offs, 2_MB - 1); +} + } // namespace dfly::tiering From 810af83074561a6bf954fb712f2a533239349773 Mon Sep 17 00:00:00 2001 From: adiholden Date: Wed, 1 Jan 2025 10:49:17 +0200 Subject: [PATCH 06/20] fix(server): debug populate consume less memory (#4384) * fix server: debug populate consume less memory Signed-off-by: adi_holden --- src/server/debugcmd.cc | 48 +++++++++++++++++++------------- tests/dragonfly/memory_test.py | 3 +- tests/dragonfly/snapshot_test.py | 14 +++------- 3 files changed, 34 insertions(+), 31 deletions(-) diff --git a/src/server/debugcmd.cc b/src/server/debugcmd.cc index 326fd569ee76..3c72e57514d4 100644 --- a/src/server/debugcmd.cc +++ b/src/server/debugcmd.cc @@ -124,12 +124,13 @@ tuple> GeneratePopulateCommand( args.push_back(GenerateValue(val_size, random_value, gen)); } } else if (type == "JSON") { - cid = registry.Find("JSON.SET"); + cid = registry.Find("JSON.MERGE"); args.push_back("$"); string json = "{"; for (size_t i = 0; i < elements; ++i) { - absl::StrAppend(&json, "\"", i, "\":\"", 
GenerateValue(val_size, random_value, gen), "\","); + absl::StrAppend(&json, "\"", GenerateValue(val_size / 2, random_value, gen), "\":\"", + GenerateValue(val_size / 2, random_value, gen), "\","); } json[json.size() - 1] = '}'; // Replace last ',' with '}' args.push_back(json); @@ -157,30 +158,37 @@ void DoPopulateBatch(string_view type, string_view prefix, size_t val_size, bool absl::InlinedVector args_view; facade::CapturingReplyBuilder crb; ConnectionContext local_cntx{cntx, stub_tx.get()}; - absl::InsecureBitGen gen; for (unsigned i = 0; i < batch.sz; ++i) { string key = absl::StrCat(prefix, ":", batch.index[i]); + uint32_t elements_left = elements; + + while (elements_left) { + // Limit RSS growth to ~32KB by limiting the element count in each command. + uint32_t max_batch_elements = std::max(32_KB / val_size, 1ULL); + uint32_t populate_elements = std::min(max_batch_elements, elements_left); + elements_left -= populate_elements; + auto [cid, args] = + GeneratePopulateCommand(type, key, val_size, random_value, populate_elements, + *sf->service().mutable_registry(), &gen); + if (!cid) { + LOG_EVERY_N(WARNING, 10'000) << "Unable to find command, was it renamed?"; + break; + } - auto [cid, args] = GeneratePopulateCommand(type, std::move(key), val_size, random_value, - elements, *sf->service().mutable_registry(), &gen); - if (!cid) { - LOG_EVERY_N(WARNING, 10'000) << "Unable to find command, was it renamed?"; - break; - } - - args_view.clear(); - for (auto& arg : args) { - args_view.push_back(arg); - } - auto args_span = absl::MakeSpan(args_view); + args_view.clear(); + for (auto& arg : args) { + args_view.push_back(arg); + } + auto args_span = absl::MakeSpan(args_view); - stub_tx->MultiSwitchCmd(cid); - local_cntx.cid = cid; - crb.SetReplyMode(ReplyMode::NONE); - stub_tx->InitByArgs(cntx->ns, local_cntx.conn_state.db_index, args_span); + stub_tx->MultiSwitchCmd(cid); + local_cntx.cid = cid; + crb.SetReplyMode(ReplyMode::NONE); + stub_tx->InitByArgs(cntx->ns, local_cntx.conn_state.db_index, args_span); - sf->service().InvokeCmd(cid, args_span, &crb, &local_cntx); + sf->service().InvokeCmd(cid, args_span, &crb, &local_cntx); + } } local_tx->UnlockMulti(); diff --git a/tests/dragonfly/memory_test.py b/tests/dragonfly/memory_test.py index 292275a2002c..37cd1131812a 100644 --- a/tests/dragonfly/memory_test.py +++ b/tests/dragonfly/memory_test.py @@ -17,7 +17,7 @@ ("ZSET", 250_000, 100, 100), ("LIST", 300_000, 100, 100), ("STRING", 3_500_000, 1000, 1), - ("STREAM", 260_000, 100, 100), + ("STREAM", 280_000, 100, 100), ], ) # We limit to 5gb just in case to sanity check the gh runner.
Otherwise, if we ask for too much @@ -69,6 +69,7 @@ async def check_memory(): await client.execute_command("DFLY", "LOAD", f"{dbfilename}-summary.dfs") await check_memory() + await client.execute_command("FLUSHALL") @pytest.mark.asyncio diff --git a/tests/dragonfly/snapshot_test.py b/tests/dragonfly/snapshot_test.py index 81fd7f7fc00a..5101388bebbf 100644 --- a/tests/dragonfly/snapshot_test.py +++ b/tests/dragonfly/snapshot_test.py @@ -569,12 +569,7 @@ async def test_tiered_entries_throttle(async_client: aioredis.Redis): @dfly_args({"serialization_max_chunk_size": 4096, "proactor_threads": 1}) @pytest.mark.parametrize( "cont_type", - [ - ("HASH"), - ("SET"), - ("ZSET"), - ("LIST"), - ], + [("HASH"), ("SET"), ("ZSET"), ("LIST"), ("STREAM")], ) @pytest.mark.slow async def test_big_value_serialization_memory_limit(df_factory, cont_type): @@ -590,17 +585,16 @@ async def test_big_value_serialization_memory_limit(df_factory, cont_type): await client.execute_command( f"debug populate 1 prefix {element_size} TYPE {cont_type} RAND ELEMENTS {elements}" ) + await asyncio.sleep(1) info = await client.info("ALL") - # rss double's because of DEBUG POPULATE - assert info["used_memory_peak_rss"] > (one_gb * 2) + assert info["used_memory_peak_rss"] < (one_gb * 1.2) # if we execute SAVE below without big value serialization we trigger the assertion below. # note the peak would reach (one_gb * 3) without it. await client.execute_command("SAVE") info = await client.info("ALL") - upper_limit = 2_250_000_000 # 2.25 GB - assert info["used_memory_peak_rss"] < upper_limit + assert info["used_memory_peak_rss"] < (one_gb * 1.3) await client.execute_command("FLUSHALL") await client.close() From c5b3584dfdfe8b8a4d42927fce0c8977e188f745 Mon Sep 17 00:00:00 2001 From: Borys Date: Wed, 1 Jan 2025 10:51:38 +0200 Subject: [PATCH 07/20] refactor: slot_set don't use stack memory anymore (#4386) --- src/server/cluster/slot_set.h | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/server/cluster/slot_set.h b/src/server/cluster/slot_set.h index de54631fdcc6..c1eb23a7faf2 100644 --- a/src/server/cluster/slot_set.h +++ b/src/server/cluster/slot_set.h @@ -18,22 +18,22 @@ class SlotSet { using TBitSet = std::bitset; SlotSet(bool full_house = false) { + slots_ = std::make_unique(); if (full_house) slots_->flip(); } SlotSet(const SlotRanges& slot_ranges) { + slots_ = std::make_unique(); Set(slot_ranges, true); } - SlotSet(const TBitSet& s) { - *slots_ = s; - } - SlotSet(const SlotSet& s) { - *slots_ = *s.slots_; + slots_ = std::make_unique(*s.slots_); } + SlotSet(SlotSet&& s) = default; + bool Contains(SlotId slot) const { return slots_->test(slot); } @@ -64,7 +64,11 @@ class SlotSet { // Get SlotSet that are absent in the slots SlotSet GetRemovedSlots(const SlotSet& slots) const { - return *slots_ & ~*slots.slots_; + // we need to avoid stack usage to prevent stack overflow + SlotSet res(slots); + res.slots_->flip(); + *res.slots_ &= *slots_; + return res; } SlotRanges ToSlotRanges() const { @@ -85,7 +89,12 @@ class SlotSet { } private: - std::unique_ptr slots_{std::make_unique()}; + SlotSet(std::unique_ptr s) { + slots_ = std::move(s); + } + + private: + std::unique_ptr slots_; }; } // namespace dfly::cluster From 413ec0a1cf1e9684c7d2e84092738e6d6fd5174a Mon Sep 17 00:00:00 2001 From: guyzilla Date: Wed, 1 Jan 2025 11:34:53 +0200 Subject: [PATCH 08/20] feat:Adding support for ZMPOP command (#4385) Signed-off-by: Guy Flysher --- src/facade/reply_builder.cc | 18 ++- 
src/facade/reply_builder.h | 5 +- src/facade/reply_builder_test.cc | 21 ++++ src/server/command_registry.cc | 2 + src/server/command_registry.h | 1 + src/server/zset_family.cc | 188 +++++++++++++++++++++++++++---- src/server/zset_family.h | 1 + src/server/zset_family_test.cc | 128 +++++++++++++++++++++ 8 files changed, 340 insertions(+), 24 deletions(-) diff --git a/src/facade/reply_builder.cc b/src/facade/reply_builder.cc index 9a909b902d12..d398ecbf8f98 100644 --- a/src/facade/reply_builder.cc +++ b/src/facade/reply_builder.cc @@ -408,8 +408,7 @@ void RedisReplyBuilder::SendBulkStrArr(const facade::ArgRange& strs, CollectionT SendBulkString(str); } -void RedisReplyBuilder::SendScoredArray(absl::Span> arr, - bool with_scores) { +void RedisReplyBuilder::SendScoredArray(ScoredArray arr, bool with_scores) { ReplyScope scope(this); StartArray((with_scores && !IsResp3()) ? arr.size() * 2 : arr.size()); for (const auto& [str, score] : arr) { @@ -421,6 +420,21 @@ void RedisReplyBuilder::SendScoredArray(absl::Span>; RedisReplyBuilder(io::Sink* sink) : RedisReplyBuilderBase(sink) { } @@ -281,8 +282,8 @@ class RedisReplyBuilder : public RedisReplyBuilderBase { void SendSimpleStrArr(const facade::ArgRange& strs); void SendBulkStrArr(const facade::ArgRange& strs, CollectionType ct = ARRAY); - void SendScoredArray(absl::Span> arr, bool with_scores); - + void SendScoredArray(ScoredArray arr, bool with_scores); + void SendLabeledScoredArray(std::string_view arr_label, ScoredArray arr); void SendStored() final; void SendSetSkipped() final; diff --git a/src/facade/reply_builder_test.cc b/src/facade/reply_builder_test.cc index 028d594eef9f..8560bcde5ee0 100644 --- a/src/facade/reply_builder_test.cc +++ b/src/facade/reply_builder_test.cc @@ -775,6 +775,27 @@ TEST_F(RedisReplyBuilderTest, SendScoredArray) { << "Resp3 WITHSCORES failed."; } +TEST_F(RedisReplyBuilderTest, SendLabeledScoredArray) { + const std::vector> scored_array{ + {"e1", 1.1}, {"e2", 2.2}, {"e3", 3.3}}; + + builder_->SetResp3(false); + builder_->SendLabeledScoredArray("foobar", scored_array); + ASSERT_TRUE(NoErrors()); + ASSERT_EQ(TakePayload(), + "*2\r\n$6\r\nfoobar\r\n*3\r\n*2\r\n$2\r\ne1\r\n$3\r\n1.1\r\n*2\r\n$2\r\ne2\r\n$3\r\n2." + "2\r\n*2\r\n$2\r\ne3\r\n$3\r\n3.3\r\n") + << "Resp3 failed.\n"; + + builder_->SetResp3(true); + builder_->SendLabeledScoredArray("foobar", scored_array); + ASSERT_TRUE(NoErrors()); + ASSERT_EQ(TakePayload(), + "*2\r\n$6\r\nfoobar\r\n*3\r\n*2\r\n$2\r\ne1\r\n,1.1\r\n*2\r\n$2\r\ne2\r\n,2.2\r\n*" + "2\r\n$2\r\ne3\r\n,3.3\r\n") + << "Resp3 failed."; +} + TEST_F(RedisReplyBuilderTest, BasicCapture) { GTEST_SKIP() << "Unmark when CaptuingReplyBuilder is updated"; diff --git a/src/server/command_registry.cc b/src/server/command_registry.cc index cce5354f8ffe..0777ad7d992e 100644 --- a/src/server/command_registry.cc +++ b/src/server/command_registry.cc @@ -283,6 +283,8 @@ const char* OptName(CO::CommandOpt fl) { return "no-key-tx-span-all"; case IDEMPOTENT: return "idempotent"; + case SLOW: + return "slow"; } return "unknown"; } diff --git a/src/server/command_registry.h b/src/server/command_registry.h index 3acc69c355bc..76f27117bef1 100644 --- a/src/server/command_registry.h +++ b/src/server/command_registry.h @@ -52,6 +52,7 @@ enum CommandOpt : uint32_t { // The same callback can be run multiple times without corrupting the result. Used for // opportunistic optimizations where inconsistencies can only be detected afterwards. IDEMPOTENT = 1U << 18, + SLOW = 1U << 19 // Unused? 
}; const char* OptName(CommandOpt fl); diff --git a/src/server/zset_family.cc b/src/server/zset_family.cc index 56b8a1777f07..fc440ad92037 100644 --- a/src/server/zset_family.cc +++ b/src/server/zset_family.cc @@ -67,10 +67,10 @@ struct GeoPoint { double dist; double score; std::string member; - GeoPoint() : longitude(0.0), latitude(0.0), dist(0.0), score(0.0){}; + GeoPoint() : longitude(0.0), latitude(0.0), dist(0.0), score(0.0) {}; GeoPoint(double _longitude, double _latitude, double _dist, double _score, const std::string& _member) - : longitude(_longitude), latitude(_latitude), dist(_dist), score(_score), member(_member){}; + : longitude(_longitude), latitude(_latitude), dist(_dist), score(_score), member(_member) {}; }; using GeoArray = std::vector; @@ -179,8 +179,7 @@ struct ZParams { bool override = false; }; -void OutputScoredArrayResult(const OpResult& result, - const ZSetFamily::RangeParams& params, SinkReplyBuilder* builder) { +void OutputScoredArrayResult(const OpResult& result, SinkReplyBuilder* builder) { if (result.status() == OpStatus::WRONG_TYPE) { return builder->SendError(kWrongTypeErr); } @@ -188,7 +187,7 @@ void OutputScoredArrayResult(const OpResult& result, LOG_IF(WARNING, !result && result.status() != OpStatus::KEY_NOTFOUND) << "Unexpected status " << result.status(); auto* rb = static_cast(builder); - rb->SendScoredArray(result.value(), params.with_scores); + rb->SendScoredArray(result.value(), true /* with scores */); } OpResult FindZEntry(const ZParams& zparams, const OpArgs& op_args, @@ -1821,31 +1820,47 @@ void ZBooleanOperation(CmdArgList args, string_view cmd, bool is_union, bool sto } } -void ZPopMinMax(CmdArgList args, bool reverse, Transaction* tx, SinkReplyBuilder* builder) { - string_view key = ArgS(args, 0); +enum class FilterShards { NO = 0, YES = 1 }; +OpResult ZPopMinMaxInternal(std::string_view key, FilterShards should_filter_shards, + uint32 count, bool reverse, Transaction* tx) { ZSetFamily::RangeParams range_params; range_params.reverse = reverse; range_params.with_scores = true; ZSetFamily::ZRangeSpec range_spec; range_spec.params = range_params; - ZSetFamily::TopNScored sc = 1; - if (args.size() > 1) { - string_view count = ArgS(args, 1); - if (!SimpleAtoi(count, &sc)) { - return builder->SendError(kUintErr); - } - } + range_spec.interval = count; - range_spec.interval = sc; + OpResult result; + std::optional key_shard; + if (should_filter_shards == FilterShards::YES) { + key_shard = Shard(key, shard_set->size()); + } auto cb = [&](Transaction* t, EngineShard* shard) { - return OpPopCount(range_spec, t->GetOpArgs(shard), key); + if (!key_shard.has_value() || *key_shard == shard->shard_id()) { + result = std::move(OpPopCount(range_spec, t->GetOpArgs(shard), key)); + } + return OpStatus::OK; }; - OpResult result = tx->ScheduleSingleHopT(std::move(cb)); - OutputScoredArrayResult(result, range_params, builder); + tx->Execute(std::move(cb), true); + + return result; +} + +void ZPopMinMaxFromArgs(CmdArgList args, bool reverse, Transaction* tx, SinkReplyBuilder* builder) { + string_view key = ArgS(args, 0); + uint32 count = 1; + if (args.size() > 1) { + string_view count_str = ArgS(args, 1); + if (!SimpleAtoi(count_str, &count)) { + return builder->SendError(kUintErr); + } + } + + OutputScoredArrayResult(ZPopMinMaxInternal(key, FilterShards::NO, count, reverse, tx), builder); } OpResult ZGetMembers(CmdArgList args, Transaction* tx, SinkReplyBuilder* builder) { @@ -2060,6 +2075,71 @@ void ZRemRangeGeneric(string_view key, const ZSetFamily::ZRangeSpec& 
range_spec, } } +// Returns the key of the first non-empty set found in the list of shard arguments. +// Returns nullopt if none. +std::optional GetFirstNonEmptyKeyFound(EngineShard* shard, Transaction* t) { + ShardArgs keys = t->GetShardArgs(shard->shard_id()); + DCHECK(!keys.Empty()); + + auto& db_slice = t->GetDbSlice(shard->shard_id()); + + for (string_view key : keys) { + auto it = db_slice.FindReadOnly(t->GetDbContext(), key, OBJ_ZSET); + if (!it) { + continue; + } + return std::optional(key); + } + + return std::nullopt; +} + +// Validates the ZMPop command arguments and extracts the values into the output params. +// If the arguments are invalid, sends the appropriate error to the builder and returns false. +bool ValidateZMPopCommand(CmdArgList args, uint32* num_keys, bool* is_max, int* pop_count, + SinkReplyBuilder* builder) { + CmdArgParser parser{args}; + + if (!SimpleAtoi(parser.Next(), num_keys)) { + builder->SendError(kUintErr); + return false; + } + + if (*num_keys <= 0 || !parser.HasAtLeast(*num_keys + 1)) { + // We should have at least num_keys keys + a MIN/MAX arg. + builder->SendError(kSyntaxErr); + return false; + } + // Skip over the keys themselves. + parser.Skip(*num_keys); + + // We know we have at least one more arg (we checked above). + if (parser.Check("MAX")) { + *is_max = true; + } else if (parser.Check("MIN")) { + *is_max = false; + } else { + builder->SendError(kSyntaxErr); + return false; + } + + *pop_count = 1; + // Check if we have additional COUNT argument. + if (parser.HasNext()) { + if (!parser.Check("COUNT", pop_count)) { + builder->SendError(kSyntaxErr); + return false; + } + } + + if (!parser.Finalize()) { + builder->SendError(parser.Error()->MakeReply()); + return false; + } + + return true; +} + } // namespace void ZSetFamily::BZPopMin(CmdArgList args, const CommandContext& cmd_cntx) { @@ -2355,12 +2435,77 @@ void ZSetFamily::ZInterCard(CmdArgList args, const CommandContext& cmd_cntx) { builder->SendLong(result.value().size()); } +void ZSetFamily::ZMPop(CmdArgList args, const CommandContext& cmd_cntx) { + uint32 num_keys; + bool is_max; + int pop_count; + if (!ValidateZMPopCommand(args, &num_keys, &is_max, &pop_count, cmd_cntx.rb)) { + return; + } + auto* response_builder = static_cast(cmd_cntx.rb); + + // From the list of input keys, keep the first (in the order of keys in the command) key found in + // the current shard. + std::vector> first_found_key_per_shard_vec(shard_set->size(), + std::nullopt); + + auto cb = [&](Transaction* t, EngineShard* shard) { + std::optional result = GetFirstNonEmptyKeyFound(shard, t); + if (result.has_value()) { + first_found_key_per_shard_vec[shard->shard_id()] = result; + } + return OpStatus::OK; + }; + + cmd_cntx.tx->Execute(std::move(cb), false /* possibly another hop */); + + // Keep all the keys found (first only for each shard) in a set for fast lookups. + absl::flat_hash_set first_found_keys_for_shard; + // We can have at most one result from each shard. + first_found_keys_for_shard.reserve(std::min(shard_set->size(), num_keys)); + for (const auto& key : first_found_key_per_shard_vec) { + if (!key.has_value()) { + continue; + } + first_found_keys_for_shard.insert(*key); + } + + // Now that we have the first non-empty key from each shard, find the overall first key and + // pop elements from it. + std::optional key_to_pop = std::nullopt; + ArgRange arg_keys(args.subspan(1, num_keys)); + // Find the first arg_key which exists in any shard and is not empty.
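+ // Checking the keys in command order preserves ZMPOP's left-to-right key priority.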
+ for (std::string_view key : arg_keys) { + if (first_found_keys_for_shard.contains(key)) { + key_to_pop = key; + break; + } + } + + if (!key_to_pop.has_value()) { + cmd_cntx.tx->Conclude(); + response_builder->SendNull(); + return; + } + + // Pop elements from the relevant set. + OpResult pop_result = + ZPopMinMaxInternal(*key_to_pop, FilterShards::YES, pop_count, is_max, cmd_cntx.tx); + + if (pop_result.status() == OpStatus::WRONG_TYPE) { + return response_builder->SendError(kWrongTypeErr); + } + + LOG_IF(WARNING, !pop_result) << "Unexpected status " << pop_result.status(); + response_builder->SendLabeledScoredArray(*key_to_pop, pop_result.value()); +} + void ZSetFamily::ZPopMax(CmdArgList args, const CommandContext& cmd_cntx) { - ZPopMinMax(std::move(args), true, cmd_cntx.tx, cmd_cntx.rb); + ZPopMinMaxFromArgs(std::move(args), true, cmd_cntx.tx, cmd_cntx.rb); } void ZSetFamily::ZPopMin(CmdArgList args, const CommandContext& cmd_cntx) { - ZPopMinMax(std::move(args), false, cmd_cntx.tx, cmd_cntx.rb); + ZPopMinMaxFromArgs(std::move(args), false, cmd_cntx.tx, cmd_cntx.rb); } void ZSetFamily::ZLexCount(CmdArgList args, const CommandContext& cmd_cntx) { @@ -3217,6 +3362,7 @@ constexpr uint32_t kZInterStore = WRITE | SORTEDSET | SLOW; constexpr uint32_t kZInter = READ | SORTEDSET | SLOW; constexpr uint32_t kZInterCard = WRITE | SORTEDSET | SLOW; constexpr uint32_t kZLexCount = READ | SORTEDSET | FAST; +constexpr uint32_t kZMPop = WRITE | SORTEDSET | SLOW; constexpr uint32_t kZPopMax = WRITE | SORTEDSET | FAST; constexpr uint32_t kZPopMin = WRITE | SORTEDSET | FAST; constexpr uint32_t kZRem = WRITE | SORTEDSET | FAST; @@ -3267,6 +3413,8 @@ void ZSetFamily::Register(CommandRegistry* registry) { << CI{"ZINTERCARD", CO::READONLY | CO::VARIADIC_KEYS, -3, 2, 2, acl::kZInterCard}.HFUNC( ZInterCard) << CI{"ZLEXCOUNT", CO::READONLY, 4, 1, 1, acl::kZLexCount}.HFUNC(ZLexCount) + << CI{"ZMPOP", CO::SLOW | CO::WRITE | CO::VARIADIC_KEYS, -4, 2, 2, acl::kZMPop}.HFUNC(ZMPop) + + << CI{"ZPOPMAX", CO::FAST | CO::WRITE, -2, 1, 1, acl::kZPopMax}.HFUNC(ZPopMax) << CI{"ZPOPMIN", CO::FAST | CO::WRITE, -2, 1, 1, acl::kZPopMin}.HFUNC(ZPopMin) << CI{"ZREM", CO::FAST | CO::WRITE, -3, 1, 1, acl::kZRem}.HFUNC(ZRem) diff --git a/src/server/zset_family.h b/src/server/zset_family.h index ec678597a791..17d4eceb24ad 100644 --- a/src/server/zset_family.h +++ b/src/server/zset_family.h @@ -72,6 +72,7 @@ class ZSetFamily { static void ZInter(CmdArgList args, const CommandContext& cmd_cntx); static void ZInterCard(CmdArgList args, const CommandContext& cmd_cntx); static void ZLexCount(CmdArgList args, const CommandContext& cmd_cntx); + static void ZMPop(CmdArgList args, const CommandContext& cmd_cntx); static void ZPopMax(CmdArgList args, const CommandContext& cmd_cntx); static void ZPopMin(CmdArgList args, const CommandContext& cmd_cntx); static void ZRange(CmdArgList args, const CommandContext& cmd_cntx); diff --git a/src/server/zset_family_test.cc b/src/server/zset_family_test.cc index 140488741de7..e38a364fbabf 100644 --- a/src/server/zset_family_test.cc +++ b/src/server/zset_family_test.cc @@ -81,6 +81,32 @@ MATCHER_P(UnorderedScoredElementsAreMatcher, elements_list, "") { elements_list.end()); } +MATCHER_P2(ContainsLabeledScoredArrayMatcher, label, elements, "") { + auto label_vec = arg.GetVec(); + if (label_vec.size() != 2) { + *result_listener << "Labeled Scored Array does not contain two elements."; + return false; + } + + if (!ExplainMatchResult(Eq(label), label_vec[0].GetString(), result_listener)) { + return false; + } + +
auto value_pairs_vec = label_vec[1].GetVec(); + std::set> actual_elements; + for (const auto& scored_element : value_pairs_vec) { + actual_elements.insert(std::make_pair(scored_element.GetVec()[0].GetString(), + scored_element.GetVec()[1].GetString())); + } + if (actual_elements != elements) { + *result_listener << "Scored elements do not match: "; + ExplainMatchResult(ElementsAreArray(elements), actual_elements, result_listener); + return false; + } + + return true; +} + auto ConsistsOf(std::initializer_list elements) { return ConsistsOfMatcher(std::unordered_set{elements}); } @@ -98,6 +124,12 @@ auto UnorderedScoredElementsAre( return UnorderedScoredElementsAreMatcher(elements); } +auto ContainsLabeledScoredArray( + std::string_view label, std::initializer_list> elements) { + return ContainsLabeledScoredArrayMatcher(label, + std::set>{elements}); +} + TEST_F(ZSetFamilyTest, Add) { auto resp = Run({"zadd", "x", "1.1", "a"}); EXPECT_THAT(resp, IntArg(1)); @@ -757,6 +789,102 @@ TEST_F(ZSetFamilyTest, ZAddBug148) { EXPECT_THAT(resp, IntArg(1)); } +TEST_F(ZSetFamilyTest, ZMPopInvalidSyntax) { + // Not enough arguments. + auto resp = Run({"zmpop", "1", "a"}); + EXPECT_THAT(resp, ErrArg("wrong number of arguments")); + + // Zero keys. + resp = Run({"zmpop", "0", "MIN", "COUNT", "1"}); + EXPECT_THAT(resp, ErrArg("syntax error")); + + // Number of keys not uint. + resp = Run({"zmpop", "aa", "a", "MIN"}); + EXPECT_THAT(resp, ErrArg("value is not an integer or out of range")); + + // Missing MIN/MAX. + resp = Run({"zmpop", "1", "a", "COUNT", "1"}); + EXPECT_THAT(resp, ErrArg("syntax error")); + + // Wrong number of keys. + resp = Run({"zmpop", "1", "a", "b", "MAX"}); + EXPECT_THAT(resp, ErrArg("syntax error")); + + // Count with no number. + resp = Run({"zmpop", "1", "a", "MAX", "COUNT"}); + EXPECT_THAT(resp, ErrArg("syntax error")); + + // Count number is not uint. + resp = Run({"zmpop", "1", "a", "MIN", "COUNT", "boo"}); + EXPECT_THAT(resp, ErrArg("value is not an integer or out of range")); + + // Too many arguments. + resp = Run({"zmpop", "1", "c", "MAX", "COUNT", "2", "foo"}); + EXPECT_THAT(resp, ErrArg("syntax error")); +} + +TEST_F(ZSetFamilyTest, ZMPop) { + // All sets are empty. + auto resp = Run({"zmpop", "1", "e", "MIN"}); + EXPECT_THAT(resp, ArgType(RespExpr::NIL)); + + // Min operation. + resp = Run({"zadd", "a", "1", "a1", "2", "a2"}); + EXPECT_THAT(resp, IntArg(2)); + + resp = Run({"zmpop", "1", "a", "MIN"}); + EXPECT_THAT(resp, ContainsLabeledScoredArray("a", {{"a1", "1"}})); + + resp = Run({"ZRANGE", "a", "0", "-1", "WITHSCORES"}); + EXPECT_THAT(resp, RespArray(ElementsAre("a2", "2"))); + + // Max operation. + resp = Run({"zadd", "b", "1", "b1", "2", "b2"}); + EXPECT_THAT(resp, IntArg(2)); + + resp = Run({"zmpop", "1", "b", "MAX"}); + EXPECT_THAT(resp, ContainsLabeledScoredArray("b", {{"b2", "2"}})); + + resp = Run({"ZRANGE", "b", "0", "-1", "WITHSCORES"}); + EXPECT_THAT(resp, RespArray(ElementsAre("b1", "1"))); + + // Count > 1. + resp = Run({"zadd", "c", "1", "c1", "2", "c2"}); + EXPECT_THAT(resp, IntArg(2)); + + resp = Run({"zmpop", "1", "c", "MAX", "COUNT", "2"}); + EXPECT_THAT(resp, ContainsLabeledScoredArray("c", {{"c1", "1"}, {"c2", "2"}})); + + resp = Run({"zcard", "c"}); + EXPECT_THAT(resp, IntArg(0)); + + // Count > #elements in set. 
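+ // A COUNT larger than the set cardinality pops all remaining elements and removes the key.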
+ resp = Run({"zadd", "d", "1", "d1", "2", "d2"}); + EXPECT_THAT(resp, IntArg(2)); + + resp = Run({"zmpop", "1", "d", "MAX", "COUNT", "3"}); + EXPECT_THAT(resp, ContainsLabeledScoredArray("d", {{"d1", "1"}, {"d2", "2"}})); + + resp = Run({"zcard", "d"}); + EXPECT_THAT(resp, IntArg(0)); + + // First non empty set is not the first set. + resp = Run({"zadd", "x", "1", "x1"}); + EXPECT_THAT(resp, IntArg(1)); + + resp = Run({"zadd", "y", "1", "y1"}); + EXPECT_THAT(resp, IntArg(1)); + + resp = Run({"zmpop", "3", "empty", "x", "y", "MAX"}); + EXPECT_THAT(resp, ContainsLabeledScoredArray("x", {{"x1", "1"}})); + + resp = Run({"zcard", "x"}); + EXPECT_THAT(resp, IntArg(0)); + + resp = Run({"ZRANGE", "y", "0", "-1", "WITHSCORES"}); + EXPECT_THAT(resp, RespArray(ElementsAre("y1", "1"))); +} + TEST_F(ZSetFamilyTest, ZPopMin) { auto resp = Run({"zadd", "key", "1", "a", "2", "b", "3", "c", "4", "d", "5", "e", "6", "f"}); EXPECT_THAT(resp, IntArg(6)); From 3b082e42b8b33253c22b1f4f95809983edd6af89 Mon Sep 17 00:00:00 2001 From: adiholden Date: Thu, 2 Jan 2025 09:39:31 +0200 Subject: [PATCH 09/20] fix(replication): do not log to journal on callback fail (#4392) fix replication: do not log to journal on callback fail Signed-off-by: adi_holden --- src/server/transaction.cc | 10 +++++++--- src/server/transaction.h | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/server/transaction.cc b/src/server/transaction.cc index 3e180f871a49..36372a2a4e39 100644 --- a/src/server/transaction.cc +++ b/src/server/transaction.cc @@ -704,7 +704,7 @@ void Transaction::RunCallback(EngineShard* shard) { // Log to journal only once the command finished running if ((coordinator_state_ & COORD_CONCLUDING) || (multi_ && multi_->concluding)) { - LogAutoJournalOnShard(shard); + LogAutoJournalOnShard(shard, result); MaybeInvokeTrackingCb(); } } @@ -1346,7 +1346,7 @@ OpStatus Transaction::RunSquashedMultiCb(RunnableType cb) { auto result = cb(this, shard); db_slice.OnCbFinish(); - LogAutoJournalOnShard(shard); + LogAutoJournalOnShard(shard, result); MaybeInvokeTrackingCb(); DCHECK_EQ(result.flags, 0); // if it's sophisticated, we shouldn't squash it @@ -1438,7 +1438,7 @@ optional Transaction::GetWakeKey(ShardId sid) const { return ArgS(full_args_, sd.wake_key_pos); } -void Transaction::LogAutoJournalOnShard(EngineShard* shard) { +void Transaction::LogAutoJournalOnShard(EngineShard* shard, RunnableResult result) { // TODO: For now, we ignore non shard coordination. if (shard == nullptr) return; @@ -1455,6 +1455,10 @@ void Transaction::LogAutoJournalOnShard(EngineShard* shard) { if (journal == nullptr) return; + if (result.status != OpStatus::OK) { + return; // Do not log to journal if command execution failed. + } + // If autojournaling was disabled and not re-enabled the callback is writing to journal. if ((cid_->opt_mask() & CO::NO_AUTOJOURNAL) && !re_enabled_auto_journal_) { return; diff --git a/src/server/transaction.h b/src/server/transaction.h index c0eb37ff8042..5e209eb96d98 100644 --- a/src/server/transaction.h +++ b/src/server/transaction.h @@ -559,7 +559,7 @@ class Transaction { // Log command in shard's journal, if this is a write command with auto-journaling enabled. // Should be called immediately after the last hop. 
- void LogAutoJournalOnShard(EngineShard* shard); + void LogAutoJournalOnShard(EngineShard* shard, RunnableResult shard_result); // Whether the callback can be run directly on this thread without dispatching on the shard queue bool CanRunInlined() const; From 7a6852802277f6cb2fa74b28bb6ff74668b67c9f Mon Sep 17 00:00:00 2001 From: Roman Gershman Date: Thu, 2 Jan 2025 11:35:55 +0200 Subject: [PATCH 10/20] chore: minor refactorings around dense_set deletions (#4390) chore: refactorings around deletions Done as a preparation to introduce asynchronous deletions for sets/zsets/hmaps. 1. Restrict the interface around DbSlice::Del. Now it requires the iterator to be valid, and the checks should be explicit before the call. Most callers already provide a valid iterator. 2. Some minor refactoring in compact_object_test. 3. Expose DenseSet::ClearStep to allow iterative deletions of elements. Signed-off-by: Roman Gershman --- src/core/compact_object.cc | 4 +- src/core/compact_object_test.cc | 39 ++++++++++++------- src/core/dense_set.cc | 5 ++- src/core/dense_set.h | 11 +++--- src/server/db_slice.cc | 10 ++--- src/server/db_slice.h | 3 +- src/server/engine_shard.cc | 5 +-- src/server/generic_family.cc | 69 +++++++++++++++++++-------------- src/server/generic_family.h | 1 + src/server/hset_family.cc | 19 +++++---- src/server/json_family.cc | 7 +++- src/server/list_family.cc | 10 ++--- src/server/set_family.cc | 12 +++--- src/server/zset_family.cc | 12 +++--- 14 files changed, 115 insertions(+), 92 deletions(-) diff --git a/src/core/compact_object.cc b/src/core/compact_object.cc index d30f6f44cf64..261a7f4bdfbd 100644 --- a/src/core/compact_object.cc +++ b/src/core/compact_object.cc @@ -791,8 +791,8 @@ uint64_t CompactObj::HashCode() const { } if (encoded) { - GetString(&tl.tmp_str); - return XXH3_64bits_withSeed(tl.tmp_str.data(), tl.tmp_str.size(), kHashSeed); + string_view sv = GetSlice(&tl.tmp_str); + return XXH3_64bits_withSeed(sv.data(), sv.size(), kHashSeed); } switch (taglen_) { diff --git a/src/core/compact_object_test.cc b/src/core/compact_object_test.cc index 207e67f1cfd4..eedce4fee71f 100644 --- a/src/core/compact_object_test.cc +++ b/src/core/compact_object_test.cc @@ -76,29 +76,38 @@ void DeallocateAtRandom(size_t steps, std::vector* ptrs) { } } +static void InitThreadStructs() { + auto* tlh = mi_heap_get_backing(); + init_zmalloc_threadlocal(tlh); + SmallString::InitThreadLocal(tlh); + thread_local MiMemoryResource mi_resource(tlh); + CompactObj::InitThreadLocal(&mi_resource); +}; + +static void CheckEverythingDeallocated() { + mi_heap_collect(mi_heap_get_backing(), true); + + auto cb_visit = [](const mi_heap_t* heap, const mi_heap_area_t* area, void* block, + size_t block_size, void* arg) { + LOG(ERROR) << "Unfreed allocations: block_size " << block_size + << ", allocated: " << area->used * block_size; + return true; + }; + + mi_heap_visit_blocks(mi_heap_get_backing(), false /* do not visit all blocks*/, cb_visit, + nullptr); +} + class CompactObjectTest : public ::testing::Test { protected: static void SetUpTestSuite() { InitRedisTables(); // to initialize server struct.
- auto* tlh = mi_heap_get_backing(); - init_zmalloc_threadlocal(tlh); - SmallString::InitThreadLocal(tlh); - CompactObj::InitThreadLocal(PMR_NS::get_default_resource()); + InitThreadStructs(); } static void TearDownTestSuite() { - mi_heap_collect(mi_heap_get_backing(), true); - - auto cb_visit = [](const mi_heap_t* heap, const mi_heap_area_t* area, void* block, - size_t block_size, void* arg) { - LOG(ERROR) << "Unfreed allocations: block_size " << block_size - << ", allocated: " << area->used * block_size; - return true; - }; - - mi_heap_visit_blocks(mi_heap_get_backing(), false /* do not visit all blocks*/, cb_visit, - nullptr); + CheckEverythingDeallocated(); } CompactObj cobj_; diff --git a/src/core/dense_set.cc b/src/core/dense_set.cc index 0b8b7479d4d9..9750ded2a061 100644 --- a/src/core/dense_set.cc +++ b/src/core/dense_set.cc @@ -168,14 +168,14 @@ auto DenseSet::PopPtrFront(DenseSet::ChainVectorIterator it) -> DensePtr { return front; } -uint32_t DenseSet::ClearInternal(uint32_t start, uint32_t count) { +uint32_t DenseSet::ClearStep(uint32_t start, uint32_t count) { constexpr unsigned kArrLen = 32; ClearItem arr[kArrLen]; unsigned len = 0; size_t end = min(entries_.size(), start + count); for (size_t i = start; i < end; ++i) { - DensePtr ptr = entries_[i]; + DensePtr& ptr = entries_[i]; if (ptr.IsEmpty()) continue; @@ -190,6 +190,7 @@ uint32_t DenseSet::ClearInternal(uint32_t start, uint32_t count) { dest.ptr = ptr; dest.obj = nullptr; } + ptr.Reset(); if (len == kArrLen) { ClearBatch(kArrLen, arr); len = 0; diff --git a/src/core/dense_set.h b/src/core/dense_set.h index 1cc5ac135b78..0ac73a47bc0c 100644 --- a/src/core/dense_set.h +++ b/src/core/dense_set.h @@ -215,9 +215,13 @@ class DenseSet { virtual ~DenseSet(); void Clear() { - ClearInternal(0, entries_.size()); + ClearStep(0, entries_.size()); } + // Returns the next bucket index that should be cleared. + // Returns BucketCount when all objects are erased. + uint32_t ClearStep(uint32_t start, uint32_t count); + // Returns the number of elements in the map. Note that it might be that some of these elements // have expired and can't be accessed. size_t UpperBoundSize() const { @@ -303,11 +307,6 @@ class DenseSet { void* PopInternal(); - // Note this does not free any dynamic allocations done by derived classes, that a DensePtr - // in the set may point to. This function only frees the allocated DenseLinkKeys created by - // DenseSet. 
All data allocated by a derived class should be freed before calling this - uint32_t ClearInternal(uint32_t start, uint32_t count); - void IncreaseMallocUsed(size_t delta) { obj_malloc_used_ += delta; } diff --git a/src/server/db_slice.cc b/src/server/db_slice.cc index 88f18696b555..90c10a54f624 100644 --- a/src/server/db_slice.cc +++ b/src/server/db_slice.cc @@ -669,10 +669,8 @@ void DbSlice::ActivateDb(DbIndex db_ind) { CreateDb(db_ind); } -bool DbSlice::Del(Context cntx, Iterator it) { - if (!IsValid(it)) { - return false; - } +void DbSlice::Del(Context cntx, Iterator it) { + CHECK(IsValid(it)); auto& db = db_arr_[cntx.db_index]; auto obj_type = it->second.ObjType(); @@ -683,8 +681,6 @@ bool DbSlice::Del(Context cntx, Iterator it) { doc_del_cb_(key, cntx, it->second); } PerformDeletion(it, db.get()); - - return true; } void DbSlice::FlushSlotsFb(const cluster::SlotSet& slot_ids) { @@ -917,7 +913,7 @@ OpResult DbSlice::UpdateExpire(const Context& cntx, Iterator prime_it, } if (rel_msec <= 0) { // implicit - don't persist - CHECK(Del(cntx, prime_it)); + Del(cntx, prime_it); return -1; } else if (IsValid(expire_it) && !params.persist) { auto current = ExpireTime(expire_it); diff --git a/src/server/db_slice.h b/src/server/db_slice.h index 7b56ee8ca757..263fa26b318c 100644 --- a/src/server/db_slice.h +++ b/src/server/db_slice.h @@ -346,7 +346,8 @@ class DbSlice { // Delete a key referred by its iterator. void PerformDeletion(Iterator del_it, DbTable* table); - bool Del(Context cntx, Iterator it); + // Deletes the iterator. The iterator must be valid. + void Del(Context cntx, Iterator it); constexpr static DbIndex kDbAll = 0xFFFF; diff --git a/src/server/engine_shard.cc b/src/server/engine_shard.cc index 5396b8f6f50d..f637f1a116a2 100644 --- a/src/server/engine_shard.cc +++ b/src/server/engine_shard.cc @@ -426,12 +426,11 @@ bool EngineShard::DoDefrag() { // priority. 
// otherwise lower the task priority so that it would not use the CPU when not required uint32_t EngineShard::DefragTask() { + constexpr uint32_t kRunAtLowPriority = 0u; if (!namespaces) { - return util::ProactorBase::kOnIdleMaxLevel; + return kRunAtLowPriority; } - constexpr uint32_t kRunAtLowPriority = 0u; - if (defrag_state_.CheckRequired()) { VLOG(2) << shard_id_ << ": need to run defrag memory cursor state: " << defrag_state_.cursor; if (DoDefrag()) { diff --git a/src/server/generic_family.cc b/src/server/generic_family.cc index 94a0319a004e..1e0cb750834f 100644 --- a/src/server/generic_family.cc +++ b/src/server/generic_family.cc @@ -440,7 +440,7 @@ OpStatus Renamer::DelSrc(Transaction* t, EngineShard* shard) { DVLOG(1) << "Rename: removing the key '" << src_key_; res.post_updater.Run(); - CHECK(db_slice.Del(t->GetDbContext(), it)); + db_slice.Del(t->GetDbContext(), it); if (shard->journal()) { RecordJournal(t->GetOpArgs(shard), "DEL"sv, ArgSlice{src_key_}, 2); } @@ -462,7 +462,7 @@ OpStatus Renamer::DeserializeDest(Transaction* t, EngineShard* shard) { if (dest_found_) { DVLOG(1) << "Rename: deleting the destiny key '" << dest_key_; dest_res.post_updater.Run(); - CHECK(db_slice.Del(op_args.db_cntx, dest_res.it)); + db_slice.Del(op_args.db_cntx, dest_res.it); } if (restore_args.Expired()) { @@ -554,7 +554,7 @@ OpResult OpRestore(const OpArgs& op_args, std::string_view key, std::strin VLOG(1) << "restore command is running with replace, found old key '" << key << "' and removing it"; res.post_updater.Run(); - CHECK(db_slice.Del(op_args.db_cntx, res.it)); + db_slice.Del(op_args.db_cntx, res.it); } else { // we are not allowed to replace it. return OpStatus::KEY_EXISTS; @@ -812,7 +812,7 @@ OpStatus OpMove(const OpArgs& op_args, string_view key, DbIndex target_db) { // Restore expire flag after std::move. from_res.it->second.SetExpire(IsValid(from_res.exp_it)); - CHECK(db_slice.Del(op_args.db_cntx, from_res.it)); + db_slice.Del(op_args.db_cntx, from_res.it); auto op_result = db_slice.AddNew(target_cntx, key, std::move(from_obj), exp_ts); RETURN_ON_BAD_STATUS(op_result); auto& add_res = *op_result; @@ -868,13 +868,13 @@ OpResult OpRen(const OpArgs& op_args, string_view from_key, string_view to to_res.post_updater.Run(); from_res.post_updater.Run(); - CHECK(db_slice.Del(op_args.db_cntx, from_res.it)); + db_slice.Del(op_args.db_cntx, from_res.it); } else { // Here we first delete from_it because AddNew below could invalidate from_it. // On the other hand, AddNew does not rely on the iterators - this is why we keep // the value in `from_obj`. 
from_res.post_updater.Run(); - CHECK(db_slice.Del(op_args.db_cntx, from_res.it)); + db_slice.Del(op_args.db_cntx, from_res.it); auto op_result = db_slice.AddNew(op_args.db_cntx, to_key, std::move(from_obj), exp_ts); RETURN_ON_BAD_STATUS(op_result); to_res = std::move(*op_result); @@ -995,35 +995,14 @@ std::optional ParseExpireOptionsOrReply(const CmdArgList args, SinkRepl return flags; } -} // namespace - -OpResult GenericFamily::OpDel(const OpArgs& op_args, const ShardArgs& keys) { - DVLOG(1) << "Del: " << keys.Front(); - auto& db_slice = op_args.GetDbSlice(); - - uint32_t res = 0; - - for (string_view key : keys) { - auto fres = db_slice.FindMutable(op_args.db_cntx, key); - if (!IsValid(fres.it)) - continue; - fres.post_updater.Run(); - res += int(db_slice.Del(op_args.db_cntx, fres.it)); - } - - return res; -} - -void GenericFamily::Del(CmdArgList args, const CommandContext& cmd_cntx) { - VLOG(1) << "Del " << ArgS(args, 0); - +void DeleteGeneric(CmdArgList args, const CommandContext& cmd_cntx) { atomic_uint32_t result{0}; auto* builder = cmd_cntx.rb; bool is_mc = (builder->GetProtocol() == Protocol::MEMCACHE); auto cb = [&result](const Transaction* t, EngineShard* shard) { ShardArgs args = t->GetShardArgs(shard->shard_id()); - auto res = OpDel(t->GetOpArgs(shard), args); + auto res = GenericFamily::OpDel(t->GetOpArgs(shard), args); result.fetch_add(res.value_or(0), memory_order_relaxed); return OpStatus::OK; @@ -1049,6 +1028,36 @@ void GenericFamily::Del(CmdArgList args, const CommandContext& cmd_cntx) { } } +} // namespace + +OpResult GenericFamily::OpDel(const OpArgs& op_args, const ShardArgs& keys) { + DVLOG(1) << "Del: " << keys.Front(); + auto& db_slice = op_args.GetDbSlice(); + + uint32_t res = 0; + + for (string_view key : keys) { + auto it = db_slice.FindMutable(op_args.db_cntx, key).it; // post_updater will run immediately + if (!IsValid(it)) + continue; + + db_slice.Del(op_args.db_cntx, it); + ++res; + } + + return res; +} + +void GenericFamily::Del(CmdArgList args, const CommandContext& cmd_cntx) { + VLOG(1) << "Del " << ArgS(args, 0); + + DeleteGeneric(args, cmd_cntx); +} + +void GenericFamily::Unlink(CmdArgList args, const CommandContext& cmd_cntx) { + DeleteGeneric(args, cmd_cntx); +} + void GenericFamily::Ping(CmdArgList args, const CommandContext& cmd_cntx) { auto* rb = static_cast(cmd_cntx.rb); if (args.size() > 1) { @@ -1886,7 +1895,7 @@ void GenericFamily::Register(CommandRegistry* registry) { << CI{"TIME", CO::LOADING | CO::FAST, 1, 0, 0, acl::kTime}.HFUNC(Time) << CI{"TYPE", CO::READONLY | CO::FAST | CO::LOADING, 2, 1, 1, acl::kType}.HFUNC(Type) << CI{"DUMP", CO::READONLY, 2, 1, 1, acl::kDump}.HFUNC(Dump) - << CI{"UNLINK", CO::WRITE, -2, 1, -1, acl::kUnlink}.HFUNC(Del) + << CI{"UNLINK", CO::WRITE, -2, 1, -1, acl::kUnlink}.HFUNC(Unlink) << CI{"STICK", CO::WRITE, -2, 1, -1, acl::kStick}.HFUNC(Stick) << CI{"SORT", CO::READONLY, -2, 1, 1, acl::kSort}.HFUNC(Sort) << CI{"MOVE", CO::WRITE | CO::GLOBAL_TRANS | CO::NO_AUTOJOURNAL, 3, 1, 1, acl::kMove}.HFUNC( diff --git a/src/server/generic_family.h b/src/server/generic_family.h index 40af99bedfff..0fb231ec80eb 100644 --- a/src/server/generic_family.h +++ b/src/server/generic_family.h @@ -35,6 +35,7 @@ class GenericFamily { using SinkReplyBuilder = facade::SinkReplyBuilder; static void Del(CmdArgList args, const CommandContext& cmd_cntx); + static void Unlink(CmdArgList args, const CommandContext& cmd_cntx); static void Ping(CmdArgList args, const CommandContext& cmd_cntx); static void Exists(CmdArgList args, const 
CommandContext& cmd_cntx); static void Expire(CmdArgList args, const CommandContext& cmd_cntx); diff --git a/src/server/hset_family.cc b/src/server/hset_family.cc index 62f001603a63..8361a514a090 100644 --- a/src/server/hset_family.cc +++ b/src/server/hset_family.cc @@ -585,10 +585,10 @@ OpResult> OpGetAll(const OpArgs& op_args, string_view key, uint8_ // and the enconding is guaranteed to be a DenseSet since we only support expiring // value with that enconding. if (res.empty()) { - auto mutable_res = db_slice.FindMutable(op_args.db_cntx, key, OBJ_HASH); - // Run postupdater, it means that we deleted the keys - mutable_res->post_updater.Run(); - db_slice.Del(op_args.db_cntx, mutable_res->it); + // post_updater will run immediately + auto it = db_slice.FindMutable(op_args.db_cntx, key).it; + + db_slice.Del(op_args.db_cntx, it); } return res; @@ -1169,10 +1169,10 @@ void HSetFamily::HRandField(CmdArgList args, const CommandContext& cmd_cntx) { } } - if (string_map->Empty()) { - auto it_mutable = db_slice.FindMutable(db_context, key, OBJ_HASH); - it_mutable->post_updater.Run(); - db_slice.Del(db_context, it_mutable->it); + if (string_map->Empty()) { // Can happen if we use a TTL on hash members. + // post_updater will run immediately + auto it = db_slice.FindMutable(db_context, key).it; + db_slice.Del(db_context, it); return facade::OpStatus::KEY_NOTFOUND; } } else if (pv.Encoding() == kEncodingListPack) { @@ -1207,8 +1207,7 @@ void HSetFamily::HRandField(CmdArgList args, const CommandContext& cmd_cntx) { } } } else { - LOG(ERROR) << "Invalid encoding " << pv.Encoding(); - return OpStatus::INVALID_VALUE; + LOG(FATAL) << "Invalid encoding " << pv.Encoding(); } return str_vec; }; diff --git a/src/server/json_family.cc b/src/server/json_family.cc index 45e91396cd45..4b070c236e2b 100644 --- a/src/server/json_family.cc +++ b/src/server/json_family.cc @@ -913,8 +913,13 @@ OpResult OpDel(const OpArgs& op_args, string_view key, string_view path, if (json_path.RefersToRootElement()) { auto& db_slice = op_args.GetDbSlice(); auto it = db_slice.FindMutable(op_args.db_cntx, key).it; // post_updater will run immediately - return static_cast(db_slice.Del(op_args.db_cntx, it)); + if (IsValid(it)) { + db_slice.Del(op_args.db_cntx, it); + return 1; + } + return 0; } + JsonMemTracker tracker; // FindMutable because we need to run the AutoUpdater at the end which will account // the deltas calculated from the MemoryTracker diff --git a/src/server/list_family.cc b/src/server/list_family.cc index 984e9f31dec4..ab16b0e63e75 100644 --- a/src/server/list_family.cc +++ b/src/server/list_family.cc @@ -180,7 +180,7 @@ std::string OpBPop(Transaction* t, EngineShard* shard, std::string_view key, Lis OpArgs op_args = t->GetOpArgs(shard); if (len == 0) { DVLOG(1) << "deleting key " << key << " " << t->DebugId(); - CHECK(op_args.GetDbSlice().Del(op_args.db_cntx, it)); + op_args.GetDbSlice().Del(op_args.db_cntx, it); } if (op_args.shard->journal()) { @@ -276,7 +276,7 @@ OpResult OpMoveSingleShard(const OpArgs& op_args, string_view src, strin dest_res.post_updater.Run(); if (prev_len == 1) { - CHECK(db_slice.Del(op_args.db_cntx, src_it)); + db_slice.Del(op_args.db_cntx, src_it); } return val; @@ -452,7 +452,7 @@ OpResult OpPop(const OpArgs& op_args, string_view key, ListDir dir, u it_res->post_updater.Run(); if (count == prev_len) { - CHECK(db_slice.Del(op_args.db_cntx, it)); + db_slice.Del(op_args.db_cntx, it); } if (op_args.shard->journal() && journal_rewrite) { @@ -765,7 +765,7 @@ OpResult OpRem(const OpArgs& op_args, 
string_view key, string_view ele it_res->post_updater.Run(); if (len == 0) { - CHECK(db_slice.Del(op_args.db_cntx, it)); + db_slice.Del(op_args.db_cntx, it); } return removed; @@ -840,7 +840,7 @@ OpStatus OpTrim(const OpArgs& op_args, string_view key, long start, long end) { it_res->post_updater.Run(); if (it->second.Size() == 0) { - CHECK(db_slice.Del(op_args.db_cntx, it)); + db_slice.Del(op_args.db_cntx, it); } return OpStatus::OK; } diff --git a/src/server/set_family.cc b/src/server/set_family.cc index a2ab2cbcf66b..f2fd67d0a4c3 100644 --- a/src/server/set_family.cc +++ b/src/server/set_family.cc @@ -443,9 +443,11 @@ OpResult OpAdd(const OpArgs& op_args, std::string_view key, const NewE // key if it exists. if (overwrite && (vals_it.begin() == vals_it.end())) { auto it = db_slice.FindMutable(op_args.db_cntx, key).it; // post_updater will run immediately - db_slice.Del(op_args.db_cntx, it); - if (journal_update && op_args.shard->journal()) { - RecordJournal(op_args, "DEL"sv, ArgSlice{key}); + if (IsValid(it)) { + db_slice.Del(op_args.db_cntx, it); + if (journal_update && op_args.shard->journal()) { + RecordJournal(op_args, "DEL"sv, ArgSlice{key}); + } } return 0; } @@ -561,7 +563,7 @@ OpResult OpRem(const OpArgs& op_args, string_view key, facade::ArgRang find_res->post_updater.Run(); if (isempty) { - CHECK(db_slice.Del(op_args.db_cntx, find_res->it)); + db_slice.Del(op_args.db_cntx, find_res->it); } if (journal_rewrite && op_args.shard->journal()) { vector mapped(vals.Size() + 1); @@ -886,7 +888,7 @@ OpResult OpPop(const OpArgs& op_args, string_view key, unsigned count // Delete the set as it is now empty find_res->post_updater.Run(); - CHECK(db_slice.Del(op_args.db_cntx, find_res->it)); + db_slice.Del(op_args.db_cntx, find_res->it); // Replicate as DEL. 
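// (An emptied set must reach replicas as an explicit DEL so they drop the key as well.)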
if (op_args.shard->journal()) { diff --git a/src/server/zset_family.cc b/src/server/zset_family.cc index fc440ad92037..60eda900fee0 100644 --- a/src/server/zset_family.cc +++ b/src/server/zset_family.cc @@ -964,7 +964,9 @@ OpResult OpAdd(const OpArgs& op_args, const ZParams& zparams, string_ if (zparams.override && members.empty()) { auto it = db_slice.FindMutable(op_args.db_cntx, key).it; // post_updater will run immediately - db_slice.Del(op_args.db_cntx, it); + if (IsValid(it)) { + db_slice.Del(op_args.db_cntx, it); + } return OpStatus::OK; } @@ -1219,7 +1221,7 @@ ScoredArray OpBZPop(Transaction* t, EngineShard* shard, std::string_view key, bo auto zlen = pv.Size(); if (zlen == 0) { DVLOG(1) << "deleting key " << key << " " << t->DebugId(); - CHECK(db_slice.Del(t->GetDbContext(), it_res->it)); + db_slice.Del(t->GetDbContext(), it_res->it); } OpArgs op_args = t->GetOpArgs(shard); @@ -1330,7 +1332,7 @@ auto OpPopCount(const ZSetFamily::ZRangeSpec& range_spec, const OpArgs& op_args, auto zlen = pv.Size(); if (zlen == 0) { - CHECK(op_args.GetDbSlice().Del(op_args.db_cntx, res_it->it)); + op_args.GetDbSlice().Del(op_args.db_cntx, res_it->it); } return iv.PopResult(); @@ -1384,7 +1386,7 @@ OpResult OpRemRange(const OpArgs& op_args, string_view key, auto zlen = pv.Size(); if (zlen == 0) { - CHECK(op_args.GetDbSlice().Del(op_args.db_cntx, res_it->it)); + op_args.GetDbSlice().Del(op_args.db_cntx, res_it->it); } return iv.removed(); @@ -1563,7 +1565,7 @@ OpResult OpRem(const OpArgs& op_args, string_view key, facade::ArgRang res_it->post_updater.Run(); if (zlen == 0) { - CHECK(op_args.GetDbSlice().Del(op_args.db_cntx, res_it->it)); + op_args.GetDbSlice().Del(op_args.db_cntx, res_it->it); } return deleted; From a3ef239ac7d94e908fae467816c673ae71c07d83 Mon Sep 17 00:00:00 2001 From: adiholden Date: Thu, 2 Jan 2025 12:16:21 +0200 Subject: [PATCH 11/20] feat(server): refactor allow preempt on journal record (#4393) * feat server: refactor allow preempt on journal record Signed-off-by: adi_holden --- src/server/cluster/cluster_family.cc | 2 +- src/server/db_slice.cc | 23 +++++--- src/server/db_slice.h | 2 +- src/server/dflycmd.cc | 2 +- src/server/engine_shard.cc | 88 +++++++++++++--------------- src/server/generic_family.cc | 4 +- src/server/journal/journal.cc | 8 ++- src/server/journal/journal.h | 29 ++++++++- src/server/journal/journal_slice.cc | 63 ++++++++++---------- src/server/journal/journal_slice.h | 11 +++- src/server/journal/streamer.cc | 1 + src/server/transaction.cc | 6 +- src/server/transaction.h | 4 +- src/server/tx_base.cc | 21 ++----- src/server/tx_base.h | 2 +- 15 files changed, 148 insertions(+), 118 deletions(-) diff --git a/src/server/cluster/cluster_family.cc b/src/server/cluster/cluster_family.cc index b76cb5ab404c..3d71befbcd1b 100644 --- a/src/server/cluster/cluster_family.cc +++ b/src/server/cluster/cluster_family.cc @@ -543,7 +543,7 @@ void WriteFlushSlotsToJournal(const SlotRanges& slot_ranges) { // TODO: Break slot migration upon FLUSHSLOTS journal->RecordEntry(/* txid= */ 0, journal::Op::COMMAND, /* dbid= */ 0, /* shard_cnt= */ shard_set->size(), nullopt, - Payload("DFLYCLUSTER", args_view), false); + Payload("DFLYCLUSTER", args_view)); }; shard_set->pool()->AwaitFiberOnAll(std::move(cb)); } diff --git a/src/server/db_slice.cc b/src/server/db_slice.cc index 90c10a54f624..5b6661616ef3 100644 --- a/src/server/db_slice.cc +++ b/src/server/db_slice.cc @@ -144,6 +144,8 @@ unsigned PrimeEvictionPolicy::GarbageCollect(const PrimeTable::HotspotBuckets& e if 
(db_slice_->WillBlockOnJournalWrite()) { return res; } + // Disable journal flushing to prevent preemption in GarbageCollect. + journal::JournalFlushGuard journal_flush_guard(db_slice_->shard_owner()->journal()); // bool should_print = (eb.key_hash % 128) == 0; @@ -172,6 +174,8 @@ unsigned PrimeEvictionPolicy::GarbageCollect(const PrimeTable::HotspotBuckets& e unsigned PrimeEvictionPolicy::Evict(const PrimeTable::HotspotBuckets& eb, PrimeTable* me) { if (!can_evict_ || db_slice_->WillBlockOnJournalWrite()) return 0; + // Disable journal flushing to prevent preemption in Evict. + journal::JournalFlushGuard journal_flush_guard(db_slice_->shard_owner()->journal()); constexpr size_t kNumStashBuckets = ABSL_ARRAYSIZE(eb.probes.by_type.stash_buckets); @@ -195,7 +199,7 @@ unsigned PrimeEvictionPolicy::Evict(const PrimeTable::HotspotBuckets& eb, PrimeT // log the evicted keys to journal. if (auto journal = db_slice_->shard_owner()->journal(); journal) { - RecordExpiry(cntx_.db_index, key, false); + RecordExpiry(cntx_.db_index, key); } db_slice_->PerformDeletion(DbSlice::Iterator(last_slot_it, StringOrView::FromView(key)), table); @@ -453,7 +457,7 @@ OpResult DbSlice::FindInternal(const Context& cntx, std: } if (res.it->second.HasExpire()) { // check expiry state - res = ExpireIfNeeded(cntx, res.it, true); + res = ExpireIfNeeded(cntx, res.it); if (!IsValid(res.it)) { return OpStatus::KEY_NOTFOUND; } @@ -1077,12 +1081,11 @@ void DbSlice::PostUpdate(DbIndex db_ind, Iterator it, std::string_view key, size } DbSlice::ItAndExp DbSlice::ExpireIfNeeded(const Context& cntx, Iterator it) const { - auto res = ExpireIfNeeded(cntx, it.GetInnerIt(), false); + auto res = ExpireIfNeeded(cntx, it.GetInnerIt()); return {.it = Iterator::FromPrime(res.it), .exp_it = ExpIterator::FromPrime(res.exp_it)}; } -DbSlice::PrimeItAndExp DbSlice::ExpireIfNeeded(const Context& cntx, PrimeIterator it, - bool preempts) const { +DbSlice::PrimeItAndExp DbSlice::ExpireIfNeeded(const Context& cntx, PrimeIterator it) const { if (!it->second.HasExpire()) { LOG(ERROR) << "Invalid call to ExpireIfNeeded"; return {it, ExpireIterator{}}; @@ -1112,7 +1115,7 @@ DbSlice::PrimeItAndExp DbSlice::ExpireIfNeeded(const Context& cntx, PrimeIterato // Replicate expiry if (auto journal = owner_->journal(); journal) { - RecordExpiry(cntx.db_index, key, preempts); + RecordExpiry(cntx.db_index, key); } if (expired_keys_events_recording_) @@ -1136,6 +1139,8 @@ void DbSlice::ExpireAllIfNeeded() { // We hold no locks to any of the keys so we should Wait() here such that // we don't preempt in ExpireIfNeeded block_counter_.Wait(); + // Disable journal flushing to prevent preemption in Traverse.
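+ // (A flush may preempt the fiber, letting others mutate the table mid-Traverse - the same hazard called out in OpScan.)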
+ journal::JournalFlushGuard journal_flush_guard(owner_->journal()); for (DbIndex db_index = 0; db_index < db_arr_.size(); db_index++) { if (!db_arr_[db_index]) @@ -1148,7 +1153,7 @@ void DbSlice::ExpireAllIfNeeded() { LOG(ERROR) << "Expire entry " << exp_it->first.ToString() << " not found in prime table"; return; } - ExpireIfNeeded(Context{nullptr, db_index, GetCurrentTimeMs()}, prime_it, false); + ExpireIfNeeded(Context{nullptr, db_index, GetCurrentTimeMs()}, prime_it); }; ExpireTable::Cursor cursor; @@ -1212,7 +1217,7 @@ auto DbSlice::DeleteExpiredStep(const Context& cntx, unsigned count) -> DeleteEx auto prime_it = db.prime.Find(it->first); CHECK(!prime_it.is_done()); result.deleted_bytes += prime_it->first.MallocUsed() + prime_it->second.MallocUsed(); - ExpireIfNeeded(cntx, prime_it, false); + ExpireIfNeeded(cntx, prime_it); ++result.deleted; } else { result.survivor_ttl_sum += ttl; @@ -1280,7 +1285,7 @@ pair DbSlice::FreeMemWithEvictionStep(DbIndex db_ind, size_t s // fiber preemption could happen in this phase. for (string_view key : keys_to_journal) { if (auto journal = owner_->journal(); journal) - RecordExpiry(db_ind, key, false); + RecordExpiry(db_ind, key); if (expired_keys_events_recording_) db_table->expired_keys_events_.emplace_back(key); diff --git a/src/server/db_slice.h b/src/server/db_slice.h index 263fa26b318c..4f442910e157 100644 --- a/src/server/db_slice.h +++ b/src/server/db_slice.h @@ -553,7 +553,7 @@ class DbSlice { ExpireIterator exp_it; }; - PrimeItAndExp ExpireIfNeeded(const Context& cntx, PrimeIterator it, bool preempts = false) const; + PrimeItAndExp ExpireIfNeeded(const Context& cntx, PrimeIterator it) const; OpResult AddOrFindInternal(const Context& cntx, std::string_view key); diff --git a/src/server/dflycmd.cc b/src/server/dflycmd.cc index 5e42e2cdcdf3..547fefb3ed3f 100644 --- a/src/server/dflycmd.cc +++ b/src/server/dflycmd.cc @@ -78,7 +78,7 @@ bool WaitReplicaFlowToCatchup(absl::Time end_time, const DflyCmd::ReplicaInfo* r // We don't want any writes to the journal after we send the `PING`, // and expirations could ruin that. namespaces->GetDefaultNamespace().GetDbSlice(shard->shard_id()).SetExpireAllowed(false); - shard->journal()->RecordEntry(0, journal::Op::PING, 0, 0, nullopt, {}, true); + shard->journal()->RecordEntry(0, journal::Op::PING, 0, 0, nullopt, {}); const FlowInfo* flow = &replica->flows[shard->shard_id()]; while (flow->last_acked_lsn < shard->journal()->GetLsn()) { diff --git a/src/server/engine_shard.cc b/src/server/engine_shard.cc index f637f1a116a2..ddb67476057b 100644 --- a/src/server/engine_shard.cc +++ b/src/server/engine_shard.cc @@ -12,8 +12,8 @@ extern "C" { #include "redis/zmalloc.h" } - #include "server/engine_shard_set.h" +#include "server/journal/journal.h" #include "server/namespaces.h" #include "server/search/doc_index.h" #include "server/server_state.h" @@ -756,62 +756,56 @@ void EngineShard::Heartbeat() { } void EngineShard::RetireExpiredAndEvict() { - { - FiberAtomicGuard guard; - // TODO: iterate over all namespaces - DbSlice& db_slice = namespaces->GetDefaultNamespace().GetDbSlice(shard_id()); - constexpr double kTtlDeleteLimit = 200; - - uint32_t traversed = GetMovingSum6(TTL_TRAVERSE); - uint32_t deleted = GetMovingSum6(TTL_DELETE); - unsigned ttl_delete_target = 5; - - if (deleted > 10) { - // deleted should be <= traversed. - // hence we map our delete/traversed ratio into a range [0, kTtlDeleteLimit). - // The higher ttl_delete_target the more likely we have lots of expired items that need - // to be deleted. 
- ttl_delete_target = kTtlDeleteLimit * double(deleted) / (double(traversed) + 10); - } + // Disable journal flushing to prevent preemption. + journal::JournalFlushGuard journal_flush_guard(journal_); - DbContext db_cntx; - db_cntx.time_now_ms = GetCurrentTimeMs(); + // TODO: iterate over all namespaces + DbSlice& db_slice = namespaces->GetDefaultNamespace().GetDbSlice(shard_id()); + constexpr double kTtlDeleteLimit = 200; - size_t eviction_goal = GetFlag(FLAGS_enable_heartbeat_eviction) ? CalculateEvictionBytes() : 0; + uint32_t traversed = GetMovingSum6(TTL_TRAVERSE); + uint32_t deleted = GetMovingSum6(TTL_DELETE); + unsigned ttl_delete_target = 5; - for (unsigned i = 0; i < db_slice.db_array_size(); ++i) { - if (!db_slice.IsDbValid(i)) - continue; + if (deleted > 10) { + // deleted should be <= traversed. + // hence we map our delete/traversed ratio into a range [0, kTtlDeleteLimit). + // The higher ttl_delete_target the more likely we have lots of expired items that need + // to be deleted. + ttl_delete_target = kTtlDeleteLimit * double(deleted) / (double(traversed) + 10); + } - db_cntx.db_index = i; - auto [pt, expt] = db_slice.GetTables(i); - if (expt->size() > pt->size() / 4) { - DbSlice::DeleteExpiredStats stats = db_slice.DeleteExpiredStep(db_cntx, ttl_delete_target); + DbContext db_cntx; + db_cntx.time_now_ms = GetCurrentTimeMs(); - eviction_goal -= std::min(eviction_goal, size_t(stats.deleted_bytes)); - counter_[TTL_TRAVERSE].IncBy(stats.traversed); - counter_[TTL_DELETE].IncBy(stats.deleted); + size_t eviction_goal = GetFlag(FLAGS_enable_heartbeat_eviction) ? CalculateEvictionBytes() : 0; - if (eviction_goal) { - uint32_t starting_segment_id = rand() % pt->GetSegmentCount(); - auto [evicted_items, evicted_bytes] = - db_slice.FreeMemWithEvictionStep(i, starting_segment_id, eviction_goal); + for (unsigned i = 0; i < db_slice.db_array_size(); ++i) { + if (!db_slice.IsDbValid(i)) + continue; - DVLOG(2) << "Heartbeat eviction: Expected to evict " << eviction_goal - << " bytes. Actually evicted " << evicted_items << " items, " << evicted_bytes - << " bytes. Max eviction per heartbeat: " - << GetFlag(FLAGS_max_eviction_per_heartbeat); + db_cntx.db_index = i; + auto [pt, expt] = db_slice.GetTables(i); + if (expt->size() > pt->size() / 4) { + DbSlice::DeleteExpiredStats stats = db_slice.DeleteExpiredStep(db_cntx, ttl_delete_target); - eviction_goal -= std::min(eviction_goal, evicted_bytes); - } + eviction_goal -= std::min(eviction_goal, size_t(stats.deleted_bytes)); + counter_[TTL_TRAVERSE].IncBy(stats.traversed); + counter_[TTL_DELETE].IncBy(stats.deleted); } - } - // Journal entries for expired entries are not writen to socket in the loop above. - // Trigger write to socket when loop finishes. - if (auto journal = EngineShard::tlocal()->journal(); journal) { - TriggerJournalWriteToSink(); + if (eviction_goal) { + uint32_t starting_segment_id = rand() % pt->GetSegmentCount(); + auto [evicted_items, evicted_bytes] = + db_slice.FreeMemWithEvictionStep(i, starting_segment_id, eviction_goal); + + DVLOG(2) << "Heartbeat eviction: Expected to evict " << eviction_goal + << " bytes. Actually evicted " << evicted_items << " items, " << evicted_bytes + << " bytes.
Max eviction per heartbeat: " + << GetFlag(FLAGS_max_eviction_per_heartbeat); + + eviction_goal -= std::min(eviction_goal, evicted_bytes); + } } } diff --git a/src/server/generic_family.cc b/src/server/generic_family.cc index 1e0cb750834f..17725d1c79b5 100644 --- a/src/server/generic_family.cc +++ b/src/server/generic_family.cc @@ -616,7 +616,8 @@ void OpScan(const OpArgs& op_args, const ScanOpts& scan_opts, uint64_t* cursor, // we enter the callback in a timing when journaling will not cause preemptions. Otherwise, // the bucket might change as we Traverse and yield. db_slice.BlockingCounter()->Wait(); - + // Disable journal flushing to prevent preemption in Traverse. + journal::JournalFlushGuard journal_flush_guard(op_args.shard->journal()); util::FiberAtomicGuard guard; unsigned cnt = 0; @@ -630,6 +631,7 @@ void OpScan(const OpArgs& op_args, const ScanOpts& scan_opts, uint64_t* cursor, cur = prime_table->Traverse( cur, [&](PrimeIterator it) { cnt += ScanCb(op_args, it, scan_opts, &scratch, vec); }); } while (cur && cnt < scan_opts.limit); + VLOG(1) << "OpScan " << db_slice.shard_id() << " cursor: " << cur.value(); *cursor = cur.value(); } diff --git a/src/server/journal/journal.cc b/src/server/journal/journal.cc index e1e2f994e74a..3e4212eb7e89 100644 --- a/src/server/journal/journal.cc +++ b/src/server/journal/journal.cc @@ -84,8 +84,12 @@ LSN Journal::GetLsn() const { } void Journal::RecordEntry(TxId txid, Op opcode, DbIndex dbid, unsigned shard_cnt, - std::optional slot, Entry::Payload payload, bool await) { - journal_slice.AddLogRecord(Entry{txid, opcode, dbid, shard_cnt, slot, std::move(payload)}, await); + std::optional slot, Entry::Payload payload) { + journal_slice.AddLogRecord(Entry{txid, opcode, dbid, shard_cnt, slot, std::move(payload)}); +} + +void Journal::SetFlushMode(bool allow_flush) { + journal_slice.SetFlushMode(allow_flush); } } // namespace journal diff --git a/src/server/journal/journal.h b/src/server/journal/journal.h index 2c2be714a960..ae275471a21d 100644 --- a/src/server/journal/journal.h +++ b/src/server/journal/journal.h @@ -3,8 +3,8 @@ // #pragma once - #include "server/journal/types.h" +#include "util/fibers/detail/fiber_interface.h" #include "util/proactor_pool.h" namespace dfly { @@ -35,11 +35,36 @@ class Journal { LSN GetLsn() const; void RecordEntry(TxId txid, Op opcode, DbIndex dbid, unsigned shard_cnt, - std::optional slot, Entry::Payload payload, bool await); + std::optional slot, Entry::Payload payload); + + void SetFlushMode(bool allow_flush); private: mutable util::fb2::Mutex state_mu_; }; +class JournalFlushGuard { + public: + explicit JournalFlushGuard(Journal* journal) : journal_(journal) { + if (journal_) { + journal_->SetFlushMode(false); + } + util::fb2::detail::EnterFiberAtomicSection(); + } + + ~JournalFlushGuard() { + util::fb2::detail::LeaveFiberAtomicSection(); + if (journal_) { + journal_->SetFlushMode(true); // Restore the state on destruction + } + } + + JournalFlushGuard(const JournalFlushGuard&) = delete; + JournalFlushGuard& operator=(const JournalFlushGuard&) = delete; + + private: + Journal* journal_; +}; + } // namespace journal } // namespace dfly diff --git a/src/server/journal/journal_slice.cc b/src/server/journal/journal_slice.cc index c3e9c9680bd2..48bfe722e460 100644 --- a/src/server/journal/journal_slice.cc +++ b/src/server/journal/journal_slice.cc @@ -142,34 +142,37 @@ std::string_view JournalSlice::GetEntry(LSN lsn) const { return (*ring_buffer_)[lsn - start].data; } -void JournalSlice::AddLogRecord(const Entry&
entry, bool await) { +void JournalSlice::SetFlushMode(bool allow_flush) { + DCHECK(allow_flush != enable_journal_flush_); + enable_journal_flush_ = allow_flush; + if (allow_flush) { + JournalItem item; + item.lsn = -1; + item.opcode = Op::NOOP; + item.data = ""; + item.slot = {}; + CallOnChange(item); + } +} + +void JournalSlice::AddLogRecord(const Entry& entry) { DCHECK(ring_buffer_); - JournalItem dummy; - JournalItem* item; - if (entry.opcode == Op::NOOP) { - item = &dummy; - item->lsn = -1; - item->opcode = entry.opcode; - item->data = ""; - item->slot = entry.slot; - } else { + JournalItem item; + { FiberAtomicGuard fg; - // GetTail gives a pointer to a new tail entry in the buffer, possibly overriding the last entry - // if the buffer is full. - item = &dummy; - item->opcode = entry.opcode; - item->lsn = lsn_++; - item->cmd = entry.payload.cmd; - item->slot = entry.slot; + item.opcode = entry.opcode; + item.lsn = lsn_++; + item.cmd = entry.payload.cmd; + item.slot = entry.slot; io::BufSink buf_sink{&ring_serialize_buf_}; JournalWriter writer{&buf_sink}; writer.Write(entry); - item->data = io::View(ring_serialize_buf_.InputBuffer()); + item.data = io::View(ring_serialize_buf_.InputBuffer()); ring_serialize_buf_.Clear(); - VLOG(2) << "Writing item [" << item->lsn << "]: " << entry.ToString(); + VLOG(2) << "Writing item [" << item.lsn << "]: " << entry.ToString(); } #if 0 @@ -180,19 +183,17 @@ void JournalSlice::AddLogRecord(const Entry& entry, bool await) { file_offset_ += line.size(); } #endif + CallOnChange(item); +} - // TODO: Remove the callbacks, replace with notifiers - { - std::shared_lock lk(cb_mu_); - DVLOG(2) << "AddLogRecord: run callbacks for " << entry.ToString() - << " num callbacks: " << change_cb_arr_.size(); - - const size_t size = change_cb_arr_.size(); - auto k_v = change_cb_arr_.begin(); - for (size_t i = 0; i < size; ++i) { - k_v->second(*item, await); - ++k_v; - } +void JournalSlice::CallOnChange(const JournalItem& item) { + std::shared_lock lk(cb_mu_); + + const size_t size = change_cb_arr_.size(); + auto k_v = change_cb_arr_.begin(); + for (size_t i = 0; i < size; ++i) { + k_v->second(item, enable_journal_flush_); + ++k_v; } } diff --git a/src/server/journal/journal_slice.h b/src/server/journal/journal_slice.h index 8534d78f7aae..da0b18ea7918 100644 --- a/src/server/journal/journal_slice.h +++ b/src/server/journal/journal_slice.h @@ -37,7 +37,7 @@ class JournalSlice { return slice_index_ != UINT32_MAX; } - void AddLogRecord(const Entry& entry, bool await); + void AddLogRecord(const Entry& entry); // Register a callback that will be called every time a new entry is // added to the journal. @@ -54,8 +54,16 @@ class JournalSlice { /// from the buffer. bool IsLSNInBuffer(LSN lsn) const; std::string_view GetEntry(LSN lsn) const; + // SetFlushMode with allow_flush=false is used to disable preemptions during + // subsequent calls to AddLogRecord. + // SetFlushMode with allow_flush=true flushes all log records aggregated + // since the last call with allow_flush=false. This call may preempt. + // The caller must ensure that no preemptions occur between the initial call + // with allow_flush=false and the subsequent call with allow_flush=true. 
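+ // In practice the two calls are paired by journal::JournalFlushGuard (journal.h) together with a fiber-atomic section.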
+ void SetFlushMode(bool allow_flush); private: + void CallOnChange(const JournalItem& item); // std::string shard_path_; // std::unique_ptr shard_file_; std::optional> ring_buffer_; @@ -69,6 +77,7 @@ uint32_t slice_index_ = UINT32_MAX; uint32_t next_cb_id_ = 1; std::error_code status_ec_; + bool enable_journal_flush_ = true; }; } // namespace journal diff --git a/src/server/journal/streamer.cc b/src/server/journal/streamer.cc index 90d94ca6ef9c..91480a181341 100644 --- a/src/server/journal/streamer.cc +++ b/src/server/journal/streamer.cc @@ -56,6 +56,7 @@ void JournalStreamer::Start(util::FiberSocketBase* dest, bool send_lsn) { if (allow_await) { ThrottleIfNeeded(); // No record to write, just await if data was written so consumer will read the data. + // TODO: shouldn't we trigger an async write on NOOP? if (item.opcode == Op::NOOP) return; } diff --git a/src/server/transaction.cc b/src/server/transaction.cc index 36372a2a4e39..ddfc54021195 100644 --- a/src/server/transaction.cc +++ b/src/server/transaction.cc @@ -1474,15 +1474,15 @@ void Transaction::LogAutoJournalOnShard(EngineShard* shard, RunnableResult resul } // Record to journal autojournal commands, here we allow await which enables writing to sync the journal change. - LogJournalOnShard(shard, std::move(entry_payload), unique_shard_cnt_, true); + LogJournalOnShard(shard, std::move(entry_payload), unique_shard_cnt_); } void Transaction::LogJournalOnShard(EngineShard* shard, journal::Entry::Payload&& payload, - uint32_t shard_cnt, bool allow_await) const { + uint32_t shard_cnt) const { auto journal = shard->journal(); CHECK(journal); journal->RecordEntry(txid_, journal::Op::COMMAND, db_index_, shard_cnt, - unique_slot_checker_.GetUniqueSlotId(), std::move(payload), allow_await); + unique_slot_checker_.GetUniqueSlotId(), std::move(payload)); } void Transaction::ReviveAutoJournal() { diff --git a/src/server/transaction.h b/src/server/transaction.h index 5e209eb96d98..22fa79374fc4 100644 --- a/src/server/transaction.h +++ b/src/server/transaction.h @@ -341,8 +341,8 @@ class Transaction { void PrepareMultiForScheduleSingleHop(Namespace* ns, ShardId sid, DbIndex db, CmdArgList args); // Write a journal entry to a shard journal with the given payload. - void LogJournalOnShard(EngineShard* shard, journal::Entry::Payload&& payload, uint32_t shard_cnt, - bool allow_await) const; + void LogJournalOnShard(EngineShard* shard, journal::Entry::Payload&& payload, + uint32_t shard_cnt) const; // Re-enable auto journal for commands marked as NO_AUTOJOURNAL. Call during setup.
void ReviveAutoJournal(); diff --git a/src/server/tx_base.cc b/src/server/tx_base.cc index 70436f8c2ab3..83bfefa9336a 100644 --- a/src/server/tx_base.cc +++ b/src/server/tx_base.cc @@ -53,32 +53,21 @@ size_t ShardArgs::Size() const { void RecordJournal(const OpArgs& op_args, string_view cmd, const ShardArgs& args, uint32_t shard_cnt) { VLOG(2) << "Logging command " << cmd << " from txn " << op_args.tx->txid(); - op_args.tx->LogJournalOnShard(op_args.shard, Payload(cmd, args), shard_cnt, true); + op_args.tx->LogJournalOnShard(op_args.shard, Payload(cmd, args), shard_cnt); } void RecordJournal(const OpArgs& op_args, std::string_view cmd, facade::ArgSlice args, uint32_t shard_cnt) { VLOG(2) << "Logging command " << cmd << " from txn " << op_args.tx->txid(); - op_args.tx->LogJournalOnShard(op_args.shard, Payload(cmd, args), shard_cnt, true); + op_args.tx->LogJournalOnShard(op_args.shard, Payload(cmd, args), shard_cnt); } -void RecordExpiry(DbIndex dbid, string_view key, bool preempts) { +void RecordExpiry(DbIndex dbid, string_view key) { auto journal = EngineShard::tlocal()->journal(); CHECK(journal); - if (!preempts) { - util::FiberAtomicGuard guard; - journal->RecordEntry(0, journal::Op::EXPIRED, dbid, 1, cluster::KeySlot(key), - Payload("DEL", ArgSlice{key}), preempts); - return; - } - journal->RecordEntry(0, journal::Op::EXPIRED, dbid, 1, cluster::KeySlot(key), - Payload("DEL", ArgSlice{key}), preempts); -} -void TriggerJournalWriteToSink() { - auto journal = EngineShard::tlocal()->journal(); - CHECK(journal); - journal->RecordEntry(0, journal::Op::NOOP, 0, 0, nullopt, {}, true); + journal->RecordEntry(0, journal::Op::EXPIRED, dbid, 1, cluster::KeySlot(key), + Payload("DEL", ArgSlice{key})); } LockTag::LockTag(std::string_view key) { diff --git a/src/server/tx_base.h b/src/server/tx_base.h index 1253788864ef..af7091a62103 100644 --- a/src/server/tx_base.h +++ b/src/server/tx_base.h @@ -224,7 +224,7 @@ void RecordJournal(const OpArgs& op_args, std::string_view cmd, ArgSlice args, // Record expiry in journal with independent transaction. Must be called from shard thread holding // key. -void RecordExpiry(DbIndex dbid, std::string_view key, bool preempts = false); +void RecordExpiry(DbIndex dbid, std::string_view key); // Trigger journal write to sink, no journal record will be added to journal. // Must be called from shard thread of journal to sink. 
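Taken together, patch 11 replaces the per-record await/preempts flags with a scoped flush mode. A minimal usage sketch of the new API (hypothetical call site; `shard`, `db_index` and the keys are illustrative, not part of the patch):

  {
    // Construction disables journal flushing and enters a fiber-atomic section.
    journal::JournalFlushGuard guard(shard->journal());

    // Records added here are serialized and handed to the journal callbacks
    // with flushing disabled, so no fiber yields occur in between.
    RecordExpiry(db_index, key1);
    RecordExpiry(db_index, key2);
  }
  // Destruction leaves the atomic section and calls SetFlushMode(true), which
  // emits a NOOP JournalItem so registered callbacks flush everything
  // aggregated above. This is why callers must not preempt inside the scope.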
From 0832d23f13c8a49582f266e2851938cc6f12f96d Mon Sep 17 00:00:00 2001 From: Borys Date: Thu, 2 Jan 2025 13:50:21 +0200 Subject: [PATCH 12/20] fix: allow cluster node load snapshot bigger than maxmemory (#4394) --- src/server/rdb_load.cc | 11 ++++-- tests/dragonfly/cluster_test.py | 59 +++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/src/server/rdb_load.cc b/src/server/rdb_load.cc index d227cd8231e0..383b218dab7a 100644 --- a/src/server/rdb_load.cc +++ b/src/server/rdb_load.cc @@ -2464,9 +2464,14 @@ error_code RdbLoader::HandleAux() { if (absl::SimpleAtoi(auxval, &usedmem)) { VLOG(1) << "RDB memory usage when created " << strings::HumanReadableNumBytes(usedmem); if (usedmem > ssize_t(max_memory_limit)) { - LOG(WARNING) << "Could not load snapshot - its used memory is " << usedmem - << " but the limit is " << max_memory_limit; - return RdbError(errc::out_of_memory); + if (cluster::IsClusterEnabled()) { + LOG(INFO) << "Attempting to load a snapshot of size " << usedmem + << ", despite memory limit of " << max_memory_limit; + } else { + LOG(WARNING) << "Could not load snapshot - its used memory is " << usedmem + << " but the limit is " << max_memory_limit; + return RdbError(errc::out_of_memory); + } } } } else if (auxkey == "aof-preamble") { diff --git a/tests/dragonfly/cluster_test.py b/tests/dragonfly/cluster_test.py index b90d486650a2..5d9f9e6a9dc9 100644 --- a/tests/dragonfly/cluster_test.py +++ b/tests/dragonfly/cluster_test.py @@ -2499,3 +2499,62 @@ async def test_migration_timeout_on_sync(df_factory: DflyInstanceFactory, df_see await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes]) assert (await StaticSeeder.capture(nodes[1].client)) == start_capture + + +@pytest.mark.slow +@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"}) +async def test_snapshot_bigger_than_maxmemory(df_factory: DflyInstanceFactory, df_seeder_factory): + """ + Test loading a snapshot that is bigger than max_memory, but contains more slots and should load without OOM: + + 1) Create snapshot + 2) split slots between 2 instances and reduce maxmemory + 3) load snapshot to both instances + + The result should be the same: instances contain all the data that was in the snapshot + """ + dbfilename = f"dump_{tmp_file_name()}" + instances = [ + df_factory.create( + port=next(next_port), admin_port=next(next_port), maxmemory="3G", dbfilename=dbfilename + ), + df_factory.create(port=next(next_port), admin_port=next(next_port), maxmemory="1G"), + ] + df_factory.start_all(instances) + + nodes = [await create_node_info(n) for n in instances] + + nodes[0].slots = [(0, 16383)] + nodes[1].slots = [] + + logging.debug("Push initial config") + await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes]) + + logging.debug("create data") + seeder = df_seeder_factory.create( + keys=30000, val_size=10000, port=nodes[0].instance.port, cluster_mode=True + ) + await seeder.run(target_deviation=0.05) + capture = await seeder.capture() + + logging.debug("SAVE") + await nodes[0].client.execute_command("SAVE", "rdb") + + logging.debug("flushall") + for node in nodes: + await node.client.execute_command("flushall") + await node.client.execute_command("CONFIG SET maxmemory 1G") + + nodes[0].slots = [(0, 8191)] + nodes[1].slots = [(8192, 16383)] + + await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes]) + + for node in nodes: + await node.client.execute_command("DFLY", "LOAD",
f"{dbfilename}.rdb") + + assert await seeder.compare(capture, nodes[0].instance.port) + + # prevent saving during shutdown + for node in nodes: + await node.client.execute_command("flushall") From 6e9409c65c412f264089d07d6e579f09a38544b7 Mon Sep 17 00:00:00 2001 From: Roman Gershman Date: Thu, 2 Jan 2025 15:49:13 +0200 Subject: [PATCH 13/20] fix: properly clear tiered stashes upon expirations (#4395) Fixes #4387 Signed-off-by: Roman Gershman --- src/server/db_slice.cc | 8 ++++++-- src/server/dfly_bench.cc | 2 +- src/server/tiered_storage.cc | 2 +- src/server/tiered_storage_test.cc | 9 +++++++++ 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/server/db_slice.cc b/src/server/db_slice.cc index 5b6661616ef3..d765caf5cb5d 100644 --- a/src/server/db_slice.cc +++ b/src/server/db_slice.cc @@ -1479,9 +1479,13 @@ void DbSlice::PerformDeletion(Iterator del_it, ExpIterator exp_it, DbTable* tabl } DbTableStats& stats = table->stats; - const PrimeValue& pv = del_it->second; + PrimeValue& pv = del_it->second; - if (pv.IsExternal() && shard_owner()->tiered_storage()) { + if (pv.HasStashPending()) { + string scratch; + string_view key = del_it->first.GetSlice(&scratch); + shard_owner()->tiered_storage()->CancelStash(table->index, key, &pv); + } else if (pv.IsExternal()) { shard_owner()->tiered_storage()->Delete(table->index, &del_it->second); } diff --git a/src/server/dfly_bench.cc b/src/server/dfly_bench.cc index 8d0d974a7222..8d2093a1cc79 100644 --- a/src/server/dfly_bench.cc +++ b/src/server/dfly_bench.cc @@ -487,7 +487,7 @@ void Driver::ParseRESP() { PopRequest(); } io_buf_.ConsumeInput(consumed); - } while (result == RedisParser::OK); + } while (result == RedisParser::OK && io_buf_.InputLen() > 0); } void Driver::ParseMC() { diff --git a/src/server/tiered_storage.cc b/src/server/tiered_storage.cc index eb323aab9c84..181320825057 100644 --- a/src/server/tiered_storage.cc +++ b/src/server/tiered_storage.cc @@ -391,7 +391,7 @@ bool TieredStorage::TryStash(DbIndex dbid, string_view key, PrimeValue* value) { return false; // This invariant should always hold because ShouldStash tests for IoPending flag. - DCHECK(!bins_->IsPending(dbid, key)); + CHECK(!bins_->IsPending(dbid, key)); // TODO: When we are low on memory we should introduce a back-pressure, to avoid OOMs // with a lot of underutilized disk space. 
diff --git a/src/server/tiered_storage_test.cc b/src/server/tiered_storage_test.cc index 038d72e9c9c0..cb13032bb1b3 100644 --- a/src/server/tiered_storage_test.cc +++ b/src/server/tiered_storage_test.cc @@ -317,4 +317,13 @@ TEST_F(TieredStorageTest, MemoryPressure) { EXPECT_LT(used_mem_peak.load(), 20_MB); } +TEST_F(TieredStorageTest, Expiry) { + string val = BuildString(100); + Run({"psetex", "key1", "1", val}); + AdvanceTime(10); + Run({"psetex", "key1", "1", val}); + auto resp = Run({"get", "key1"}); + EXPECT_EQ(resp, val); +} + } // namespace dfly From 4f09fe036c1a291179040251187220cba2b8447d Mon Sep 17 00:00:00 2001 From: Roman Gershman Date: Fri, 3 Jan 2025 15:06:24 +0200 Subject: [PATCH 14/20] feat: support deletions with meta protocol (#4398) Signed-off-by: Roman Gershman --- src/facade/reply_builder.cc | 7 +++++-- src/facade/reply_builder.h | 1 + src/server/generic_family.cc | 2 +- tests/dragonfly/memcache_meta.py | 2 ++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/facade/reply_builder.cc b/src/facade/reply_builder.cc index d398ecbf8f98..8d372c3c4244 100644 --- a/src/facade/reply_builder.cc +++ b/src/facade/reply_builder.cc @@ -283,6 +283,10 @@ void MCReplyBuilder::SendMiss() { SendSimpleString("EN"); } +void MCReplyBuilder::SendDeleted() { + SendSimpleString(flag_.meta ? "HD" : "DELETED"); +} + void MCReplyBuilder::SendRaw(std::string_view str) { ReplyScope scope(this); WriteRef(str); @@ -422,7 +426,7 @@ void RedisReplyBuilder::SendScoredArray(ScoredArray arr, bool with_scores) { void RedisReplyBuilder::SendLabeledScoredArray(std::string_view arr_label, ScoredArray arr) { ReplyScope scope(this); - + StartArray(2); SendBulkString(arr_label); @@ -432,7 +436,6 @@ void RedisReplyBuilder::SendLabeledScoredArray(std::string_view arr_label, Score SendBulkString(str); SendDouble(score); } - } void RedisReplyBuilder::SendStored() { diff --git a/src/facade/reply_builder.h b/src/facade/reply_builder.h index 8c2bf7ee38c6..8b6a5a1a3926 100644 --- a/src/facade/reply_builder.h +++ b/src/facade/reply_builder.h @@ -172,6 +172,7 @@ class MCReplyBuilder : public SinkReplyBuilder { void SendClientError(std::string_view str); void SendNotFound(); void SendMiss(); + void SendDeleted(); void SendGetEnd(); void SendValue(std::string_view key, std::string_view value, uint64_t mc_ver, uint32_t mc_flag); diff --git a/src/server/generic_family.cc b/src/server/generic_family.cc index 17725d1c79b5..efe4fea221c2 100644 --- a/src/server/generic_family.cc +++ b/src/server/generic_family.cc @@ -1023,7 +1023,7 @@ void DeleteGeneric(CmdArgList args, const CommandContext& cmd_cntx) { if (del_cnt == 0) { mc_builder->SendNotFound(); } else { - mc_builder->SendSimpleString("DELETED"); + mc_builder->SendDeleted(); } } else { builder->SendLong(del_cnt); diff --git a/tests/dragonfly/memcache_meta.py b/tests/dragonfly/memcache_meta.py index 87bbd1fcf648..243339d319ea 100644 --- a/tests/dragonfly/memcache_meta.py +++ b/tests/dragonfly/memcache_meta.py @@ -28,3 +28,5 @@ def test_basic(df_server: DflyInstance): response = pool.meta_get(Key("key1"), flags=request_flags) assert isinstance(response, Success) assert pool.get("key2") is None + assert pool.delete("key1") + assert pool.delete("key1") is False From 92c3749c8c4055b1ab5560a3a8b5a85c39e4d08d Mon Sep 17 00:00:00 2001 From: adiholden Date: Sun, 5 Jan 2025 13:18:13 +0200 Subject: [PATCH 15/20] fix(tests): check cluster big snapshot in unit test (#4403) fix tests: check cluster big snapshot in unit test Signed-off-by: adi_holden --- 
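(Note: the unit test below drives the same HandleAux branch that patch 12 added to rdb_load.cc. Condensed from that hunk for reference - the non-cluster arm also logs a warning first:

  if (usedmem > ssize_t(max_memory_limit)) {
    if (cluster::IsClusterEnabled()) {
      LOG(INFO) << "Attempting to load a snapshot of size " << usedmem
                << ", despite memory limit of " << max_memory_limit;
    } else {
      return RdbError(errc::out_of_memory);  // non-cluster mode refuses to load
    }
  }

Populating, saving, and then lowering max_memory_limit before DFLY LOAD is what forces the `usedmem > max_memory_limit` condition in the test.)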
src/server/cluster/cluster_family_test.cc | 12 +++++ tests/dragonfly/cluster_test.py | 59 ----------------------- 2 files changed, 12 insertions(+), 59 deletions(-) diff --git a/src/server/cluster/cluster_family_test.cc b/src/server/cluster/cluster_family_test.cc index 074ba2f5abfb..8c609a952e12 100644 --- a/src/server/cluster/cluster_family_test.cc +++ b/src/server/cluster/cluster_family_test.cc @@ -619,6 +619,18 @@ TEST_F(ClusterFamilyTest, ClusterFirstConfigCallDropsEntriesNotOwnedByNode) { ExpectConditionWithinTimeout([&]() { return CheckedInt({"dbsize"}) == 0; }); } +TEST_F(ClusterFamilyTest, SnapshotBiggerThanMaxMemory) { + InitWithDbFilename(); + ConfigSingleNodeCluster(GetMyId()); + + Run({"debug", "populate", "50000"}); + EXPECT_EQ(Run({"save", "df"}), "OK"); + + max_memory_limit = 10000; + auto save_info = service_->server_family().GetLastSaveInfo(); + EXPECT_EQ(Run({"dfly", "load", save_info.file_name}), "OK"); +} + TEST_F(ClusterFamilyTest, Keyslot) { // Example from Redis' command reference: https://redis.io/commands/cluster-keyslot/ EXPECT_THAT(Run({"cluster", "keyslot", "somekey"}), IntArg(11'058)); diff --git a/tests/dragonfly/cluster_test.py b/tests/dragonfly/cluster_test.py index 5d9f9e6a9dc9..b90d486650a2 100644 --- a/tests/dragonfly/cluster_test.py +++ b/tests/dragonfly/cluster_test.py @@ -2499,62 +2499,3 @@ async def test_migration_timeout_on_sync(df_factory: DflyInstanceFactory, df_see await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes]) assert (await StaticSeeder.capture(nodes[1].client)) == start_capture - - -@pytest.mark.slow -@dfly_args({"proactor_threads": 4, "cluster_mode": "yes"}) -async def test_snapshot_bigger_than_maxmemory(df_factory: DflyInstanceFactory, df_seeder_factory): - """ - Test load snapshot that is bigger than max_memory, but contains more slots and should be load without OOM: - - 1) Create snapshot - 2) split slots between 2 instances and reduce maxmemory - 3) load snapshot to both instances - - The result should be the same: instances contain all the data that was in snapshot - """ - dbfilename = f"dump_{tmp_file_name()}" - instances = [ - df_factory.create( - port=next(next_port), admin_port=next(next_port), maxmemory="3G", dbfilename=dbfilename - ), - df_factory.create(port=next(next_port), admin_port=next(next_port), maxmemory="1G"), - ] - df_factory.start_all(instances) - - nodes = [await create_node_info(n) for n in instances] - - nodes[0].slots = [(0, 16383)] - nodes[1].slots = [] - - logging.debug("Push initial config") - await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes]) - - logging.debug("create data") - seeder = df_seeder_factory.create( - keys=30000, val_size=10000, port=nodes[0].instance.port, cluster_mode=True - ) - await seeder.run(target_deviation=0.05) - capture = await seeder.capture() - - logging.debug("SAVE") - await nodes[0].client.execute_command("SAVE", "rdb") - - logging.debug("flushall") - for node in nodes: - await node.client.execute_command("flushall") - await node.client.execute_command("CONFIG SET maxmemory 1G") - - nodes[0].slots = [(0, 8191)] - nodes[1].slots = [(8192, 16383)] - - await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes]) - - for node in nodes: - await node.client.execute_command("DFLY", "LOAD", f"{dbfilename}.rdb") - - assert await seeder.compare(capture, nodes[0].instance.port) - - # prevent saving during shutdown - for node in nodes: - await node.client.execute_command("flushall") 
From 0f5eb33e6a609f4d1feb0b2d70033a58b0e25da6 Mon Sep 17 00:00:00 2001 From: adiholden Date: Sun, 5 Jan 2025 13:19:13 +0200 Subject: [PATCH 16/20] chore(pytest): add timeout per test (#4404) regression action add 5 minutes timeout per test Signed-off-by: adi_holden --- .github/actions/regression-tests/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/regression-tests/action.yml b/.github/actions/regression-tests/action.yml index 01fddc2ba602..6dae421c7e29 100644 --- a/.github/actions/regression-tests/action.yml +++ b/.github/actions/regression-tests/action.yml @@ -46,7 +46,7 @@ runs: export DRAGONFLY_PATH="${GITHUB_WORKSPACE}/${{inputs.build-folder-name}}/${{inputs.dfly-executable}}" export UBSAN_OPTIONS=print_stacktrace=1:halt_on_error=1 # to crash on errors - timeout 50m pytest -m "${{inputs.filter}}" --durations=10 --color=yes --json-report --json-report-file=report.json dragonfly --log-cli-level=INFO || code=$? + timeout 50m pytest -m "${{inputs.filter}}" --durations=10 --timeout=300 --color=yes --json-report --json-report-file=report.json dragonfly --log-cli-level=INFO || code=$? # timeout returns 124 if we exceeded the timeout duration if [[ $code -eq 124 ]]; then From f663f8e841edb5d046de69c32342a0260ac29565 Mon Sep 17 00:00:00 2001 From: Borys Date: Sun, 5 Jan 2025 14:16:02 +0200 Subject: [PATCH 17/20] fix: provide resp3 option to CapturingReplyBuilder (#4400) --- src/facade/reply_builder.cc | 4 ++-- src/facade/reply_builder.h | 14 +++++++---- src/facade/reply_builder_test.cc | 36 ++++++++++++++-------------- src/facade/reply_capture.h | 3 ++- src/server/multi_command_squasher.cc | 17 +++++++------ src/server/multi_command_squasher.h | 3 ++- src/server/server_family.cc | 4 ++-- 7 files changed, 46 insertions(+), 35 deletions(-) diff --git a/src/facade/reply_builder.cc b/src/facade/reply_builder.cc index 8d372c3c4244..ba51e49c1f75 100644 --- a/src/facade/reply_builder.cc +++ b/src/facade/reply_builder.cc @@ -294,7 +294,7 @@ void MCReplyBuilder::SendRaw(std::string_view str) { void RedisReplyBuilderBase::SendNull() { ReplyScope scope(this); - resp3_ ? WritePieces(kNullStringR3) : WritePieces(kNullStringR2); + IsResp3() ? WritePieces(kNullStringR3) : WritePieces(kNullStringR2); } void RedisReplyBuilderBase::SendSimpleString(std::string_view str) { @@ -327,7 +327,7 @@ void RedisReplyBuilderBase::SendDouble(double val) { static_assert(ABSL_ARRAYSIZE(buf) < kMaxInlineSize, "Write temporary string from buf inline"); string_view val_str = FormatDouble(val, buf, ABSL_ARRAYSIZE(buf)); - if (!resp3_) + if (!IsResp3()) return SendBulkString(val_str); ReplyScope scope(this); diff --git a/src/facade/reply_builder.h b/src/facade/reply_builder.h index 8b6a5a1a3926..ec3ce6580e09 100644 --- a/src/facade/reply_builder.h +++ b/src/facade/reply_builder.h @@ -22,6 +22,8 @@ enum class ReplyMode { FULL // All replies are recorded }; +enum class RespVersion { kResp2, kResp3 }; + // Base class for all reply builders. Offer a simple high level interface for controlling output // modes and sending basic response types. 
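// (The RespVersion enum above replaces the former `bool resp3_` flag; see RedisReplyBuilderBase below.)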
class SinkReplyBuilder { @@ -259,15 +261,19 @@ class RedisReplyBuilderBase : public SinkReplyBuilder { static std::string SerializeCommand(std::string_view command); bool IsResp3() const { - return resp3_; + return resp_ == RespVersion::kResp3; + } + + void SetRespVersion(RespVersion resp_version) { + resp_ = resp_version; } - void SetResp3(bool resp3) { - resp3_ = resp3; + RespVersion GetRespVersion() { + return resp_; } private: - bool resp3_ = false; + RespVersion resp_ = RespVersion::kResp2; }; // Non essential redis reply builder functions implemented on top of the base resp protocol diff --git a/src/facade/reply_builder_test.cc b/src/facade/reply_builder_test.cc index 8560bcde5ee0..f1a3c5b49b25 100644 --- a/src/facade/reply_builder_test.cc +++ b/src/facade/reply_builder_test.cc @@ -699,14 +699,14 @@ TEST_F(RedisReplyBuilderTest, BatchMode) { } TEST_F(RedisReplyBuilderTest, Resp3Double) { - builder_->SetResp3(true); + builder_->SetRespVersion(RespVersion::kResp3); builder_->SendDouble(5.5); ASSERT_TRUE(NoErrors()); ASSERT_EQ(str(), ",5.5\r\n"); } TEST_F(RedisReplyBuilderTest, Resp3NullString) { - builder_->SetResp3(true); + builder_->SetRespVersion(RespVersion::kResp3); builder_->SendNull(); ASSERT_TRUE(NoErrors()); ASSERT_EQ(TakePayload(), "_\r\n"); @@ -715,13 +715,13 @@ TEST_F(RedisReplyBuilderTest, Resp3NullString) { TEST_F(RedisReplyBuilderTest, SendStringArrayAsMap) { const std::vector map_array{"k1", "v1", "k2", "v2"}; - builder_->SetResp3(false); + builder_->SetRespVersion(RespVersion::kResp2); builder_->SendBulkStrArr(map_array, builder_->MAP); ASSERT_TRUE(NoErrors()); ASSERT_EQ(TakePayload(), "*4\r\n$2\r\nk1\r\n$2\r\nv1\r\n$2\r\nk2\r\n$2\r\nv2\r\n") << "SendStringArrayAsMap Resp2 Failed."; - builder_->SetResp3(true); + builder_->SetRespVersion(RespVersion::kResp3); builder_->SendBulkStrArr(map_array, builder_->MAP); ASSERT_TRUE(NoErrors()); ASSERT_EQ(TakePayload(), "%2\r\n$2\r\nk1\r\n$2\r\nv1\r\n$2\r\nk2\r\n$2\r\nv2\r\n") @@ -731,13 +731,13 @@ TEST_F(RedisReplyBuilderTest, SendStringArrayAsMap) { TEST_F(RedisReplyBuilderTest, SendStringArrayAsSet) { const std::vector set_array{"e1", "e2", "e3"}; - builder_->SetResp3(false); + builder_->SetRespVersion(RespVersion::kResp2); builder_->SendBulkStrArr(set_array, builder_->SET); ASSERT_TRUE(NoErrors()); ASSERT_EQ(TakePayload(), "*3\r\n$2\r\ne1\r\n$2\r\ne2\r\n$2\r\ne3\r\n") << "SendStringArrayAsSet Resp2 Failed."; - builder_->SetResp3(true); + builder_->SetRespVersion(RespVersion::kResp3); builder_->SendBulkStrArr(set_array, builder_->SET); ASSERT_TRUE(NoErrors()); ASSERT_EQ(TakePayload(), "~3\r\n$2\r\ne1\r\n$2\r\ne2\r\n$2\r\ne3\r\n") @@ -748,26 +748,26 @@ TEST_F(RedisReplyBuilderTest, SendScoredArray) { const std::vector> scored_array{ {"e1", 1.1}, {"e2", 2.2}, {"e3", 3.3}}; - builder_->SetResp3(false); + builder_->SetRespVersion(RespVersion::kResp2); builder_->SendScoredArray(scored_array, false); ASSERT_TRUE(NoErrors()); ASSERT_EQ(TakePayload(), "*3\r\n$2\r\ne1\r\n$2\r\ne2\r\n$2\r\ne3\r\n") << "Resp2 WITHOUT scores failed."; - builder_->SetResp3(true); + builder_->SetRespVersion(RespVersion::kResp3); builder_->SendScoredArray(scored_array, false); ASSERT_TRUE(NoErrors()); ASSERT_EQ(TakePayload(), "*3\r\n$2\r\ne1\r\n$2\r\ne2\r\n$2\r\ne3\r\n") << "Resp3 WITHOUT scores failed."; - builder_->SetResp3(false); + builder_->SetRespVersion(RespVersion::kResp2); builder_->SendScoredArray(scored_array, true); ASSERT_TRUE(NoErrors()); ASSERT_EQ(TakePayload(), 
"*6\r\n$2\r\ne1\r\n$3\r\n1.1\r\n$2\r\ne2\r\n$3\r\n2.2\r\n$2\r\ne3\r\n$3\r\n3.3\r\n") << "Resp3 WITHSCORES failed."; - builder_->SetResp3(true); + builder_->SetRespVersion(RespVersion::kResp3); builder_->SendScoredArray(scored_array, true); ASSERT_TRUE(NoErrors()); ASSERT_EQ(TakePayload(), @@ -779,7 +779,7 @@ TEST_F(RedisReplyBuilderTest, SendLabeledScoredArray) { const std::vector> scored_array{ {"e1", 1.1}, {"e2", 2.2}, {"e3", 3.3}}; - builder_->SetResp3(false); + builder_->SetRespVersion(RespVersion::kResp2); builder_->SendLabeledScoredArray("foobar", scored_array); ASSERT_TRUE(NoErrors()); ASSERT_EQ(TakePayload(), @@ -787,7 +787,7 @@ TEST_F(RedisReplyBuilderTest, SendLabeledScoredArray) { "2\r\n*2\r\n$2\r\ne3\r\n$3\r\n3.3\r\n") << "Resp3 failed.\n"; - builder_->SetResp3(true); + builder_->SetRespVersion(RespVersion::kResp3); builder_->SendLabeledScoredArray("foobar", scored_array); ASSERT_TRUE(NoErrors()); ASSERT_EQ(TakePayload(), @@ -850,8 +850,8 @@ TEST_F(RedisReplyBuilderTest, BasicCapture) { big_arr_cb, }; - crb.SetResp3(true); - builder_->SetResp3(true); + crb.SetRespVersion(RespVersion::kResp3); + builder_->SetRespVersion(RespVersion::kResp3); // Run generator functions on both a regular redis builder // and the capturing builder with its capture applied. @@ -864,7 +864,7 @@ TEST_F(RedisReplyBuilderTest, BasicCapture) { EXPECT_EQ(expected, actual); } - builder_->SetResp3(false); + builder_->SetRespVersion(RespVersion::kResp2); } TEST_F(RedisReplyBuilderTest, FormatDouble) { @@ -889,17 +889,17 @@ TEST_F(RedisReplyBuilderTest, VerbatimString) { // test resp3 std::string str = "A simple string!"; - builder_->SetResp3(true); + builder_->SetRespVersion(RespVersion::kResp3); builder_->SendVerbatimString(str, RedisReplyBuilder::VerbatimFormat::TXT); ASSERT_TRUE(NoErrors()); ASSERT_EQ(TakePayload(), "=20\r\ntxt:A simple string!\r\n") << "Resp3 VerbatimString TXT failed."; - builder_->SetResp3(true); + builder_->SetRespVersion(RespVersion::kResp3); builder_->SendVerbatimString(str, RedisReplyBuilder::VerbatimFormat::MARKDOWN); ASSERT_TRUE(NoErrors()); ASSERT_EQ(TakePayload(), "=20\r\nmkd:A simple string!\r\n") << "Resp3 VerbatimString TXT failed."; - builder_->SetResp3(false); + builder_->SetRespVersion(RespVersion::kResp2); builder_->SendVerbatimString(str); ASSERT_TRUE(NoErrors()); ASSERT_EQ(TakePayload(), "$16\r\nA simple string!\r\n") << "Resp3 VerbatimString TXT failed."; diff --git a/src/facade/reply_capture.h b/src/facade/reply_capture.h index 1ffa4fd7cfce..7712992350e1 100644 --- a/src/facade/reply_capture.h +++ b/src/facade/reply_capture.h @@ -43,8 +43,9 @@ class CapturingReplyBuilder : public RedisReplyBuilder { struct SimpleString : public std::string {}; // SendSimpleString struct BulkString : public std::string {}; // SendBulkString - CapturingReplyBuilder(ReplyMode mode = ReplyMode::FULL) + CapturingReplyBuilder(ReplyMode mode = ReplyMode::FULL, RespVersion resp_v = RespVersion::kResp2) : RedisReplyBuilder{nullptr}, reply_mode_{mode}, stack_{}, current_{} { + SetRespVersion(resp_v); } using Payload = std::variantshard_id()]; DCHECK(!sinfo.cmds.empty()); auto* local_tx = sinfo.local_tx.get(); - facade::CapturingReplyBuilder crb; + facade::CapturingReplyBuilder crb(ReplyMode::FULL, resp_v); ConnectionContext local_cntx{cntx_, local_tx}; if (cntx_->conn()) { local_cntx.skip_acl_validation = cntx_->conn()->IsPrivileged(); @@ -244,14 +245,15 @@ bool MultiCommandSquasher::ExecuteSquashed(facade::RedisReplyBuilder* rb) { cntx_->cid = base_cid_; auto cb = [this](ShardId sid) { return 
!sharded_[sid].cmds.empty(); }; tx->PrepareSquashedMultiHop(base_cid_, cb); - tx->ScheduleSingleHop([this](auto* tx, auto* es) { return SquashedHopCb(tx, es); }); + tx->ScheduleSingleHop( + [this, rb](auto* tx, auto* es) { return SquashedHopCb(tx, es, rb->GetRespVersion()); }); } else { #if 1 fb2::BlockingCounter bc(num_shards); DVLOG(1) << "Squashing " << num_shards << " " << tx->DebugId(); - auto cb = [this, tx, bc]() mutable { - this->SquashedHopCb(tx, EngineShard::tlocal()); + auto cb = [this, tx, bc, rb]() mutable { + this->SquashedHopCb(tx, EngineShard::tlocal(), rb->GetRespVersion()); bc->Dec(); }; @@ -261,8 +263,9 @@ bool MultiCommandSquasher::ExecuteSquashed(facade::RedisReplyBuilder* rb) { } bc->Wait(); #else - shard_set->RunBlockingInParallel([this, tx](auto* es) { SquashedHopCb(tx, es); }, - [this](auto sid) { return !sharded_[sid].cmds.empty(); }); + shard_set->RunBlockingInParallel( + [this, tx, rb](auto* es) { SquashedHopCb(tx, es, rb->GetRespVersion()); }, + [this](auto sid) { return !sharded_[sid].cmds.empty(); }); #endif } diff --git a/src/server/multi_command_squasher.h b/src/server/multi_command_squasher.h index ccea7f7a2c71..e269cc52256e 100644 --- a/src/server/multi_command_squasher.h +++ b/src/server/multi_command_squasher.h @@ -61,7 +61,8 @@ class MultiCommandSquasher { bool ExecuteStandalone(facade::RedisReplyBuilder* rb, StoredCmd* cmd); // Callback that runs on shards during squashed hop. - facade::OpStatus SquashedHopCb(Transaction* parent_tx, EngineShard* es); + facade::OpStatus SquashedHopCb(Transaction* parent_tx, EngineShard* es, + facade::RespVersion resp_v); // Execute all currently squashed commands. Return false if aborting on error. bool ExecuteSquashed(facade::RedisReplyBuilder* rb); diff --git a/src/server/server_family.cc b/src/server/server_family.cc index 83c06f5a532a..9efcb2b73874 100644 --- a/src/server/server_family.cc +++ b/src/server/server_family.cc @@ -2696,10 +2696,10 @@ void ServerFamily::Hello(CmdArgList args, const CommandContext& cmd_cntx) { int proto_version = 2; if (is_resp3) { proto_version = 3; - rb->SetResp3(true); + rb->SetRespVersion(RespVersion::kResp3); } else { // Issuing hello 2 again is valid and should switch back to RESP2 - rb->SetResp3(false); + rb->SetRespVersion(RespVersion::kResp2); } SinkReplyBuilder::ReplyAggregator agg(rb); From ff4add0c9ec4daf43ce30c0bbf3e426c967f432f Mon Sep 17 00:00:00 2001 From: Roman Gershman Date: Sun, 5 Jan 2025 15:31:59 +0200 Subject: [PATCH 18/20] chore: deprecate unneeded runtime flags (#4405) Deprecate multi_exec_mode and track_exec_frequencies. Remove duplicated call to EnterLameDuck (done also in Service::Shutdown()). 
Signed-off-by: Roman Gershman --- .github/workflows/ci.yml | 2 - src/server/main_service.cc | 15 +++----- src/server/multi_test.cc | 73 +------------------------------------ src/server/server_family.cc | 3 -- 4 files changed, 7 insertions(+), 86 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4ea1a3552dbb..5fb1a3f4fc41 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -159,8 +159,6 @@ jobs: FLAGS_cluster_mode=emulated FLAGS_lock_on_hashtags=true timeout 20m ctest -V -L DFLY timeout 5m ./dragonfly_test - timeout 5m ./multi_test --multi_exec_mode=1 - timeout 5m ./multi_test --multi_exec_mode=3 timeout 5m ./json_family_test --jsonpathv2=false timeout 5m ./tiered_storage_test --vmodule=db_slice=2 --logtostderr - name: Upload unit logs on failure diff --git a/src/server/main_service.cc b/src/server/main_service.cc index df22f6b8ccd3..a33a86739af9 100644 --- a/src/server/main_service.cc +++ b/src/server/main_service.cc @@ -76,13 +76,13 @@ ABSL_FLAG(uint32_t, memcached_port, 0, "Memcached port"); ABSL_FLAG(uint32_t, num_shards, 0, "Number of database shards, 0 - to choose automatically"); -ABSL_FLAG(uint32_t, multi_exec_mode, 2, - "Set multi exec atomicity mode: 1 for global, 2 for locking ahead, 3 for non atomic"); +ABSL_RETIRED_FLAG(uint32_t, multi_exec_mode, 2, "DEPRECATED. Sets multi exec atomicity mode"); ABSL_FLAG(bool, multi_exec_squash, true, "Whether multi exec will squash single shard commands to optimize performance"); -ABSL_FLAG(bool, track_exec_frequencies, true, "Whether to track exec frequencies for multi exec"); +ABSL_RETIRED_FLAG(bool, track_exec_frequencies, true, + "DEPRECATED. Whether to track exec frequencies for multi exec"); ABSL_FLAG(bool, lua_resp2_legacy_float, false, "Return rounded down integers instead of floats for lua scripts with RESP2"); ABSL_FLAG(uint32_t, multi_eval_squash_buffer, 4096, "Max buffer for squashed commands per script"); @@ -651,8 +651,7 @@ Transaction::MultiMode DeduceExecMode(ExecScriptUse state, const ScriptMgr& script_mgr) { // Check if script most LIKELY has global eval transactions bool contains_global = false; - Transaction::MultiMode multi_mode = - static_cast(absl::GetFlag(FLAGS_multi_exec_mode)); + Transaction::MultiMode multi_mode = Transaction::LOCK_AHEAD; if (state == ExecScriptUse::SCRIPT_RUN) { contains_global = script_mgr.AreGlobalByDefault(); @@ -2207,10 +2206,8 @@ void Service::Exec(CmdArgList args, const CommandContext& cmd_cntx) { rb->StartArray(exec_info.body.size()); if (!exec_info.body.empty()) { - if (GetFlag(FLAGS_track_exec_frequencies)) { - string descr = CreateExecDescriptor(exec_info.body, cmd_cntx.tx->GetUniqueShardCnt()); - ServerState::tlocal()->exec_freq_count[descr]++; - } + string descr = CreateExecDescriptor(exec_info.body, cmd_cntx.tx->GetUniqueShardCnt()); + ServerState::tlocal()->exec_freq_count[descr]++; if (absl::GetFlag(FLAGS_multi_exec_squash) && state != ExecScriptUse::SCRIPT_RUN && !cntx->conn_state.tracking_info_.IsTrackingOn()) { diff --git a/src/server/multi_test.cc b/src/server/multi_test.cc index 7270459000f8..24be97d9eb3a 100644 --- a/src/server/multi_test.cc +++ b/src/server/multi_test.cc @@ -16,7 +16,6 @@ #include "server/test_utils.h" #include "server/transaction.h" -ABSL_DECLARE_FLAG(uint32_t, multi_exec_mode); ABSL_DECLARE_FLAG(bool, multi_exec_squash); ABSL_DECLARE_FLAG(bool, lua_auto_async); ABSL_DECLARE_FLAG(bool, lua_allow_undeclared_auto_correct); @@ -194,11 +193,6 @@ TEST_F(MultiTest, MultiSeq) { } TEST_F(MultiTest, 
MultiConsistent) { - if (auto mode = absl::GetFlag(FLAGS_multi_exec_mode); mode == Transaction::NON_ATOMIC) { - GTEST_SKIP() << "Skipped MultiConsistent test because multi_exec_mode is non atomic"; - return; - } - Run({"mset", kKey1, "base", kKey4, "base"}); auto mset_fb = pp_->at(0)->LaunchFiber([&] { @@ -244,11 +238,6 @@ TEST_F(MultiTest, MultiConsistent) { } TEST_F(MultiTest, MultiConsistent2) { - if (auto mode = absl::GetFlag(FLAGS_multi_exec_mode); mode == Transaction::NON_ATOMIC) { - GTEST_SKIP() << "Skipped MultiConsistent2 test because multi_exec_mode is non atomic"; - return; - } - const int kKeyCount = 50; const int kRuns = 50; const int kJobs = 20; @@ -589,11 +578,7 @@ TEST_F(MultiTest, MultiOOO) { auto metrics = GetMetrics(); // OOO works in LOCK_AHEAD mode. - int mode = absl::GetFlag(FLAGS_multi_exec_mode); - if (mode == Transaction::LOCK_AHEAD || mode == Transaction::NON_ATOMIC) - EXPECT_EQ(200, metrics.shard_stats.tx_ooo_total); - else - EXPECT_EQ(0, metrics.shard_stats.tx_ooo_total); + EXPECT_EQ(200, metrics.shard_stats.tx_ooo_total); } // Lua scripts lock their keys ahead and thus can run out of order. @@ -693,26 +678,11 @@ TEST_F(MultiTest, MultiCauseUnblocking) { } TEST_F(MultiTest, ExecGlobalFallback) { - absl::FlagSaver fs; - // Check global command MOVE falls back to global mode from lock ahead. - absl::SetFlag(&FLAGS_multi_exec_mode, Transaction::LOCK_AHEAD); Run({"multi"}); Run({"set", "a", "1"}); // won't run ooo, because it became part of global Run({"move", "a", "1"}); Run({"exec"}); EXPECT_EQ(1, GetMetrics().coordinator_stats.tx_global_cnt); - - ClearMetrics(); - - // Check non atomic mode does not fall back to global. - absl::SetFlag(&FLAGS_multi_exec_mode, Transaction::NON_ATOMIC); - Run({"multi"}); - Run({"set", "a", "1"}); // will run ooo(quick) - Run({"move", "a", "1"}); - Run({"exec"}); - - auto stats = GetMetrics().coordinator_stats; - EXPECT_EQ(1, stats.tx_normal_cnt); // move is global } #ifndef SANITIZERS @@ -774,11 +744,6 @@ TEST_F(MultiTest, ScriptFlagsEmbedded) { // Flaky because of https://github.com/google/sanitizers/issues/1760 #ifndef SANITIZERS TEST_F(MultiTest, UndeclaredKeyFlag) { - if (auto mode = absl::GetFlag(FLAGS_multi_exec_mode); mode != Transaction::LOCK_AHEAD) { - GTEST_SKIP() << "Skipped test because multi_exec_mode is not default"; - return; - } - absl::FlagSaver fs; // lua_undeclared_keys_shas changed via CONFIG cmd below const char* script = "return redis.call('GET', 'random-key');"; @@ -829,11 +794,6 @@ TEST_F(MultiTest, ScriptBadCommand) { #endif TEST_F(MultiTest, MultiEvalModeConflict) { - if (auto mode = absl::GetFlag(FLAGS_multi_exec_mode); mode == Transaction::GLOBAL) { - GTEST_SKIP() << "Skipped MultiEvalModeConflict test because multi_exec_mode is global"; - return; - } - const char* s1 = R"( --!df flags=allow-undeclared-keys return redis.call('GET', 'random-key'); @@ -851,11 +811,6 @@ TEST_F(MultiTest, MultiEvalModeConflict) { // Run multi-exec transactions that move values from a source list // to destination list through two contended channels. 
TEST_F(MultiTest, ContendedList) { - if (auto mode = absl::GetFlag(FLAGS_multi_exec_mode); mode == Transaction::NON_ATOMIC) { - GTEST_SKIP() << "Skipped ContendedList test because multi_exec_mode is non atomic"; - return; - } - constexpr int listSize = 50; constexpr int stepSize = 5; @@ -896,7 +851,6 @@ TEST_F(MultiTest, ContendedList) { TEST_F(MultiTest, TestSquashing) { absl::FlagSaver fs; absl::SetFlag(&FLAGS_multi_exec_squash, true); - absl::SetFlag(&FLAGS_multi_exec_mode, Transaction::LOCK_AHEAD); const char* keys[] = {kKeySid0, kKeySid1, kKeySid2}; @@ -930,11 +884,6 @@ TEST_F(MultiTest, TestSquashing) { } TEST_F(MultiTest, MultiLeavesTxQueue) { - if (auto mode = absl::GetFlag(FLAGS_multi_exec_mode); mode == Transaction::NON_ATOMIC) { - GTEST_SKIP() << "Skipped MultiLeavesTxQueue test because multi_exec_mode is non atomic"; - return; - } - // Tests the scenario, where the OOO multi-tx is scheduled into tx queue and there is another // tx (mget) after it that runs and tests for atomicity. absl::FlagSaver fs; @@ -1010,10 +959,6 @@ TEST_F(MultiTest, MultiLeavesTxQueue) { } TEST_F(MultiTest, TestLockedKeys) { - if (auto mode = absl::GetFlag(FLAGS_multi_exec_mode); mode != Transaction::LOCK_AHEAD) { - GTEST_SKIP() << "Skipped TestLockedKeys test because multi_exec_mode is not lock ahead"; - return; - } auto condition = [&]() { return IsLocked(0, "key1") && IsLocked(0, "key2"); }; auto fb = ExpectConditionWithSuspension(condition); @@ -1034,8 +979,6 @@ TEST_F(MultiTest, EvalExpiration) { return; } - absl::FlagSaver fs; - absl::SetFlag(&FLAGS_multi_exec_mode, Transaction::LOCK_AHEAD); Run({"eval", "redis.call('set', 'x', 0, 'ex', 5, 'nx')", "1", "x"}); EXPECT_LE(CheckedInt({"pttl", "x"}), 5000); } @@ -1064,11 +1007,6 @@ class MultiEvalTest : public BaseFamilyTest { }; TEST_F(MultiEvalTest, MultiAllEval) { - if (auto mode = absl::GetFlag(FLAGS_multi_exec_mode); mode == Transaction::NON_ATOMIC) { - GTEST_SKIP() << "Skipped MultiAllEval test because multi_exec_mode is non atomic"; - return; - } - RespExpr brpop_resp; // Run the fiber at creation. @@ -1087,10 +1025,6 @@ TEST_F(MultiEvalTest, MultiAllEval) { } TEST_F(MultiEvalTest, MultiSomeEval) { - if (auto mode = absl::GetFlag(FLAGS_multi_exec_mode); mode == Transaction::NON_ATOMIC) { - GTEST_SKIP() << "Skipped MultiAllEval test because multi_exec_mode is non atomic"; - return; - } RespExpr brpop_resp; // Run the fiber at creation. 
@@ -1131,11 +1065,6 @@ TEST_F(MultiEvalTest, ScriptSquashingUknownCmd) { #endif TEST_F(MultiEvalTest, MultiAndEval) { - if (auto mode = absl::GetFlag(FLAGS_multi_exec_mode); mode == Transaction::NON_ATOMIC) { - GTEST_SKIP() << "Skipped MultiAndEval test because multi_exec_mode is non atomic"; - return; - } - // We had a bug in borrowing interpreters which caused a crash in this scenario Run({"multi"}); Run({"eval", "return redis.call('set', 'x', 'y1')", "1", "x"}); diff --git a/src/server/server_family.cc b/src/server/server_family.cc index 9efcb2b73874..dea8d5d297fd 100644 --- a/src/server/server_family.cc +++ b/src/server/server_family.cc @@ -3087,9 +3087,6 @@ void ServerFamily::ShutdownCmd(CmdArgList args, const CommandContext& cmd_cntx) } } - service_.proactor_pool().AwaitFiberOnAll( - [](ProactorBase* pb) { ServerState::tlocal()->EnterLameDuck(); }); - CHECK_NOTNULL(acceptor_)->Stop(); cmd_cntx.rb->SendOk(); } From 7860a169d9bcb464797d0b7e741aa894b9ccfb9f Mon Sep 17 00:00:00 2001 From: Shahar Mike Date: Sun, 5 Jan 2025 16:28:45 +0200 Subject: [PATCH 19/20] feat: Yield inside huge values migration serialization (#4197) * feat: Yield inside huge values migration serialization With #4144 we break huge values slot migration into multiple commands. This PR now adds yields between those commands. It also adds a test that checks that modifying huge values while doing a migration works well, and that RSS doesn't grow too much. Fixes #4100 --- src/server/journal/streamer.cc | 16 ++++-- src/server/journal/streamer.h | 1 + tests/dragonfly/cluster_test.py | 81 ++++++++++++++++++++++++++++-- tests/dragonfly/requirements.txt | 1 + tests/dragonfly/seeder/__init__.py | 10 ++-- tests/dragonfly/seeder_test.py | 21 ++++++++ tests/dragonfly/utility.py | 34 ++++++++++--- 7 files changed, 148 insertions(+), 16 deletions(-) diff --git a/src/server/journal/streamer.cc b/src/server/journal/streamer.cc index 91480a181341..c2df86b39464 100644 --- a/src/server/journal/streamer.cc +++ b/src/server/journal/streamer.cc @@ -218,6 +218,12 @@ void RestoreStreamer::Run() { if (fiber_cancelled_) // Could have been cancelled in above call too return; + std::lock_guard guard(big_value_mu_); + + // Locking this never preempts. See snapshot.cc for why we need it. 
+ auto* blocking_counter = db_slice_->BlockingCounter(); + std::lock_guard blocking_counter_guard(*blocking_counter); + WriteBucket(it); }); @@ -281,7 +287,6 @@ bool RestoreStreamer::ShouldWrite(cluster::SlotId slot_id) const { void RestoreStreamer::WriteBucket(PrimeTable::bucket_iterator it) { if (it.GetVersion() < snapshot_version_) { - FiberAtomicGuard fg; it.SetVersion(snapshot_version_); string key_buffer; // we can reuse it for (; !it.is_done(); ++it) { @@ -302,6 +307,7 @@ void RestoreStreamer::WriteBucket(PrimeTable::bucket_iterator it) { } void RestoreStreamer::OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req) { + std::lock_guard guard(big_value_mu_); DCHECK_EQ(db_index, 0) << "Restore migration only allowed in cluster mode in db0"; PrimeTable* table = db_slice_->GetTables(0).first; @@ -319,8 +325,12 @@ void RestoreStreamer::OnDbChange(DbIndex db_index, const DbSlice::ChangeReq& req void RestoreStreamer::WriteEntry(string_view key, const PrimeValue& pk, const PrimeValue& pv, uint64_t expire_ms) { - CmdSerializer serializer([&](std::string s) { Write(std::move(s)); }, - ServerState::tlocal()->serialization_max_chunk_size); + CmdSerializer serializer( + [&](std::string s) { + Write(std::move(s)); + ThrottleIfNeeded(); + }, + ServerState::tlocal()->serialization_max_chunk_size); serializer.SerializeEntry(key, pk, pv, expire_ms); } diff --git a/src/server/journal/streamer.h b/src/server/journal/streamer.h index a18615a053f1..907b6e65eee2 100644 --- a/src/server/journal/streamer.h +++ b/src/server/journal/streamer.h @@ -112,6 +112,7 @@ class RestoreStreamer : public JournalStreamer { cluster::SlotSet my_slots_; bool fiber_cancelled_ = false; bool snapshot_finished_ = false; + ThreadLocalMutex big_value_mu_; }; } // namespace dfly diff --git a/tests/dragonfly/cluster_test.py b/tests/dragonfly/cluster_test.py index b90d486650a2..a7262aa5dfe6 100644 --- a/tests/dragonfly/cluster_test.py +++ b/tests/dragonfly/cluster_test.py @@ -14,8 +14,7 @@ from redis.cluster import RedisCluster from redis.cluster import ClusterNode from .proxy import Proxy -from .seeder import SeederBase -from .seeder import StaticSeeder +from .seeder import Seeder, SeederBase, StaticSeeder from . 
import dfly_args @@ -33,6 +32,11 @@ def monotonically_increasing_port_number(): next_port = monotonically_increasing_port_number() +async def get_memory(client, field): + info = await client.info("memory") + return info[field] + + class RedisClusterNode: def __init__(self, port): self.port = port @@ -1981,6 +1985,7 @@ async def node1size0(): @dfly_args({"proactor_threads": 2, "cluster_mode": "yes"}) @pytest.mark.asyncio +@pytest.mark.opt_only async def test_cluster_migration_huge_container(df_factory: DflyInstanceFactory): instances = [ df_factory.create(port=next(next_port), admin_port=next(next_port)) for i in range(2) @@ -1995,7 +2000,7 @@ async def test_cluster_migration_huge_container(df_factory: DflyInstanceFactory) logging.debug("Generating huge containers") seeder = StaticSeeder( - key_target=10, + key_target=100, data_size=10_000_000, collection_size=10_000, variance=1, @@ -2005,6 +2010,8 @@ async def test_cluster_migration_huge_container(df_factory: DflyInstanceFactory) await seeder.run(nodes[0].client) source_data = await StaticSeeder.capture(nodes[0].client) + mem_before = await get_memory(nodes[0].client, "used_memory_rss") + nodes[0].migrations = [ MigrationInfo("127.0.0.1", instances[1].admin_port, [(0, 16383)], nodes[1].id) ] @@ -2017,6 +2024,74 @@ async def test_cluster_migration_huge_container(df_factory: DflyInstanceFactory) target_data = await StaticSeeder.capture(nodes[1].client) assert source_data == target_data + # Get peak memory, because migration removes the data + mem_after = await get_memory(nodes[0].client, "used_memory_peak_rss") + logging.debug(f"Memory before {mem_before} after {mem_after}") + assert mem_after < mem_before * 1.1 + + +@dfly_args({"proactor_threads": 2, "cluster_mode": "yes"}) +@pytest.mark.parametrize("chunk_size", [1_000_000, 30]) +@pytest.mark.asyncio +async def test_cluster_migration_while_seeding( + df_factory: DflyInstanceFactory, df_seeder_factory: DflySeederFactory, chunk_size +): + instances = [ + df_factory.create( + port=next(next_port), + admin_port=next(next_port), + serialization_max_chunk_size=chunk_size, + ) + for _ in range(2) + ] + df_factory.start_all(instances) + + nodes = [await create_node_info(instance) for instance in instances] + nodes[0].slots = [(0, 16383)] + nodes[1].slots = [] + client0 = nodes[0].client + client1 = nodes[1].client + + await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes]) + + logging.debug("Seeding cluster") + seeder = df_seeder_factory.create( + keys=10_000, port=instances[0].port, cluster_mode=True, mirror_to_fake_redis=True + ) + await seeder.run(target_deviation=0.1) + + seed = asyncio.create_task(seeder.run()) + await asyncio.sleep(1) + + nodes[0].migrations = [ + MigrationInfo("127.0.0.1", instances[1].admin_port, [(0, 16383)], nodes[1].id) + ] + logging.debug("Migrating slots") + await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes]) + + logging.debug("Waiting for migration to finish") + await wait_for_status(nodes[0].admin_client, nodes[1].id, "FINISHED", timeout=300) + logging.debug("Migration finished") + + logging.debug("Finalizing migration") + nodes[0].slots = [] + nodes[1].slots = [(0, 16383)] + await push_config(json.dumps(generate_config(nodes)), [node.admin_client for node in nodes]) + + await asyncio.sleep(1) # Let seeder feed dest before migration finishes + + seeder.stop() + await seed + logging.debug("Seeding finished") + + assert ( + await get_memory(client0, "used_memory_peak_rss") + < await 
get_memory(client0, "used_memory_rss") * 1.1 + ) + + capture = await seeder.capture_fake_redis() + assert await seeder.compare(capture, instances[1].port) + def parse_lag(replication_info: str): lags = re.findall("lag=([0-9]+)\r\n", replication_info) diff --git a/tests/dragonfly/requirements.txt b/tests/dragonfly/requirements.txt index cfbbd8262986..25fb8f69e251 100644 --- a/tests/dragonfly/requirements.txt +++ b/tests/dragonfly/requirements.txt @@ -25,3 +25,4 @@ pytest-emoji==0.2.0 pytest-icdiff==0.8 pytest-timeout==2.2.0 asyncio==3.4.3 +fakeredis[json]==2.26.2 diff --git a/tests/dragonfly/seeder/__init__.py b/tests/dragonfly/seeder/__init__.py index 4154e4d1cf93..351fc38a8458 100644 --- a/tests/dragonfly/seeder/__init__.py +++ b/tests/dragonfly/seeder/__init__.py @@ -177,14 +177,16 @@ async def run(self, client: aioredis.Redis, target_ops=None, target_deviation=No ] sha = await client.script_load(Seeder._load_script("generate")) - await asyncio.gather( - *(self._run_unit(client, sha, unit, using_stopkey, args) for unit in self.units) - ) + for unit in self.units: + # Must be serial, otherwise cluster clients throw an exception + await self._run_unit(client, sha, unit, using_stopkey, args) async def stop(self, client: aioredis.Redis): """Request the seeder to stop if it's running without a target; the future returned from start() must still be awaited""" - await asyncio.gather(*(client.set(unit.stop_key, "X") for unit in self.units)) + for unit in self.units: + # Must be serial, otherwise cluster clients throw an exception + await client.set(unit.stop_key, "X") def change_key_target(self, target: int): """Change key target, applied only on succeeding runs""" diff --git a/tests/dragonfly/seeder_test.py b/tests/dragonfly/seeder_test.py index 61557270f747..3d35242da54e 100644 --- a/tests/dragonfly/seeder_test.py +++ b/tests/dragonfly/seeder_test.py @@ -4,6 +4,8 @@ from redis import asyncio as aioredis from . import dfly_args from .seeder import Seeder, StaticSeeder +from .instance import DflyInstanceFactory, DflyInstance +from .utility import * @@ -114,3 +116,22 @@ async def set_data(): # Do another change await async_client.spop("set1") assert capture != await Seeder.capture(async_client) + + +@pytest.mark.asyncio +@dfly_args({"proactor_threads": 2}) +async def test_seeder_fake_redis( + df_factory: DflyInstanceFactory, df_seeder_factory: DflySeederFactory +): + instance = df_factory.create() + df_factory.start_all([instance]) + + seeder = df_seeder_factory.create( + keys=100, port=instance.port, unsupported_types=[ValueType.JSON], mirror_to_fake_redis=True + ) + + await seeder.run(target_ops=5_000) + + capture = await seeder.capture_fake_redis() + + assert await seeder.compare(capture, instance.port) diff --git a/tests/dragonfly/utility.py b/tests/dragonfly/utility.py index 197d2a3d02c1..40f8ce1dd97e 100644 --- a/tests/dragonfly/utility.py +++ b/tests/dragonfly/utility.py @@ -14,6 +14,7 @@ import subprocess import pytest import os +import fakeredis from typing import Iterable, Union from enum import Enum @@ -271,7 +272,7 @@ def gen_shrink_cmd(self): ("LPUSH {k} {val}", ValueType.LIST), ("LPOP {k}", ValueType.LIST), ("SADD {k} {val}", ValueType.SET), - ("SPOP {k}", ValueType.SET), + # ("SPOP {k}", ValueType.SET), # Disabled because it is inconsistent ("HSETNX {k} v0 {val}", ValueType.HSET), ("HINCRBY {k} v1 1", ValueType.HSET), ("ZPOPMIN {k} 1", ValueType.ZSET), @@ -423,6 +424,7 @@ def __init__( unsupported_types=[], stop_on_failure=True, cluster_mode=False, + mirror_to_fake_redis=False, ): if cluster_mode: max_multikey = 1 @@ -436,11 +438,16 @@ def __init__( self.multi_transaction_probability = multi_transaction_probability self.stop_flag = False self.stop_on_failure = stop_on_failure + self.fake_redis = None self.log_file = log_file if self.log_file is not None: open(self.log_file, "w").close() + if mirror_to_fake_redis: + logging.debug("Creating FakeRedis instance") + self.fake_redis = fakeredis.FakeAsyncRedis() + async def run(self, target_ops=None, target_deviation=None): """ Run a seeding cycle on all dbs either until stop(), a fixed number of commands (target_ops) @@ -474,6 +481,14 @@ def reset(self): """Reset internal state. Needs to be called after flush or restart""" self.gen.reset() + async def capture_fake_redis(self): + keys = sorted(list(self.gen.keys_and_types())) + # TODO: support multiple databases + assert self.dbcount == 1 + assert self.fake_redis != None + capture = DataCapture(await self._capture_entries(self.fake_redis, keys)) + return [capture] + async def capture(self, port=None): """Create DataCapture for all dbs""" @@ -588,12 +603,19 @@ async def _executor_task(self, db, queue): queue.task_done() break - pipe = client.pipeline(transaction=tx_data[1]) - for cmd in tx_data[0]: - pipe.execute_command(*cmd) - try: - await pipe.execute() + if self.fake_redis is None: + pipe = client.pipeline(transaction=tx_data[1]) + for cmd in tx_data[0]: + pipe.execute_command(*cmd) + await pipe.execute() + else: + # To mirror to Fake Redis consistently, we must send it only successful + # commands. We can't use pipes because they might succeed partially. 
+ for cmd in tx_data[0]: + dfly_resp = await client.execute_command(*cmd) + fake_resp = await self.fake_redis.execute_command(*cmd) + assert dfly_resp == fake_resp except (redis.exceptions.ConnectionError, redis.exceptions.ResponseError) as e: if self.stop_on_failure: await self._close_client(client) From 1c0f22f5fac7da1668971dc639b04ddefa8df8e1 Mon Sep 17 00:00:00 2001 From: adiholden Date: Mon, 6 Jan 2025 08:39:32 +0200 Subject: [PATCH 20/20] chore(pytest): add check for rss growth in replication big values (#4406) Signed-off-by: adi_holden --- tests/dragonfly/replication_test.py | 33 +++++++++++++++++++++++------ tests/dragonfly/seeder/__init__.py | 1 + 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/tests/dragonfly/replication_test.py b/tests/dragonfly/replication_test.py index a2695e5b2e81..fef25ebb74f7 100644 --- a/tests/dragonfly/replication_test.py +++ b/tests/dragonfly/replication_test.py @@ -2662,11 +2662,12 @@ async def test_replication_timeout_on_full_sync_heartbeat_expiry( @pytest.mark.parametrize( "element_size, elements_number", - [(16, 20000), (20000, 16)], + [(16, 30000), (30000, 16)], ) +@dfly_args({"proactor_threads": 1}) async def test_big_containers(df_factory, element_size, elements_number): - master = df_factory.create(proactor_threads=4) - replica = df_factory.create(proactor_threads=4) + master = df_factory.create() + replica = df_factory.create() df_factory.start_all([master, replica]) c_master = master.client() @@ -2674,19 +2675,39 @@ async def test_big_containers(df_factory, element_size, elements_number): logging.debug("Fill master with test data") seeder = StaticSeeder( - key_target=10, + key_target=50, data_size=element_size * elements_number, collection_size=elements_number, variance=1, - samples=5, - types=["LIST", "SET", "ZSET", "HASH"], + samples=1, + types=["LIST", "SET", "ZSET", "HASH", "STREAM"], ) await seeder.run(c_master) + async def get_memory(client, field): + info = await client.info("memory") + return info[field] + + await asyncio.sleep(1) # wait for heartbeat to update rss memory + used_memory = await get_memory(c_master, "used_memory_rss") + logging.debug("Start replication and wait for full sync") await c_replica.execute_command(f"REPLICAOF localhost {master.port}") await wait_for_replicas_state(c_replica) + peak_memory = await get_memory(c_master, "used_memory_peak_rss") + + logging.info(f"Used memory {used_memory}, peak memory {peak_memory}") + assert peak_memory < 1.1 * used_memory + + await c_replica.execute_command("memory decommit") + await asyncio.sleep(1) + replica_peak_memory = await get_memory(c_replica, "used_memory_peak_rss") + replica_used_memory = await get_memory(c_replica, "used_memory_rss") + + logging.info(f"Replica Used memory {replica_used_memory}, peak memory {replica_peak_memory}") + assert replica_peak_memory < 1.1 * replica_used_memory + # Check replica data consistency replica_data = await StaticSeeder.capture(c_replica) master_data = await StaticSeeder.capture(c_master) diff --git a/tests/dragonfly/seeder/__init__.py b/tests/dragonfly/seeder/__init__.py index 351fc38a8458..78c51f3839d3 100644 --- a/tests/dragonfly/seeder/__init__.py +++ b/tests/dragonfly/seeder/__init__.py @@ -118,6 +118,7 @@ async def _run_unit(self, client: aioredis.Redis, dtype: str, prefix: str): args = ["DEBUG", "POPULATE", key_target, prefix, math.ceil(dsize)] args += ["RAND", "TYPE", dtype, "ELEMENTS", csize] + logging.debug(args) return await client.execute_command(*args)
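Patches 19 and 20 converge on the same memory-discipline check: sample used_memory_rss before the heavy operation, then assert that used_memory_peak_rss stayed within roughly 10% of it. A small helper along these lines (hypothetical names; the tests above inline this logic at each call site) captures the pattern:

    import asyncio

    async def get_memory(client, field: str) -> int:
        # INFO MEMORY exposes used_memory_rss, used_memory_peak_rss, etc.
        info = await client.info("memory")
        return info[field]

    async def assert_rss_stable(client, baseline_rss: int, slack: float = 1.1):
        # Peak RSS is checked because the operation under test (migration or
        # full sync) may have already released its memory by the time we look.
        await asyncio.sleep(1)  # let the heartbeat refresh the RSS metrics
        peak = await get_memory(client, "used_memory_peak_rss")
        assert peak < baseline_rss * slack, f"RSS grew: peak {peak} vs baseline {baseline_rss}"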