From 49c146fd64c00817b67811935840aa9c92e246d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wojciech=20Przytu=C5=82a?=
Date: Wed, 10 Jul 2024 07:25:11 +0200
Subject: [PATCH 1/5] locator: ReplicasOrdered comments & typo fixes

---
 scylla/src/transport/locator/mod.rs | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/scylla/src/transport/locator/mod.rs b/scylla/src/transport/locator/mod.rs
index 64b4dc9fa0..e0f06e8ba2 100644
--- a/scylla/src/transport/locator/mod.rs
+++ b/scylla/src/transport/locator/mod.rs
@@ -472,19 +472,23 @@ impl<'a> IntoIterator for ReplicaSet<'a> {
 }
 
 enum ReplicaSetIteratorInner<'a> {
+    /// Token ring with SimpleStrategy, any datacenter
     Plain {
         replicas: ReplicasArray<'a>,
         idx: usize,
     },
+    /// Tablets
     PlainSharded {
         replicas: &'a [(Arc<Node>, Shard)],
         idx: usize,
     },
+    /// Token ring with SimpleStrategy, specific datacenter
     FilteredSimple {
         replicas: ReplicasArray<'a>,
         datacenter: &'a str,
         idx: usize,
     },
+    /// Token ring with NetworkTopologyStrategy
     ChainedNTS {
         replicas: ReplicasArray<'a>,
         replicas_idx: usize,
@@ -637,8 +641,8 @@ impl<'a> ReplicaSet<'a> {
 /// or it must compute them on-demand (in case of NetworkTopologyStrategy).
 /// The computation is lazy (performed by `ReplicasOrderedIterator` upon call to `next()`).
 /// For obtaining the primary replica, no allocations are needed. Therefore, the first call
-/// to `next()` is optimised and doesn not allocate.
-/// For the remaining others, unfortunately, allocation is unevitable.
+/// to `next()` is optimised and does not allocate.
+/// For the remaining others, unfortunately, allocation is inevitable.
 pub struct ReplicasOrdered<'a> {
     replica_set: ReplicaSet<'a>,
 }
@@ -650,7 +654,8 @@ pub struct ReplicasOrderedIterator<'a> {
 
 enum ReplicasOrderedIteratorInner<'a> {
     AlreadyRingOrdered {
-        // In case of Plain and FilteredSimple variants, ReplicaSetIterator respects ring order.
+        // In case of Plain, PlainSharded and FilteredSimple variants,
+        // ReplicaSetIterator respects ring order.
         replica_set_iter: ReplicaSetIterator<'a>,
     },
     PolyDatacenterNTS {

From 527c32ef781bb491e1adbdb31a7a5e60db78978c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wojciech=20Przytu=C5=82a?=
Date: Tue, 9 Jul 2024 22:01:12 +0200
Subject: [PATCH 2/5] default_lb: verbose test assertions

They aid in debugging.
---
 .../src/transport/load_balancing/default.rs | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/scylla/src/transport/load_balancing/default.rs b/scylla/src/transport/load_balancing/default.rs
index 4280c855fa..e06eae44bc 100644
--- a/scylla/src/transport/load_balancing/default.rs
+++ b/scylla/src/transport/load_balancing/default.rs
@@ -1078,7 +1078,9 @@ mod tests {
             assert_eq!(
                 got.len(),
                 combined_groups_len,
-                "Plan length different than expected"
+                "Plan length different than expected. Got plan {:?}, expected groups {:?}",
+                got,
+                self.groups,
             );
 
             // Now, split `got` into groups of expected sizes
@@ -1095,10 +1097,10 @@ mod tests {
                         // Verify that the group has the same nodes as the
                         // expected one
                         let got_set: HashSet<_> = got_group.iter().copied().collect();
-                        assert_eq!(&got_set, expected_set);
+                        assert_eq!(&got_set, expected_set, "Unordered group mismatch");
                     }
                     ExpectedGroup::Ordered(sequence) => {
-                        assert_eq!(&got_group, sequence);
+                        assert_eq!(&got_group, sequence, "Ordered group mismatch");
                     }
                 }
 
@@ -1117,7 +1119,11 @@ mod tests {
                         // then expect there to be more than one group
                         // in the set.
                         if gots.len() > 1 && s.len() > 1 {
-                            assert!(sets.len() > 1);
+                            assert!(
+                                sets.len() > 1,
+                                "Group {:?} is expected to be nondeterministic, but it appears to be deterministic",
+                                expected
+                            );
                         }
                     }
                     ExpectedGroup::Deterministic(_) | ExpectedGroup::Ordered(_) => {
                         // the same order.
                         // There will only be one, unique ordering shared
                         // by all plans - check this
-                        assert_eq!(sets.len(), 1);
+                        assert_eq!(
+                            sets.len(),
+                            1,
+                            "Group {:?} is expected to be deterministic, but it appears to be nondeterministic",
+                            expected
+                        );
                     }
                 }
             }

From d77990c80ee88e66e807cb2d3bd050bd748dda75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wojciech=20Przytu=C5=82a?=
Date: Tue, 9 Jul 2024 21:41:04 +0200
Subject: [PATCH 3/5] default_lb: fix small rack+latency-awareness bug

A copy-paste kind of bug caused local-rack non-replica nodes not to appear
in the fallback iterator earlier than non-local-rack non-replica nodes.
This was a fairly edge-case bug, though, because for it to manifest, the
following conditions would have to be satisfied:
1. a non-replica node from the preferred rack would have to be penalised
   by LatencyAwareness,
2. a non-replica node from the preferred DC but not from the preferred rack
   would have to be penalised by LatencyAwareness, too.
---
 scylla/src/transport/load_balancing/default.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scylla/src/transport/load_balancing/default.rs b/scylla/src/transport/load_balancing/default.rs
index e06eae44bc..5257ef91ea 100644
--- a/scylla/src/transport/load_balancing/default.rs
+++ b/scylla/src/transport/load_balancing/default.rs
@@ -313,7 +313,7 @@ or refrain from preferring datacenters (which may ban all other datacenters, if
         let maybe_local_rack_nodes =
             if let NodeLocationPreference::DatacenterAndRack(dc, rack) = &self.preferences {
                 let rack_predicate = Self::make_rack_predicate(
-                    |node| (self.pick_predicate)(node, None),
+                    |node| Self::is_alive(node, None),
                     NodeLocationCriteria::DatacenterAndRack(dc, rack),
                 );
                 Either::Left(

From 37971c57fec334a4105ab72a70eaa201d3b6450b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wojciech=20Przytu=C5=82a?=
Date: Wed, 10 Jul 2024 07:08:50 +0200
Subject: [PATCH 4/5] default_lb: fix LWT routing bug

There is special logic in the default policy that handles the case when
computing the first acceptable replica (i.e. a replica that satisfies
`pick_predicate`) is expensive (i.e. requires allocation). That logic was
meant to make `pick` return None, because then `fallback` would allocate
and compute all acceptable replicas.
Unfortunately, the logic contained a bug that made `pick` continue
execution instead of returning None, leading to a node that is not
necessarily a replica being returned. This would break the LWT
optimisation: in case the primary replica is filtered out by the LB
(e.g. it's down, or disabled by a HostFilter), the second replica should
be targeted instead, deterministically.

The fix involves creating an enum to distinguish between three scenarios:
1. No replica exists that could satisfy the pick predicate -> None;
2. The primary replica satisfies the pick predicate
   -> Some(Computed(primary_replica));
3. The primary replica does not satisfy the pick predicate, but it's
   possible that another replica does -> Some(ToBeComputedInFallback).

Before the fix, the third scenario would merge with the first, leading to
the incorrect behaviour of not returning None from `pick`.
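To illustrate the intended contract, here is a minimal, self-contained sketch
of how the three scenarios map onto `pick`'s return value. It uses a plain
integer as a stand-in for the driver's node handle (the real code operates on
NodeRef/Shard and lives in default.rs), so it is only an illustration, not the
actual implementation:

    /// Stand-in for the driver's node handle (simplification for this sketch).
    type Node = u32;

    enum PickedReplica {
        /// The primary replica was computed cheaply and satisfies the predicate.
        Computed(Node),
        /// An acceptable replica may exist, but computing it requires allocation,
        /// so `pick` must return None and let `fallback` do the work.
        ToBeComputedInFallback,
    }

    fn pick(primary: Option<Node>, predicate: impl Fn(Node) -> bool) -> Option<Node> {
        let picked = primary.map(|p| {
            if predicate(p) {
                PickedReplica::Computed(p)
            } else {
                PickedReplica::ToBeComputedInFallback
            }
        });
        match picked {
            // Scenario 1: there is no replica at all -> None.
            None => None,
            // Scenario 2: the primary replica satisfies the predicate -> return it.
            Some(PickedReplica::Computed(p)) => Some(p),
            // Scenario 3: another replica might satisfy the predicate, but computing
            // it is expensive -> None, so that `fallback` computes it instead.
            Some(PickedReplica::ToBeComputedInFallback) => None,
        }
    }

    fn main() {
        // The primary replica (1) is filtered out: `pick` defers to `fallback`.
        assert_eq!(pick(Some(1), |n| n != 1), None);
        // The primary replica (2) is acceptable: it is returned directly.
        assert_eq!(pick(Some(2), |n| n != 1), Some(2));
    }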
---
 .../src/transport/load_balancing/default.rs | 70 ++++++++++++++-----
 1 file changed, 52 insertions(+), 18 deletions(-)

diff --git a/scylla/src/transport/load_balancing/default.rs b/scylla/src/transport/load_balancing/default.rs
index 5257ef91ea..148887f566 100644
--- a/scylla/src/transport/load_balancing/default.rs
+++ b/scylla/src/transport/load_balancing/default.rs
@@ -70,6 +70,16 @@ enum StatementType {
     NonLwt,
 }
 
+/// A result of `pick_replica`.
+enum PickedReplica<'a> {
+    /// A replica that could be computed cheaply.
+    Computed((NodeRef<'a>, Shard)),
+
+    /// A replica that could not be computed cheaply. `pick` should therefore return None
+    /// and `fallback` will then return that replica as the first in the iterator.
+    ToBeComputedInFallback,
+}
+
 /// The default load balancing policy.
 ///
 /// It can be configured to be datacenter-aware and token-aware.
@@ -137,8 +147,14 @@ or refrain from preferring datacenters (which may ban all other datacenters, if
                     table_spec,
                 );
 
-                if let Some((alive_local_rack_replica, shard)) = local_rack_picked {
-                    return Some((alive_local_rack_replica, Some(shard)));
+                if let Some(picked) = local_rack_picked {
+                    return match picked {
+                        PickedReplica::Computed((alive_local_rack_replica, shard)) => {
+                            Some((alive_local_rack_replica, Some(shard)))
+                        }
+                        // Let call to fallback() compute the replica, because it requires allocation.
+                        PickedReplica::ToBeComputedInFallback => None,
+                    };
                 }
             }
 
@@ -155,8 +171,14 @@ or refrain from preferring datacenters (which may ban all other datacenters, if
                     table_spec,
                 );
 
-                if let Some((alive_local_replica, shard)) = picked {
-                    return Some((alive_local_replica, Some(shard)));
+                if let Some(picked) = picked {
+                    return match picked {
+                        PickedReplica::Computed((alive_local_replica, shard)) => {
+                            Some((alive_local_replica, Some(shard)))
+                        }
+                        // Let call to fallback() compute the replica, because it requires allocation.
+                        PickedReplica::ToBeComputedInFallback => None,
+                    };
                 }
             }
 
@@ -173,8 +195,14 @@ or refrain from preferring datacenters (which may ban all other datacenters, if
                     statement_type,
                     table_spec,
                 );
-                if let Some((alive_remote_replica, shard)) = picked {
-                    return Some((alive_remote_replica, Some(shard)));
+                if let Some(picked) = picked {
+                    return match picked {
+                        PickedReplica::Computed((alive_remote_replica, shard)) => {
+                            Some((alive_remote_replica, Some(shard)))
+                        }
+                        // Let call to fallback() compute the replica, because it requires allocation.
+                        PickedReplica::ToBeComputedInFallback => None,
+                    };
                 }
             }
         };
@@ -540,14 +568,14 @@ impl DefaultPolicy {
         cluster: &'a ClusterData,
         statement_type: StatementType,
         table_spec: &TableSpec,
-    ) -> Option<(NodeRef<'a>, Shard)> {
+    ) -> Option<PickedReplica<'a>> {
         match statement_type {
             StatementType::Lwt => {
                 self.pick_first_replica(ts, replica_location, predicate, cluster, table_spec)
             }
-            StatementType::NonLwt => {
-                self.pick_random_replica(ts, replica_location, predicate, cluster, table_spec)
-            }
+            StatementType::NonLwt => self
+                .pick_random_replica(ts, replica_location, predicate, cluster, table_spec)
+                .map(PickedReplica::Computed),
         }
     }
 
@@ -562,7 +590,8 @@ impl DefaultPolicy {
     //
     // If no DC/rack preferences are set, then the only possible replica to be returned
     // (due to expensive computation of the others, and we avoid expensive computation in `pick()`)
-    // is the primary replica. It is returned **iff** it satisfies the predicate, else None.
+    // is the primary replica. If it exists, Some is returned, with either Computed(primary_replica)
+    // **iff** it satisfies the predicate or ToBeComputedInFallback otherwise.
     fn pick_first_replica<'a>(
         &'a self,
         ts: &TokenWithStrategy<'a>,
         replica_location: NodeLocationCriteria<'a>,
         predicate: impl Fn(NodeRef<'a>, Shard) -> bool + 'a,
         cluster: &'a ClusterData,
         table_spec: &TableSpec,
-    ) -> Option<(NodeRef<'a>, Shard)> {
+    ) -> Option<PickedReplica<'a>> {
         match replica_location {
             NodeLocationCriteria::Any => {
                 // ReplicaSet returned by ReplicaLocator for this case:
-                // 1) can be precomputed and lated used cheaply,
+                // 1) can be precomputed and later used cheaply,
                 // 2) returns replicas in the **non-ring order** (this because ReplicaSet chains
                 // ring-ordered replicas sequences from different DCs, thus not preserving
                 // the global ring order).
                 // Because of 2), we can't use a precomputed ReplicaSet, but instead we need ReplicasOrdered.
                 // As ReplicasOrdered can compute cheaply only the primary global replica
                 // (computation of the remaining ones is expensive), in case that the primary replica
-                // does not satisfy the `predicate`, None is returned. All expensive computation
-                // is to be done only when `fallback()` is called.
+                // does not satisfy the `predicate`, ToBeComputedInFallback is returned.
+                // All expensive computation is to be done only when `fallback()` is called.
                 self.nonfiltered_replica_set(ts, replica_location, cluster, table_spec)
                     .into_replicas_ordered()
                     .into_iter()
                     .next()
-                    .and_then(|(primary_replica, shard)| {
-                        predicate(primary_replica, shard).then_some((primary_replica, shard))
+                    .map(|(primary_replica, shard)| {
+                        if predicate(primary_replica, shard) {
+                            PickedReplica::Computed((primary_replica, shard))
+                        } else {
+                            PickedReplica::ToBeComputedInFallback
+                        }
                     })
             }
             NodeLocationCriteria::Datacenter(_) | NodeLocationCriteria::DatacenterAndRack(_, _) => {
                 // ReplicaSet returned by ReplicaLocator for this case:
-                // 1) can be precomputed and lated used cheaply,
+                // 1) can be precomputed and later used cheaply,
                 // 2) returns replicas in the ring order (this is not true for the case
                 // when multiple DCs are allowed, because ReplicaSet chains replicas sequences
                 // from different DCs, thus not preserving the global ring order)
                     table_spec,
                 )
                 .next()
+                .map(PickedReplica::Computed)
             }
         }
     }

From 9407afbe06b04ffcdc1fe6f07175e904f687558a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wojciech=20Przytu=C5=82a?=
Date: Tue, 9 Jul 2024 16:35:28 +0200
Subject: [PATCH 5/5] default_lb: LWT optimisation regression test

The test runs against ClusterData with node F disabled by a HostFilter.
In such an arrangement, F should never be returned. As F is the primary
replica for the executed statement and no DC is preferred, the second
replica (`A`) cannot be computed cheaply, so `pick` should return None.
Before the bug was fixed, `pick` would return an arbitrary round-robined
node, e.g. `B` (not even a replica).
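As a tiny standalone illustration of the expectation encoded by the test below
(the single-letter labels stand for the nodes named in this message, not for
real driver types; the ring order is the one quoted in the test):

    /// Replicas for the statement, in ring order; F is the primary, B is not a replica.
    const RING_ORDERED_REPLICAS: [char; 6] = ['F', 'A', 'C', 'D', 'G', 'E'];

    /// First replica, in ring order, that passes the predicate.
    fn first_acceptable(accept: impl Fn(char) -> bool) -> Option<char> {
        RING_ORDERED_REPLICAS.iter().copied().find(|&node| accept(node))
    }

    fn main() {
        // Node F is disabled by the host filter.
        let accept = |node: char| node != 'F';
        // The LWT optimisation requires the second replica (A) to be targeted
        // deterministically - never an arbitrary non-replica such as B.
        assert_eq!(first_acceptable(accept), Some('A'));
    }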
---
 .../src/transport/load_balancing/default.rs | 76 ++++++++++++++++++-
 1 file changed, 75 insertions(+), 1 deletion(-)

diff --git a/scylla/src/transport/load_balancing/default.rs b/scylla/src/transport/load_balancing/default.rs
index 148887f566..46aa282992 100644
--- a/scylla/src/transport/load_balancing/default.rs
+++ b/scylla/src/transport/load_balancing/default.rs
@@ -993,6 +993,8 @@ impl<'a> TokenWithStrategy<'a> {
 
 #[cfg(test)]
 mod tests {
+    use std::collections::HashMap;
+
     use scylla_cql::{frame::types::SerialConsistency, Consistency};
     use tracing::info;
 
@@ -1000,7 +1002,12 @@ mod tests {
         get_plan_and_collect_node_identifiers, mock_cluster_data_for_token_unaware_tests,
         ExpectedGroups, ExpectedGroupsBuilder,
     };
-    use crate::transport::locator::test::{TABLE_NTS_RF_2, TABLE_NTS_RF_3, TABLE_SS_RF_2};
+    use crate::host_filter::HostFilter;
+    use crate::transport::locator::tablets::TabletsInfo;
+    use crate::transport::locator::test::{
+        id_to_invalid_addr, mock_metadata_for_token_aware_tests, TABLE_NTS_RF_2, TABLE_NTS_RF_3,
+        TABLE_SS_RF_2,
+    };
     use crate::{
         load_balancing::{
             default::tests::framework::mock_cluster_data_for_token_aware_tests, Plan, RoutingInfo,
@@ -2306,6 +2313,73 @@ mod tests {
             )
             .await;
         }
+
+        let cluster_with_disabled_node_f = ClusterData::new(
+            mock_metadata_for_token_aware_tests(),
+            &Default::default(),
+            &HashMap::new(),
+            &None,
+            {
+                struct FHostFilter;
+                impl HostFilter for FHostFilter {
+                    fn accept(&self, peer: &crate::transport::topology::Peer) -> bool {
+                        peer.address != id_to_invalid_addr(F)
+                    }
+                }
+
+                Some(&FHostFilter)
+            },
+            TabletsInfo::new(),
+        )
+        .await;
+
+        let tests_with_disabled_node_f = [
+            // Keyspace NTS with RF=3 without preferred DC.
+            // The primary replica does not satisfy the predicate (being disabled by HostFilter),
+            // so pick() should return None and fallback should return A first.
+            //
+            // This is a regression test after a bug was fixed.
+            Test {
+                policy: DefaultPolicy {
+                    preferences: NodeLocationPreference::Any,
+                    is_token_aware: true,
+                    permit_dc_failover: true,
+                    pick_predicate: Box::new(|node, _shard| node.address != id_to_invalid_addr(F)),
+                    ..Default::default()
+                },
+                routing_info: RoutingInfo {
+                    token: Some(Token::new(160)),
+                    table: Some(TABLE_NTS_RF_3),
+                    consistency: Consistency::One,
+                    is_confirmed_lwt: true,
+                    ..Default::default()
+                },
+                // going through the ring, we get order: F , A , C , D , G , B , E
+                //                                       us  eu  eu  us  eu  eu  us
+                //                                       r2  r1  r1  r1  r2  r1  r1
+                expected_groups: ExpectedGroupsBuilder::new()
+                    // pick is empty, because the primary replica does not satisfy pick predicate,
+                    // and with LWT we cannot compute other replicas for NTS without allocations.
+                    .ordered([A, C, D, G, E]) // replicas
+                    .group([B]) // nodes
+                    .build(),
+            },
+        ];
+
+        for Test {
+            policy,
+            routing_info,
+            expected_groups,
+        } in tests_with_disabled_node_f
+        {
+            test_default_policy_with_given_cluster_and_routing_info(
+                &policy,
+                &cluster_with_disabled_node_f,
+                &routing_info,
+                &expected_groups,
+            )
+            .await;
+        }
     }
 }