From 3c75e06a0a97a054c2eb83cdc0c23dca3c8b7150 Mon Sep 17 00:00:00 2001 From: Jay Lee Date: Sat, 6 Mar 2021 23:30:25 +0800 Subject: [PATCH 1/7] *: introduce judge_split_prevote During split prevote, a campaign will fail because all nodes think it will collect enough votes, so after they actually start campaign, no one votes for the other, the campaign has to fail. `judge_split_prevote` solves the problem by adding extra constraint to split prevote: only vote for nodes that have greater IDs. It's easy to conclude that it works for peer numbers not greater 5. For 7 nodes, it's still possible to split again. But it should be enough for most cases. Because the constraint is only added for split prevote, so even failure won't lead to worse result. Signed-off-by: Jay Lee --- harness/tests/integration_cases/test_raft.rs | 91 +++++++++++++++----- src/config.rs | 5 ++ src/raft.rs | 36 ++++++-- src/raft_log.rs | 31 ++++--- 4 files changed, 123 insertions(+), 40 deletions(-) diff --git a/harness/tests/integration_cases/test_raft.rs b/harness/tests/integration_cases/test_raft.rs index 584c7a35b..9ba81ac96 100644 --- a/harness/tests/integration_cases/test_raft.rs +++ b/harness/tests/integration_cases/test_raft.rs @@ -4329,6 +4329,40 @@ fn test_prevote_with_split_vote() { assert_eq!(network.peers[&3].state, StateRole::Follower, "peer 3 state",); } +/// TestPreVoteWithJudgeSplitPreVote verifies that during split vote, cluster can finish campaign +/// by letting raft node with larger ID to finish campaign. +#[test] +fn test_prevote_with_judge_split_prevote() { + let l = default_logger(); + let peers = (1..=3).map(|id| { + let mut config = new_test_config(id, 10, 1); + config.pre_vote = true; + config.judge_split_prevote = true; + let storage = new_storage(); + storage.initialize_with_conf_state((vec![1, 2, 3], vec![])); + let mut raft = new_test_raft_with_config(&config, storage, &l); + raft.become_follower(1, INVALID_ID); + Some(raft) + }); + let mut network = Network::new(peers.collect(), &l); + network.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // simulate leader down. followers start split vote. + network.isolate(1); + network.send(vec![ + new_message(2, 2, MessageType::MsgHup, 0), + new_message(3, 3, MessageType::MsgHup, 0), + ]); + + // check whether the term values are expected + assert_eq!(network.peers[&2].term, 3, "peer 2 term",); + assert_eq!(network.peers[&3].term, 3, "peer 3 term",); + + // check state + assert_eq!(network.peers[&2].state, StateRole::Follower, "peer 2 state",); + assert_eq!(network.peers[&3].state, StateRole::Leader, "peer 3 state",); +} + // ensure that after a node become pre-candidate, it will checkQuorum correctly. #[test] fn test_prevote_with_check_quorum() { @@ -5286,6 +5320,14 @@ fn test_group_commit_consistent() { /// of the election with both priority and log. #[test] fn test_election_with_priority_log() { + let entry_set = vec![ + ( + "large term", + vec![new_entry(2, 1, SOME_DATA)], + vec![new_entry(1, 1, SOME_DATA)], + ), + ("large index", vec![new_entry(1, 1, SOME_DATA)], vec![]), + ]; let tests = vec![ // log is up to date or not 1..3, priority 1..3, id, state (true, false, false, 3, 1, 1, 1, StateRole::Leader), @@ -5301,30 +5343,37 @@ fn test_election_with_priority_log() { (false, false, true, 1, 1, 3, 1, StateRole::Leader), ]; - for (_i, &(l1, l2, l3, p1, p2, p3, id, state)) in tests.iter().enumerate() { - let l = default_logger(); - let mut n1 = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); - let mut n2 = new_test_raft(2, vec![1, 2, 3], 10, 1, new_storage(), &l); - let mut n3 = new_test_raft(3, vec![1, 2, 3], 10, 1, new_storage(), &l); - n1.set_priority(p1); - n2.set_priority(p2); - n3.set_priority(p3); - let entries = vec![new_entry(1, 1, SOME_DATA), new_entry(1, 1, SOME_DATA)]; - if l1 { - n1.raft_log.append(&entries); - } - if l2 { - n2.raft_log.append(&entries); - } - if l3 { - n3.raft_log.append(&entries); - } + for (tag, latest_entries, outdated_entries) in entry_set { + for (_i, &(l1, l2, l3, p1, p2, p3, id, state)) in tests.iter().enumerate() { + let l = default_logger(); + let mut n1 = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut n2 = new_test_raft(2, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut n3 = new_test_raft(3, vec![1, 2, 3], 10, 1, new_storage(), &l); + n1.set_priority(p1); + n2.set_priority(p2); + n3.set_priority(p3); + if l1 { + n1.raft_log.append(&latest_entries); + } else { + n1.raft_log.append(&outdated_entries); + } + if l2 { + n2.raft_log.append(&latest_entries); + } else { + n2.raft_log.append(&outdated_entries); + } + if l3 { + n3.raft_log.append(&latest_entries); + } else { + n3.raft_log.append(&outdated_entries); + } - let mut network = Network::new(vec![Some(n1), Some(n2), Some(n3)], &l); + let mut network = Network::new(vec![Some(n1), Some(n2), Some(n3)], &l); - network.send(vec![new_message(id, id, MessageType::MsgHup, 0)]); + network.send(vec![new_message(id, id, MessageType::MsgHup, 0)]); - assert_eq!(network.peers[&id].state, state); + assert_eq!(network.peers[&id].state, state, "{}", tag); + } } } diff --git a/src/config.rs b/src/config.rs index fdc88d9db..5c59282eb 100644 --- a/src/config.rs +++ b/src/config.rs @@ -67,6 +67,10 @@ pub struct Config { /// rejoins the cluster. pub pre_vote: bool, + /// If split vote happens, only vote for the large id. This can save a round of campaign + /// when combined with `pre_vote`. + pub judge_split_prevote: bool, + /// The range of election timeout. In some cases, we hope some nodes has less possibility /// to become leader. This configuration ensures that the randomized election_timeout /// will always be suit in [min_election_tick, max_election_tick). @@ -109,6 +113,7 @@ impl Default for Config { max_inflight_msgs: 256, check_quorum: false, pre_vote: false, + judge_split_prevote: false, min_election_tick: 0, max_election_tick: 0, read_only_option: ReadOnlyOption::Safe, diff --git a/src/raft.rs b/src/raft.rs index 91aa4c2e8..4e2ffb52d 100644 --- a/src/raft.rs +++ b/src/raft.rs @@ -14,7 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::cmp; +use std::cmp::{self, Ordering}; use std::ops::{Deref, DerefMut}; use crate::eraftpb::{ @@ -228,6 +228,7 @@ pub struct RaftCore { skip_bcast_commit: bool, batch_append: bool, + judge_split_prevote: bool, heartbeat_timeout: usize, election_timeout: usize, @@ -329,6 +330,7 @@ impl Raft { promotable: false, check_quorum: c.check_quorum, pre_vote: c.pre_vote, + judge_split_prevote: c.judge_split_prevote, read_only: ReadOnly::new(c.read_only_option), heartbeat_timeout: c.heartbeat_tick, election_timeout: c.election_tick, @@ -1398,10 +1400,7 @@ impl Raft { // ...or this is a PreVote for a future term... (m.get_msg_type() == MessageType::MsgRequestPreVote && m.term > self.term); // ...and we believe the candidate is up to date. - if can_vote - && self.raft_log.is_up_to_date(m.index, m.log_term) - && (m.index > self.raft_log.last_index() || self.priority <= m.priority) - { + if can_vote && self.may_vote(&m) { // When responding to Msg{Pre,}Vote messages we include the term // from the message, not the local term. To see why consider the // case where a single node was previously partitioned away and @@ -1444,6 +1443,33 @@ impl Raft { Ok(()) } + /// Checks if this node may vote for the request. It may vote when from node + /// contains the same logs or more logs. When they contains same logs, priority + /// is considered. If priorities are still the same, and this node is also + /// starting campaign, then split vote happens. In this case, if `judge_split_prevote` + /// and `pre_vote` are enabled, it will vote only when from node has greater ID. + fn may_vote(&self, m: &Message) -> bool { + match self.raft_log.is_up_to_date(m.index, m.log_term) { + Ordering::Greater => true, + Ordering::Equal => { + match self.priority.cmp(&m.priority) { + Ordering::Greater => false, + Ordering::Equal => { + // judge split vote can break symmetry of campaign, but as + // it only happens during split vote, the impact should not + // be significant. + !self.judge_split_prevote + || self.state != StateRole::PreCandidate + || m.get_msg_type() != MessageType::MsgRequestPreVote + || self.id < m.from + } + Ordering::Less => true, + } + } + Ordering::Less => false, + } + } + fn hup(&mut self, transfer_leader: bool) { if self.state == StateRole::Leader { debug!( diff --git a/src/raft_log.rs b/src/raft_log.rs index 20f728cc6..387623f4f 100644 --- a/src/raft_log.rs +++ b/src/raft_log.rs @@ -14,7 +14,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::cmp; +use std::cmp::{self, Ordering}; use crate::eraftpb::{Entry, Snapshot}; use crate::errors::{Error, Result, StorageError}; @@ -369,8 +369,8 @@ impl RaftLog { /// later term is more up-to-date. If the logs end with the same term, then /// whichever log has the larger last_index is more up-to-date. If the logs are /// the same, the given log is up-to-date. - pub fn is_up_to_date(&self, last_index: u64, term: u64) -> bool { - term > self.last_term() || (term == self.last_term() && last_index >= self.last_index()) + pub fn is_up_to_date(&self, last_index: u64, term: u64) -> Ordering { + (term, last_index).cmp(&(self.last_term(), self.last_index())) } /// Returns committed and persisted entries since max(`since_idx` + 1, first_index). @@ -567,7 +567,7 @@ impl RaftLog { #[cfg(test)] mod test { use std::{ - cmp, + cmp::{self, Ordering}, panic::{self, AssertUnwindSafe}, }; @@ -660,22 +660,25 @@ mod test { raft_log.append(&previous_ents); let tests = vec![ // greater term, ignore lastIndex - (raft_log.last_index() - 1, 4, true), - (raft_log.last_index(), 4, true), - (raft_log.last_index() + 1, 4, true), + (raft_log.last_index() - 1, 4, Ordering::Greater), + (raft_log.last_index(), 4, Ordering::Greater), + (raft_log.last_index() + 1, 4, Ordering::Greater), // smaller term, ignore lastIndex - (raft_log.last_index() - 1, 2, false), - (raft_log.last_index(), 2, false), - (raft_log.last_index() + 1, 2, false), + (raft_log.last_index() - 1, 2, Ordering::Less), + (raft_log.last_index(), 2, Ordering::Less), + (raft_log.last_index() + 1, 2, Ordering::Less), // equal term, lager lastIndex wins - (raft_log.last_index() - 1, 3, false), - (raft_log.last_index(), 3, true), - (raft_log.last_index() + 1, 3, true), + (raft_log.last_index() - 1, 3, Ordering::Less), + (raft_log.last_index(), 3, Ordering::Equal), + (raft_log.last_index() + 1, 3, Ordering::Greater), ]; for (i, &(last_index, term, up_to_date)) in tests.iter().enumerate() { let g_up_to_date = raft_log.is_up_to_date(last_index, term); if g_up_to_date != up_to_date { - panic!("#{}: uptodate = {}, want {}", i, g_up_to_date, up_to_date); + panic!( + "#{}: uptodate = {:?}, want {:?}", + i, g_up_to_date, up_to_date + ); } } } From 6a895abe0f8af0ecff9ff43d01e43a21e58fa952 Mon Sep 17 00:00:00 2001 From: Jay Lee Date: Sun, 7 Mar 2021 20:49:53 +0800 Subject: [PATCH 2/7] give up prevote in split vote Signed-off-by: Jay Lee --- harness/tests/integration_cases/test_raft.rs | 65 ++++++++++++-------- src/raft.rs | 4 ++ 2 files changed, 45 insertions(+), 24 deletions(-) diff --git a/harness/tests/integration_cases/test_raft.rs b/harness/tests/integration_cases/test_raft.rs index 9ba81ac96..92219b98c 100644 --- a/harness/tests/integration_cases/test_raft.rs +++ b/harness/tests/integration_cases/test_raft.rs @@ -4334,33 +4334,50 @@ fn test_prevote_with_split_vote() { #[test] fn test_prevote_with_judge_split_prevote() { let l = default_logger(); - let peers = (1..=3).map(|id| { - let mut config = new_test_config(id, 10, 1); - config.pre_vote = true; - config.judge_split_prevote = true; - let storage = new_storage(); - storage.initialize_with_conf_state((vec![1, 2, 3], vec![])); - let mut raft = new_test_raft_with_config(&config, storage, &l); - raft.become_follower(1, INVALID_ID); - Some(raft) - }); - let mut network = Network::new(peers.collect(), &l); - network.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + for cnt in &[3, 5] { + let peers = (1..=*cnt).map(|id| { + let mut config = new_test_config(id, 10, 1); + config.pre_vote = true; + config.judge_split_prevote = true; + let storage = new_storage(); + storage.initialize_with_conf_state(((1..=*cnt).collect::>(), vec![])); + let mut raft = new_test_raft_with_config(&config, storage, &l); + raft.become_follower(1, INVALID_ID); + Some(raft) + }); + let mut network = Network::new(peers.collect(), &l); + network.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); - // simulate leader down. followers start split vote. - network.isolate(1); - network.send(vec![ - new_message(2, 2, MessageType::MsgHup, 0), - new_message(3, 3, MessageType::MsgHup, 0), - ]); + // simulate leader down. followers start split vote. + network.isolate(1); + let msgs = (2..=*cnt) + .map(|id| new_message(id, id, MessageType::MsgHup, 0)) + .collect(); + network.send(msgs); - // check whether the term values are expected - assert_eq!(network.peers[&2].term, 3, "peer 2 term",); - assert_eq!(network.peers[&3].term, 3, "peer 3 term",); + // check whether the term values are expected + for id in 2..=*cnt { + assert_eq!(network.peers[&id].term, 3, "[{}] peer {} term", cnt, id); + } - // check state - assert_eq!(network.peers[&2].state, StateRole::Follower, "peer 2 state",); - assert_eq!(network.peers[&3].state, StateRole::Leader, "peer 3 state",); + // check state + for id in 2..=(*cnt - 1) { + assert_eq!( + network.peers[&id].state, + StateRole::Follower, + "[{}] peer {} state", + cnt, + id + ); + } + assert_eq!( + network.peers[cnt].state, + StateRole::Leader, + "[{}] peer {} state", + cnt, + cnt + ); + } } // ensure that after a node become pre-candidate, it will checkQuorum correctly. diff --git a/src/raft.rs b/src/raft.rs index 4e2ffb52d..8a539f114 100644 --- a/src/raft.rs +++ b/src/raft.rs @@ -1421,6 +1421,10 @@ impl Raft { self.election_elapsed = 0; self.vote = m.from; } + // This means it's in split vote, give up election. + if self.judge_split_prevote && self.state == StateRole::PreCandidate { + self.become_follower(self.term, INVALID_ID); + } } else { self.log_vote_reject(&m); let mut to_send = From 158f7f1b9ea3a2457f21b07db107be4871fef8a6 Mon Sep 17 00:00:00 2001 From: Jay Lee Date: Sun, 7 Mar 2021 22:25:50 +0800 Subject: [PATCH 3/7] reject prevote explicitly when in lease Signed-off-by: Jay Lee --- src/raft.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/raft.rs b/src/raft.rs index 8a539f114..55d7b3ec5 100644 --- a/src/raft.rs +++ b/src/raft.rs @@ -1289,6 +1289,18 @@ impl Raft { "msg type" => ?m.get_msg_type(), ); + // When judge_split_prevote, reject explicitly to make candidate exit PreCandiate early + // so it will vote for other peer later. + if self.judge_split_prevote + && m.get_msg_type() == MessageType::MsgRequestPreVote + { + let mut to_send = + new_message(m.from, MessageType::MsgRequestPreVoteResponse, None); + to_send.term = m.term; + to_send.reject = true; + self.r.send(to_send, &mut self.msgs); + } + return Ok(()); } } From 0e535b5c4a2c043ba382ebb9b5ece2f433d70e16 Mon Sep 17 00:00:00 2001 From: Jay Lee Date: Sun, 7 Mar 2021 23:28:33 +0800 Subject: [PATCH 4/7] explain return value Signed-off-by: Jay Lee --- src/raft_log.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/raft_log.rs b/src/raft_log.rs index 387623f4f..9e66f65ff 100644 --- a/src/raft_log.rs +++ b/src/raft_log.rs @@ -369,6 +369,9 @@ impl RaftLog { /// later term is more up-to-date. If the logs end with the same term, then /// whichever log has the larger last_index is more up-to-date. If the logs are /// the same, the given log is up-to-date. + /// Returns `Ordering::Greater` means given log is more up-to-date, `Ordering::Equal` + /// means they have the same logs, `Ordering::Less` means existing logs is more + /// up-to-date. pub fn is_up_to_date(&self, last_index: u64, term: u64) -> Ordering { (term, last_index).cmp(&(self.last_term(), self.last_index())) } From 7cb2fc8565c763d697227c1a04d52070203c5e37 Mon Sep 17 00:00:00 2001 From: Jay Lee Date: Tue, 8 Jun 2021 16:24:08 +0800 Subject: [PATCH 5/7] use hash to do choice randomly Signed-off-by: Jay Lee --- harness/tests/integration_cases/test_raft.rs | 9 +++++---- src/raft.rs | 6 +++++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/harness/tests/integration_cases/test_raft.rs b/harness/tests/integration_cases/test_raft.rs index 1104563d6..de4281397 100644 --- a/harness/tests/integration_cases/test_raft.rs +++ b/harness/tests/integration_cases/test_raft.rs @@ -4332,7 +4332,7 @@ fn test_prevote_with_split_vote() { } /// TestPreVoteWithJudgeSplitPreVote verifies that during split vote, cluster can finish campaign -/// by letting raft node with larger ID to finish campaign. +/// by letting raft node with larger hash value to finish campaign. #[test] fn test_prevote_with_judge_split_prevote() { let l = default_logger(); @@ -4362,8 +4362,9 @@ fn test_prevote_with_judge_split_prevote() { assert_eq!(network.peers[&id].term, 3, "[{}] peer {} term", cnt, id); } - // check state - for id in 2..=(*cnt - 1) { + let mut hashes: Vec<_> = (2..=*cnt).map(|id| (fxhash::hash64(&id), id)).collect(); + hashes.sort(); + for (_, id) in &hashes[..hashes.len() - 1] { assert_eq!( network.peers[&id].state, StateRole::Follower, @@ -4373,7 +4374,7 @@ fn test_prevote_with_judge_split_prevote() { ); } assert_eq!( - network.peers[cnt].state, + network.peers[&hashes[hashes.len() - 1].1].state, StateRole::Leader, "[{}] peer {} state", cnt, diff --git a/src/raft.rs b/src/raft.rs index 88cfa7ddb..2540d3ae6 100644 --- a/src/raft.rs +++ b/src/raft.rs @@ -1502,7 +1502,11 @@ impl Raft { !self.judge_split_prevote || self.state != StateRole::PreCandidate || m.get_msg_type() != MessageType::MsgRequestPreVote - || self.id < m.from + || { + let (my_h, from_h) = + (fxhash::hash64(&self.id), fxhash::hash64(&m.from)); + my_h < from_h || (my_h == from_h && self.id < m.from) + } } Ordering::Less => true, } From e5c3aaca069a46e6b9a3d10d0125f015c46e4aed Mon Sep 17 00:00:00 2001 From: Jay Lee Date: Thu, 10 Jun 2021 18:58:07 +0800 Subject: [PATCH 6/7] address comment Signed-off-by: Jay Lee --- src/raft.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/raft.rs b/src/raft.rs index 2540d3ae6..9b2187754 100644 --- a/src/raft.rs +++ b/src/raft.rs @@ -1499,13 +1499,16 @@ impl Raft { // judge split vote can break symmetry of campaign, but as // it only happens during split vote, the impact should not // be significant. + // Transfering leader skips prevote, so they won't have impact + // on the other. !self.judge_split_prevote || self.state != StateRole::PreCandidate || m.get_msg_type() != MessageType::MsgRequestPreVote || { + let from_id = m.from; let (my_h, from_h) = - (fxhash::hash64(&self.id), fxhash::hash64(&m.from)); - my_h < from_h || (my_h == from_h && self.id < m.from) + (fxhash::hash64(&self.id), fxhash::hash64(&from_id)); + my_h < from_h || (my_h == from_h && self.id < from_id) } } Ordering::Less => true, From d41a4599e656dfbcc43ed0e2a1a1327df13e564a Mon Sep 17 00:00:00 2001 From: Jay Lee Date: Wed, 16 Jun 2021 20:52:57 +0800 Subject: [PATCH 7/7] make clippy happy Signed-off-by: Jay Lee --- harness/tests/integration_cases/test_raft.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/harness/tests/integration_cases/test_raft.rs b/harness/tests/integration_cases/test_raft.rs index de4281397..191c430c4 100644 --- a/harness/tests/integration_cases/test_raft.rs +++ b/harness/tests/integration_cases/test_raft.rs @@ -4363,7 +4363,7 @@ fn test_prevote_with_judge_split_prevote() { } let mut hashes: Vec<_> = (2..=*cnt).map(|id| (fxhash::hash64(&id), id)).collect(); - hashes.sort(); + hashes.sort_unstable(); for (_, id) in &hashes[..hashes.len() - 1] { assert_eq!( network.peers[&id].state,