compute <-> sk protocol v3 (#10647)

## Problem As part of #8614 we need to pass membership configurations between compute and safekeepers. ## Summary of changes Add version 3 of the protocol carrying membership configurations. Greeting message in both sides gets full conf, and other messages generation number only. Use protocol bump to include other accumulated changes: - stop packing whole structs on the wire as is; - make the tag u8 instead of u64; - send all ints in network order; - drop proposer_uuid, we can pass it in START_WAL_PUSH and it wasn't much useful anyway. Per message changes, apart from mconf: - ProposerGreeting: tenant / timeline id is sent now as hex cstring. Remove proto version, it is passed outside in START_WAL_PUSH. Remove postgres timeline, it is unused. Reorder fields a bit. - AcceptorGreeting: reorder fields - VoteResponse: timeline_start_lsn is removed. It can be taken from first member of term history, and later we won't need it at all when all timelines will be explicitly created. Vote itself is u8 instead of u64. - ProposerElected: timeline_start_lsn is removed for the same reasons. - AppendRequest: epoch_start_lsn removed, it is known from term history in ProposerElected. Both compute and sk are able to talk v2 and v3 to make rollbacks (in case we need them) easier; neon.safekeeper_proto_version GUC sets the client version. v2 code can be dropped later. So far empty conf is passed everywhere, future PRs will handle them. To test, add param to some tests choosing proto version; we want to test both 2 and 3 until we fully migrate. ref #10326 --------- Co-authored-by: Arthur Petukhovsky <[email protected]>
neondatabase · Feb 25, 2025 · 758f597 · 758f597 · github-actions · Feb 25, 2025
1 parent 0d9a45a
commit 758f597
Show file tree

Hide file tree

Showing 17 changed files with 1,345 additions and 442 deletions.
diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs
@@ -68,26 +68,24 @@ impl Display for SafekeeperId {
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 #[serde(transparent)]
 pub struct MemberSet {
-    pub members: Vec<SafekeeperId>,
+    pub m: Vec<SafekeeperId>,
 }
 
 impl MemberSet {
     pub fn empty() -> Self {
-        MemberSet {
-            members: Vec::new(),
-        }
+        MemberSet { m: Vec::new() }
     }
 
     pub fn new(members: Vec<SafekeeperId>) -> anyhow::Result<Self> {
         let hs: HashSet<NodeId> = HashSet::from_iter(members.iter().map(|sk| sk.id));
         if hs.len() != members.len() {
             bail!("duplicate safekeeper id in the set {:?}", members);
         }
-        Ok(MemberSet { members })
+        Ok(MemberSet { m: members })
     }
 
     pub fn contains(&self, sk: &SafekeeperId) -> bool {
-        self.members.iter().any(|m| m.id == sk.id)
+        self.m.iter().any(|m| m.id == sk.id)
     }
 
     pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> {
@@ -97,19 +95,15 @@ impl MemberSet {
                 sk.id, self
             ));
         }
-        self.members.push(sk);
+        self.m.push(sk);
         Ok(())
     }
 }
 
 impl Display for MemberSet {
     /// Display as a comma separated list of members.
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let sks_str = self
-            .members
-            .iter()
-            .map(|m| m.to_string())
-            .collect::<Vec<_>>();
+        let sks_str = self.m.iter().map(|sk| sk.to_string()).collect::<Vec<_>>();
         write!(f, "({})", sks_str.join(", "))
     }
 }

diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs
@@ -215,6 +215,7 @@ impl Wrapper {
             syncSafekeepers: config.sync_safekeepers,
             systemId: 0,
             pgTimeline: 1,
+            proto_version: 3,
             callback_data,
         };
         let c_config = Box::into_raw(Box::new(c_config));
@@ -276,6 +277,7 @@ mod tests {
     use core::panic;
     use std::{
         cell::Cell,
+        ffi::CString,
         sync::{atomic::AtomicUsize, mpsc::sync_channel},
     };
 
@@ -496,65 +498,71 @@ mod tests {
         // Messages definitions are at walproposer.h
         // xxx: it would be better to extract them from safekeeper crate and
         // use serialization/deserialization here.
-        let greeting_tag = (b'g' as u64).to_ne_bytes();
-        let proto_version = 2_u32.to_ne_bytes();
-        let pg_version: [u8; 4] = PG_VERSION_NUM.to_ne_bytes();
-        let proposer_id = [0; 16];
-        let system_id = 0_u64.to_ne_bytes();
-        let tenant_id = ttid.tenant_id.as_arr();
-        let timeline_id = ttid.timeline_id.as_arr();
-        let pg_tli = 1_u32.to_ne_bytes();
-        let wal_seg_size = 16777216_u32.to_ne_bytes();
+        let greeting_tag = (b'g').to_be_bytes();
+        let tenant_id = CString::new(ttid.tenant_id.to_string())
+            .unwrap()
+            .into_bytes_with_nul();
+        let timeline_id = CString::new(ttid.timeline_id.to_string())
+            .unwrap()
+            .into_bytes_with_nul();
+        let mconf_gen = 0_u32.to_be_bytes();
+        let mconf_members_len = 0_u32.to_be_bytes();
+        let mconf_members_new_len = 0_u32.to_be_bytes();
+        let pg_version: [u8; 4] = PG_VERSION_NUM.to_be_bytes();
+        let system_id = 0_u64.to_be_bytes();
+        let wal_seg_size = 16777216_u32.to_be_bytes();
+
         let proposer_greeting = [
             greeting_tag.as_slice(),
-            proto_version.as_slice(),
-            pg_version.as_slice(),
-            proposer_id.as_slice(),
-            system_id.as_slice(),
             tenant_id.as_slice(),
             timeline_id.as_slice(),
-            pg_tli.as_slice(),
+            mconf_gen.as_slice(),
+            mconf_members_len.as_slice(),
+            mconf_members_new_len.as_slice(),
+            pg_version.as_slice(),
+            system_id.as_slice(),
             wal_seg_size.as_slice(),
         ]
         .concat();
 
-        let voting_tag = (b'v' as u64).to_ne_bytes();
-        let vote_request_term = 3_u64.to_ne_bytes();
-        let proposer_id = [0; 16];
+        let voting_tag = (b'v').to_be_bytes();
+        let vote_request_term = 3_u64.to_be_bytes();
         let vote_request = [
             voting_tag.as_slice(),
+            mconf_gen.as_slice(),
             vote_request_term.as_slice(),
-            proposer_id.as_slice(),
         ]
         .concat();
 
-        let acceptor_greeting_term = 2_u64.to_ne_bytes();
-        let acceptor_greeting_node_id = 1_u64.to_ne_bytes();
+        let acceptor_greeting_term = 2_u64.to_be_bytes();
+        let acceptor_greeting_node_id = 1_u64.to_be_bytes();
         let acceptor_greeting = [
             greeting_tag.as_slice(),
-            acceptor_greeting_term.as_slice(),
             acceptor_greeting_node_id.as_slice(),
+            mconf_gen.as_slice(),
+            mconf_members_len.as_slice(),
+            mconf_members_new_len.as_slice(),
+            acceptor_greeting_term.as_slice(),
         ]
         .concat();
 
-        let vote_response_term = 3_u64.to_ne_bytes();
-        let vote_given = 1_u64.to_ne_bytes();
-        let flush_lsn = 0x539_u64.to_ne_bytes();
-        let truncate_lsn = 0x539_u64.to_ne_bytes();
-        let th_len = 1_u32.to_ne_bytes();
-        let th_term = 2_u64.to_ne_bytes();
-        let th_lsn = 0x539_u64.to_ne_bytes();
-        let timeline_start_lsn = 0x539_u64.to_ne_bytes();
+        let vote_response_term = 3_u64.to_be_bytes();
+        let vote_given = 1_u8.to_be_bytes();
+        let flush_lsn = 0x539_u64.to_be_bytes();
+        let truncate_lsn = 0x539_u64.to_be_bytes();
+        let th_len = 1_u32.to_be_bytes();
+        let th_term = 2_u64.to_be_bytes();
+        let th_lsn = 0x539_u64.to_be_bytes();
         let vote_response = [
             voting_tag.as_slice(),
+            mconf_gen.as_slice(),
             vote_response_term.as_slice(),
             vote_given.as_slice(),
             flush_lsn.as_slice(),
             truncate_lsn.as_slice(),
             th_len.as_slice(),
             th_term.as_slice(),
             th_lsn.as_slice(),
-            timeline_start_lsn.as_slice(),
         ]
         .concat();
 

diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c
@@ -51,6 +51,26 @@ HexDecodeString(uint8 *result, char *input, int nbytes)
 	return true;
 }
 
+/* --------------------------------
+ *		pq_getmsgint16	- get a binary 2-byte int from a message buffer
+ * --------------------------------
+ */
+uint16
+pq_getmsgint16(StringInfo msg)
+{
+	return pq_getmsgint(msg, 2);
+}
+
+/* --------------------------------
+ *		pq_getmsgint32	- get a binary 4-byte int from a message buffer
+ * --------------------------------
+ */
+uint32
+pq_getmsgint32(StringInfo msg)
+{
+	return pq_getmsgint(msg, 4);
+}
+
 /* --------------------------------
  *		pq_getmsgint32_le	- get a binary 4-byte int from a message buffer in native (LE) order
  * --------------------------------

diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h
@@ -8,6 +8,8 @@
 #endif
 
 bool		HexDecodeString(uint8 *result, char *input, int nbytes);
+uint16      pq_getmsgint16(StringInfo msg);
+uint32      pq_getmsgint32(StringInfo msg);
 uint32		pq_getmsgint32_le(StringInfo msg);
 uint64		pq_getmsgint64_le(StringInfo msg);
 void		pq_sendint32_le(StringInfo buf, uint32 i);