From c94e1848244b707a320290650e13715264892430 Mon Sep 17 00:00:00 2001 From: "Gao,Yan" Date: Tue, 18 Mar 2025 16:57:25 +0100 Subject: [PATCH 1/8] Fix: pacemaker-attrd: prevent segfault if a peer leaves when its name is unknown yet If nodes are not configured with names in corosync, when a node joins the cluster, it takes a while for it to learn the names of all the other online peers. Previously, if any of the peers happened to leave when its `name` was unknown to us yet, pacemaker-attrd would trigger assertion in `attrd_peer_remove()` and segfault in `attrd_remove_peer_protocol_ver()`. --- daemons/attrd/attrd_corosync.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c index e97e09cb865..e3b0ec22367 100644 --- a/daemons/attrd/attrd_corosync.c +++ b/daemons/attrd/attrd_corosync.c @@ -195,14 +195,16 @@ attrd_peer_change_cb(enum pcmk__node_update kind, pcmk__node_status_t *peer, } } else { // Remove all attribute values associated with lost nodes - attrd_peer_remove(peer->name, false, "loss"); + if (peer->name != NULL) { + attrd_peer_remove(peer->name, false, "loss"); + } gone = true; } break; } // Remove votes from cluster nodes that leave, in case election in progress - if (gone && !is_remote) { + if (gone && !is_remote && peer->name != NULL) { attrd_remove_voter(peer); attrd_remove_peer_protocol_ver(peer->name); attrd_do_not_expect_from_peer(peer->name); From 522a04a0b1b3f622ccb239828fb974d80dc5e873 Mon Sep 17 00:00:00 2001 From: "Gao,Yan" Date: Tue, 18 Mar 2025 18:25:02 +0100 Subject: [PATCH 2/8] Log: pacemaker-attrd: log the cluster layer id of the changed peer So that the messages are informative when the peer's name is unknown yet. --- daemons/attrd/attrd_corosync.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c index e3b0ec22367..423e0c66b0f 100644 --- a/daemons/attrd/attrd_corosync.c +++ b/daemons/attrd/attrd_corosync.c @@ -168,23 +168,26 @@ attrd_peer_change_cb(enum pcmk__node_update kind, pcmk__node_status_t *peer, switch (kind) { case pcmk__node_update_name: - crm_debug("%s node %s is now %s", + crm_debug("%s node %s[%" PRIu32 "] is now %s", (is_remote? "Remote" : "Cluster"), - peer->name, state_text(peer->state)); + pcmk__s(peer->name, "unknown"), peer->cluster_layer_id, + state_text(peer->state)); break; case pcmk__node_update_processes: if (!pcmk_is_set(peer->processes, crm_get_cluster_proc())) { gone = true; } - crm_debug("Node %s is %s a peer", - peer->name, (gone? "no longer" : "now")); + crm_debug("Node %s[%" PRIu32 "] is %s a peer", + pcmk__s(peer->name, "unknown"), peer->cluster_layer_id, + (gone? "no longer" : "now")); break; case pcmk__node_update_state: - crm_debug("%s node %s is now %s (was %s)", + crm_debug("%s node %s[%" PRIu32 "] is now %s (was %s)", (is_remote? "Remote" : "Cluster"), - peer->name, state_text(peer->state), state_text(data)); + pcmk__s(peer->name, "unknown"), peer->cluster_layer_id, + state_text(peer->state), state_text(data)); if (pcmk__str_eq(peer->state, PCMK_VALUE_MEMBER, pcmk__str_none)) { /* If we're the writer, send new peers a list of all attributes * (unless it's a remote node, which doesn't run its own attrd) From bc7726e7e127ed548bbe6800b8050930e68e991e Mon Sep 17 00:00:00 2001 From: "Gao,Yan" Date: Wed, 19 Mar 2025 01:04:16 +0100 Subject: [PATCH 3/8] Log: libcrmcluster: correctly log node id --- lib/cluster/cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/cluster/cluster.c b/lib/cluster/cluster.c index dda4b8e89ae..14b2bc0c961 100644 --- a/lib/cluster/cluster.c +++ b/lib/cluster/cluster.c @@ -271,7 +271,7 @@ pcmk__cluster_node_name(uint32_t nodeid) } crm_notice("Could not obtain a node name for node with " - PCMK_XA_ID "=" PRIu32, + PCMK_XA_ID "=%" PRIu32, nodeid); return NULL; } From 5f643a492be4c0088ddead2fc4d0c4765e463b49 Mon Sep 17 00:00:00 2001 From: "Gao,Yan" Date: Wed, 19 Mar 2025 01:15:45 +0100 Subject: [PATCH 4/8] Log: pacemaker-attrd: use %PRIu32 format specifier instead of %u for node id --- daemons/attrd/attrd_corosync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c index 423e0c66b0f..4c58c14d22d 100644 --- a/daemons/attrd/attrd_corosync.c +++ b/daemons/attrd/attrd_corosync.c @@ -114,7 +114,7 @@ attrd_cpg_dispatch(cpg_handle_t handle, xml = pcmk__xml_parse(data); if (xml == NULL) { - crm_err("Bad message received from %s[%u]: '%.120s'", + crm_err("Bad message received from %s[%" PRIu32 "]: '%.120s'", from, nodeid, data); } else { attrd_peer_message(pcmk__get_node(nodeid, from, NULL, From 666a4113fa71d2f7501c28f4e5e76ba10c051e45 Mon Sep 17 00:00:00 2001 From: "Gao,Yan" Date: Thu, 20 Mar 2025 16:08:26 +0100 Subject: [PATCH 5/8] Fix: pacemaker-attrd: remember names of peers from attribute update in case unknown ... in the membership cache, so that we also learn names of non-writers as early as possible through processing `sync-response` from the writer. Regression introduced by 85d7a70916 (not yet released). --- daemons/attrd/attrd_corosync.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c index 4c58c14d22d..370f96e5481 100644 --- a/daemons/attrd/attrd_corosync.c +++ b/daemons/attrd/attrd_corosync.c @@ -325,8 +325,13 @@ update_attr_on_host(attribute_t *a, const pcmk__node_status_t *peer, // Remember node's XML ID if we're just learning it if ((node_xml_id != NULL) && !pcmk__str_eq(node_xml_id, prev_xml_id, pcmk__str_none)) { + // Remember node's name in case unknown in the membership cache + pcmk__node_status_t *known_peer = + pcmk__get_node(0, host, node_xml_id, + pcmk__node_search_cluster_member); + crm_trace("Learned %s[%s] node XML ID is %s (was %s)", - a->id, v->nodename, node_xml_id, + a->id, known_peer->name, node_xml_id, pcmk__s(prev_xml_id, "unknown")); attrd_set_node_xml_id(v->nodename, node_xml_id); if (attrd_election_won()) { From 98338298133c81b9cc64c55c98481e232b0643cb Mon Sep 17 00:00:00 2001 From: "Gao,Yan" Date: Fri, 21 Mar 2025 14:15:39 +0100 Subject: [PATCH 6/8] Refactor: pacemaker-attrd: make attrd_send_message()'s node argument const --- daemons/attrd/attrd_messages.c | 2 +- daemons/attrd/pacemaker-attrd.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/daemons/attrd/attrd_messages.c b/daemons/attrd/attrd_messages.c index e1038a820b5..8411ce5ef1a 100644 --- a/daemons/attrd/attrd_messages.c +++ b/daemons/attrd/attrd_messages.c @@ -328,7 +328,7 @@ attrd_broadcast_protocol(void) } gboolean -attrd_send_message(pcmk__node_status_t *node, xmlNode *data, bool confirm) +attrd_send_message(const pcmk__node_status_t *node, xmlNode *data, bool confirm) { const char *op = crm_element_value(data, PCMK_XA_TASK); diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h index cc0dcf29ee4..f818e3b0a84 100644 --- a/daemons/attrd/pacemaker-attrd.h +++ b/daemons/attrd/pacemaker-attrd.h @@ -200,7 +200,7 @@ xmlNode *attrd_client_clear_failure(pcmk__request_t *request); xmlNode *attrd_client_update(pcmk__request_t *request); xmlNode *attrd_client_refresh(pcmk__request_t *request); xmlNode *attrd_client_query(pcmk__request_t *request); -gboolean attrd_send_message(pcmk__node_status_t *node, xmlNode *data, +gboolean attrd_send_message(const pcmk__node_status_t *node, xmlNode *data, bool confirm); xmlNode *attrd_add_value_xml(xmlNode *parent, const attribute_t *a, From 63198fda240afa2517970c36681bfd143a1e6146 Mon Sep 17 00:00:00 2001 From: "Gao,Yan" Date: Fri, 21 Mar 2025 14:18:15 +0100 Subject: [PATCH 7/8] Refactor: pacemaker-attrd: ability to send a protocol message to a single peer Rename attrd_broadcast_protocol() to attrd_send_protocol(). --- daemons/attrd/attrd_messages.c | 24 +++++++++++++++++++----- daemons/attrd/pacemaker-attrd.c | 2 +- daemons/attrd/pacemaker-attrd.h | 2 +- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/daemons/attrd/attrd_messages.c b/daemons/attrd/attrd_messages.c index 8411ce5ef1a..5afc3c54c1e 100644 --- a/daemons/attrd/attrd_messages.c +++ b/daemons/attrd/attrd_messages.c @@ -302,10 +302,10 @@ attrd_handle_request(pcmk__request_t *request) /*! \internal - \brief Broadcast private attribute for local node with protocol version + \brief Send or broadcast private attribute for local node with protocol version */ void -attrd_broadcast_protocol(void) +attrd_send_protocol(const pcmk__node_status_t *peer) { xmlNode *attrd_op = pcmk__xe_create(NULL, __func__); @@ -319,10 +319,24 @@ attrd_broadcast_protocol(void) crm_xml_add(attrd_op, PCMK__XA_ATTR_HOST_ID, attrd_cluster->priv->node_xml_id); - crm_debug("Broadcasting attrd protocol version %s for node %s", - ATTRD_PROTOCOL_VERSION, attrd_cluster->priv->node_name); + if (peer == NULL) { + crm_debug("Broadcasting attrd protocol version %s for node %s[%" PRIu32 + "]", + ATTRD_PROTOCOL_VERSION, + pcmk__s(attrd_cluster->priv->node_name, "unknown"), + attrd_cluster->priv->node_id); - attrd_send_message(NULL, attrd_op, false); /* ends up at attrd_peer_message() */ + } else { + crm_debug("Sending attrd protocol version %s for node %s[%" PRIu32 + "] to node %s[%" PRIu32 "]", + ATTRD_PROTOCOL_VERSION, + pcmk__s(attrd_cluster->priv->node_name, "unknown"), + attrd_cluster->priv->node_id, + pcmk__s(peer->name, "unknown"), + peer->cluster_layer_id); + } + + attrd_send_message(peer, attrd_op, false); /* ends up at attrd_peer_message() */ pcmk__xml_free(attrd_op); } diff --git a/daemons/attrd/pacemaker-attrd.c b/daemons/attrd/pacemaker-attrd.c index 3c31bcd932e..ee479c6447a 100644 --- a/daemons/attrd/pacemaker-attrd.c +++ b/daemons/attrd/pacemaker-attrd.c @@ -184,7 +184,7 @@ main(int argc, char **argv) * across all nodes. It also ensures that the writer learns our node name, * so it can send our attributes to the CIB. */ - attrd_broadcast_protocol(); + attrd_send_protocol(NULL); attrd_init_ipc(); crm_notice("Pacemaker node attribute manager successfully started and accepting connections"); diff --git a/daemons/attrd/pacemaker-attrd.h b/daemons/attrd/pacemaker-attrd.h index f818e3b0a84..d9423c8915e 100644 --- a/daemons/attrd/pacemaker-attrd.h +++ b/daemons/attrd/pacemaker-attrd.h @@ -194,7 +194,7 @@ void attrd_peer_clear_failure(pcmk__request_t *request); void attrd_peer_sync_response(const pcmk__node_status_t *peer, bool peer_won, xmlNode *xml); -void attrd_broadcast_protocol(void); +void attrd_send_protocol(const pcmk__node_status_t *peer); xmlNode *attrd_client_peer_remove(pcmk__request_t *request); xmlNode *attrd_client_clear_failure(pcmk__request_t *request); xmlNode *attrd_client_update(pcmk__request_t *request); From a6718479a04f50107e1fa6dc5ba00012c06a6ef6 Mon Sep 17 00:00:00 2001 From: "Gao,Yan" Date: Fri, 21 Mar 2025 16:03:41 +0100 Subject: [PATCH 8/8] Fix: pacemaker-attrd: make a peer learn our node name once it has joined ... by sending it a protocol message. This is an additional assurance besides the way of learning non-writers' names through processing `sync-response` from the writer as of 666a4113fa. --- daemons/attrd/attrd_corosync.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/daemons/attrd/attrd_corosync.c b/daemons/attrd/attrd_corosync.c index 370f96e5481..e681fb24d43 100644 --- a/daemons/attrd/attrd_corosync.c +++ b/daemons/attrd/attrd_corosync.c @@ -192,10 +192,16 @@ attrd_peer_change_cb(enum pcmk__node_update kind, pcmk__node_status_t *peer, /* If we're the writer, send new peers a list of all attributes * (unless it's a remote node, which doesn't run its own attrd) */ - if (attrd_election_won() - && !pcmk_is_set(peer->flags, pcmk__node_status_remote)) { - attrd_peer_sync(peer); + if (!is_remote) { + if (attrd_election_won()) { + attrd_peer_sync(peer); + + } else { + // Anyway send a message so that the peer learns our name + attrd_send_protocol(peer); + } } + } else { // Remove all attribute values associated with lost nodes if (peer->name != NULL) {