From 4e46bd80645b039f9c5f39134d262b7bd9f9b7f4 Mon Sep 17 00:00:00 2001 From: Vincent Liu Date: Wed, 19 Jun 2024 22:14:53 +0100 Subject: [PATCH] Send alerts only for previously (non)-live hosts Add logic to send alerts based on the previous liveness of the hosts. This avoids the problem where a previously dead hosts could trigger a duplicate alert to be sent. Also rename local variables to better reflect the meaning. Signed-off-by: Vincent Liu --- ocaml/xapi/xapi_cluster_helpers.ml | 8 ++-- ocaml/xapi/xapi_cluster_host.ml | 12 ++--- ocaml/xapi/xapi_clustering.ml | 73 +++++++++++++++++------------- 3 files changed, 51 insertions(+), 42 deletions(-) diff --git a/ocaml/xapi/xapi_cluster_helpers.ml b/ocaml/xapi/xapi_cluster_helpers.ml index 31a655a3e72..f7ea78eab9d 100644 --- a/ocaml/xapi/xapi_cluster_helpers.ml +++ b/ocaml/xapi/xapi_cluster_helpers.ml @@ -114,7 +114,7 @@ let corosync3_enabled ~__context = let restrictions = Db.Pool.get_restrictions ~__context ~self:pool in List.assoc_opt "restrict_corosync3" restrictions = Some "false" -let maybe_generate_alert ~__context ~num_hosts ~missing_hosts ~new_hosts ~quorum +let maybe_generate_alert ~__context ~num_hosts ~hosts_left ~hosts_joined ~quorum = let generate_alert join cluster_host = let host = Db.Cluster_host.get_host ~__context ~self:cluster_host in @@ -148,10 +148,10 @@ let maybe_generate_alert ~__context ~num_hosts ~missing_hosts ~new_hosts ~quorum ) in if cluster_health_enabled ~__context then ( - List.iter (generate_alert false) missing_hosts ; - List.iter (generate_alert true) new_hosts ; + List.iter (generate_alert false) hosts_left ; + List.iter (generate_alert true) hosts_joined ; (* only generate this alert when the number of hosts is decreasing *) - if missing_hosts <> [] && num_hosts <= quorum then + if hosts_left <> [] && num_hosts <= quorum then let pool = Helpers.get_pool ~__context in let pool_uuid = Db.Pool.get_uuid ~__context ~self:pool in let name, priority = Api_messages.cluster_quorum_approaching_lost in diff --git a/ocaml/xapi/xapi_cluster_host.ml b/ocaml/xapi/xapi_cluster_host.ml index 78c943e18a5..e3038781d66 100644 --- a/ocaml/xapi/xapi_cluster_host.ml +++ b/ocaml/xapi/xapi_cluster_host.ml @@ -55,19 +55,19 @@ let call_api_function_with_alert ~__context ~msg ~cls ~obj_uuid ~body raise err ) -let alert_for_cluster_host ~__context ~cluster_host ~missing_hosts ~new_hosts = +let alert_for_cluster_host ~__context ~cluster_host ~hosts_left ~hosts_joined = let num_hosts = Db.Cluster_host.get_all ~__context |> List.length in let cluster = Db.Cluster_host.get_cluster ~__context ~self:cluster_host in let quorum = Db.Cluster.get_quorum ~__context ~self:cluster |> Int64.to_int in - maybe_generate_alert ~__context ~missing_hosts ~new_hosts ~num_hosts ~quorum + maybe_generate_alert ~__context ~hosts_left ~hosts_joined ~num_hosts ~quorum let alert_for_cluster_host_leave ~__context ~cluster_host = - alert_for_cluster_host ~__context ~cluster_host ~missing_hosts:[cluster_host] - ~new_hosts:[] + alert_for_cluster_host ~__context ~cluster_host ~hosts_left:[cluster_host] + ~hosts_joined:[] let alert_for_cluster_host_join ~__context ~cluster_host = - alert_for_cluster_host ~__context ~cluster_host ~missing_hosts:[] - ~new_hosts:[cluster_host] + alert_for_cluster_host ~__context ~cluster_host ~hosts_left:[] + ~hosts_joined:[cluster_host] (* Create xapi db object for cluster_host, resync_host calls clusterd *) let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host = diff --git a/ocaml/xapi/xapi_clustering.ml b/ocaml/xapi/xapi_clustering.ml index 85fe7357d85..7210d537592 100644 --- a/ocaml/xapi/xapi_clustering.ml +++ b/ocaml/xapi/xapi_clustering.ml @@ -441,6 +441,13 @@ module Watcher = struct Db.Cluster.set_is_quorate ~__context ~self:cluster ~value:diag.is_quorate ; let all_cluster_hosts = Db.Cluster_host.get_all ~__context in + let live_hosts = + Db.Cluster_host.get_refs_where ~__context + ~expr:(Eq (Field "live", Literal "true")) + in + let dead_hosts = + List.filter (fun h -> not (List.mem h live_hosts)) all_cluster_hosts + in let ip_ch = List.map (fun ch -> @@ -464,6 +471,7 @@ module Watcher = struct ) all_cluster_hosts | Some nodel -> + (* nodel contains the current members of the cluster, according to corosync *) let quorum_hosts = List.filter_map (fun {addr; _} -> @@ -480,15 +488,22 @@ module Watcher = struct ) nodel in - let missing_hosts = - List.filter - (fun h -> not (List.mem h quorum_hosts)) - all_cluster_hosts + + (* we handle unclean hosts join and leave below, i.e. hosts joining and leaving + due to network problems, power cut, etc. Join and leave initiated by the + API will be handled in the API call themselves *) + + (* hosts_left contains the hosts that were live, but not in the list + of live hosts according to corosync *) + let hosts_left = + List.filter (fun h -> not (List.mem h quorum_hosts)) live_hosts in - let new_hosts = + (* hosts_joined contains the hosts that were not live, but now in the + list of live hosts according to corosync *) + let hosts_joined = List.filter (fun h -> not (Db.Cluster_host.get_live ~__context ~self:h)) - quorum_hosts + dead_hosts in List.iter (fun self -> @@ -496,15 +511,15 @@ module Watcher = struct Db.Cluster_host.set_last_update_live ~__context ~self ~value:current_time ) - new_hosts ; + hosts_joined ; List.iter (fun self -> Db.Cluster_host.set_live ~__context ~self ~value:false ; Db.Cluster_host.set_last_update_live ~__context ~self ~value:current_time ) - missing_hosts ; - maybe_generate_alert ~__context ~missing_hosts ~new_hosts + hosts_left ; + maybe_generate_alert ~__context ~hosts_left ~hosts_joined ~num_hosts:(List.length quorum_hosts) ~quorum:diag.quorum ) ; Db.Cluster.set_quorum ~__context ~self:cluster @@ -520,17 +535,15 @@ module Watcher = struct performing update" __FUNCTION__ (Printexc.to_string exn) - let cluster_change_watcher : Thread.t option ref = ref None + let cluster_change_watcher : bool Atomic.t = Atomic.make false - let mu = Mutex.create () - - open Xapi_stdext_threads.Threadext + let cluster_change_interval = 3. let watch_cluster_change ~__context ~host = while !Daemon.enabled do let m = Cluster_client.LocalClient.UPDATES.get (rpc ~__context) - "call cluster watcher" 3. + "call cluster watcher" cluster_change_interval in match Idl.IdM.run @@ Cluster_client.IDL.T.get m with | Ok updates -> ( @@ -550,9 +563,9 @@ module Watcher = struct | exception exn -> warn "%s: Got exception %s while query cluster host updates, retrying" __FUNCTION__ (Printexc.to_string exn) ; - Thread.delay 3. + Thread.delay cluster_change_interval done ; - Mutex.execute mu (fun () -> cluster_change_watcher := None) + Atomic.set cluster_change_watcher false (** [create_as_necessary] will create cluster watchers on the coordinator if they are not already created. @@ -560,20 +573,16 @@ module Watcher = struct these threads will exit as well. *) let create_as_necessary ~__context ~host = if Helpers.is_pool_master ~__context ~host then - if Xapi_cluster_helpers.cluster_health_enabled ~__context then ( - debug "%s: create watcher for corosync-notifyd on coordinator" - __FUNCTION__ ; - - Mutex.execute mu (fun () -> - match !cluster_change_watcher with - | None -> - cluster_change_watcher := - Thread.create - (fun () -> watch_cluster_change ~__context ~host) - () - |> Option.some - | Some _ -> - () - ) - ) + if Xapi_cluster_helpers.cluster_health_enabled ~__context then + if Atomic.compare_and_set cluster_change_watcher false true then ( + debug "%s: create watcher for corosync-notifyd on coordinator" + __FUNCTION__ ; + ignore + @@ Thread.create (fun () -> watch_cluster_change ~__context ~host) () + ) else + (* someone else must have gone into the if branch above and created the thread + before us, leave it to them *) + debug + "%s: not create watcher for corosync-notifyd as it already exists" + __FUNCTION__ end