Skip to content

Commit

Permalink
Send alerts only for previously (non)-live hosts
Browse files Browse the repository at this point in the history
Add logic to send alerts based on the previous liveness of the hosts.
This avoids the problem where a previously dead host could trigger a
duplicate alert to be sent.

Also rename local variables to better reflect the meaning.

Signed-off-by: Vincent Liu <[email protected]>
  • Loading branch information
Vincent-lau committed Jun 19, 2024
1 parent cb4329c commit 4e46bd8
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 42 deletions.
8 changes: 4 additions & 4 deletions ocaml/xapi/xapi_cluster_helpers.ml
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ let corosync3_enabled ~__context =
let restrictions = Db.Pool.get_restrictions ~__context ~self:pool in
List.assoc_opt "restrict_corosync3" restrictions = Some "false"

let maybe_generate_alert ~__context ~num_hosts ~missing_hosts ~new_hosts ~quorum
let maybe_generate_alert ~__context ~num_hosts ~hosts_left ~hosts_joined ~quorum
=
let generate_alert join cluster_host =
let host = Db.Cluster_host.get_host ~__context ~self:cluster_host in
Expand Down Expand Up @@ -148,10 +148,10 @@ let maybe_generate_alert ~__context ~num_hosts ~missing_hosts ~new_hosts ~quorum
)
in
if cluster_health_enabled ~__context then (
List.iter (generate_alert false) missing_hosts ;
List.iter (generate_alert true) new_hosts ;
List.iter (generate_alert false) hosts_left ;
List.iter (generate_alert true) hosts_joined ;
(* only generate this alert when the number of hosts is decreasing *)
if missing_hosts <> [] && num_hosts <= quorum then
if hosts_left <> [] && num_hosts <= quorum then
let pool = Helpers.get_pool ~__context in
let pool_uuid = Db.Pool.get_uuid ~__context ~self:pool in
let name, priority = Api_messages.cluster_quorum_approaching_lost in
Expand Down
12 changes: 6 additions & 6 deletions ocaml/xapi/xapi_cluster_host.ml
Original file line number Diff line number Diff line change
Expand Up @@ -55,19 +55,19 @@ let call_api_function_with_alert ~__context ~msg ~cls ~obj_uuid ~body
raise err
)

(** [alert_for_cluster_host ~__context ~cluster_host ~hosts_left ~hosts_joined]
    generates cluster-health alerts for a cluster membership change.
    [hosts_left] are the cluster hosts that have left the cluster and
    [hosts_joined] are the cluster hosts that have joined it. The current
    number of cluster hosts and the cluster's quorum value are read from the
    database and forwarded to [maybe_generate_alert]. *)
let alert_for_cluster_host ~__context ~cluster_host ~hosts_left ~hosts_joined =
  let num_hosts = Db.Cluster_host.get_all ~__context |> List.length in
  let cluster = Db.Cluster_host.get_cluster ~__context ~self:cluster_host in
  let quorum = Db.Cluster.get_quorum ~__context ~self:cluster |> Int64.to_int in
  maybe_generate_alert ~__context ~hosts_left ~hosts_joined ~num_hosts ~quorum

(** Generate an alert for [cluster_host] having left the cluster: the host is
    passed as the sole member of [hosts_left] and no host has joined. *)
let alert_for_cluster_host_leave ~__context ~cluster_host =
  alert_for_cluster_host ~__context ~cluster_host ~hosts_left:[cluster_host]
    ~hosts_joined:[]

(** Generate an alert for [cluster_host] having joined the cluster: the host
    is passed as the sole member of [hosts_joined] and no host has left. *)
let alert_for_cluster_host_join ~__context ~cluster_host =
  alert_for_cluster_host ~__context ~cluster_host ~hosts_left:[]
    ~hosts_joined:[cluster_host]

(* Create xapi db object for cluster_host, resync_host calls clusterd *)
let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host =
Expand Down
73 changes: 41 additions & 32 deletions ocaml/xapi/xapi_clustering.ml
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,13 @@ module Watcher = struct
Db.Cluster.set_is_quorate ~__context ~self:cluster
~value:diag.is_quorate ;
let all_cluster_hosts = Db.Cluster_host.get_all ~__context in
let live_hosts =
Db.Cluster_host.get_refs_where ~__context
~expr:(Eq (Field "live", Literal "true"))
in
let dead_hosts =
List.filter (fun h -> not (List.mem h live_hosts)) all_cluster_hosts
in
let ip_ch =
List.map
(fun ch ->
Expand All @@ -464,6 +471,7 @@ module Watcher = struct
)
all_cluster_hosts
| Some nodel ->
(* nodel contains the current members of the cluster, according to corosync *)
let quorum_hosts =
List.filter_map
(fun {addr; _} ->
Expand All @@ -480,31 +488,38 @@ module Watcher = struct
)
nodel
in
let missing_hosts =
List.filter
(fun h -> not (List.mem h quorum_hosts))
all_cluster_hosts

(* we handle unclean host joins and leaves below, i.e. hosts joining and leaving
due to network problems, power cuts, etc. Joins and leaves initiated through the
API are handled in the API calls themselves *)

(* hosts_left contains the hosts that were live, but not in the list
of live hosts according to corosync *)
let hosts_left =
List.filter (fun h -> not (List.mem h quorum_hosts)) live_hosts
in
let new_hosts =
(* hosts_joined contains the hosts that were not live, but now in the
list of live hosts according to corosync *)
let hosts_joined =
List.filter
(fun h -> not (Db.Cluster_host.get_live ~__context ~self:h))
quorum_hosts
dead_hosts
in
List.iter
(fun self ->
Db.Cluster_host.set_live ~__context ~self ~value:true ;
Db.Cluster_host.set_last_update_live ~__context ~self
~value:current_time
)
new_hosts ;
hosts_joined ;
List.iter
(fun self ->
Db.Cluster_host.set_live ~__context ~self ~value:false ;
Db.Cluster_host.set_last_update_live ~__context ~self
~value:current_time
)
missing_hosts ;
maybe_generate_alert ~__context ~missing_hosts ~new_hosts
hosts_left ;
maybe_generate_alert ~__context ~hosts_left ~hosts_joined
~num_hosts:(List.length quorum_hosts) ~quorum:diag.quorum
) ;
Db.Cluster.set_quorum ~__context ~self:cluster
Expand All @@ -520,17 +535,15 @@ module Watcher = struct
performing update"
__FUNCTION__ (Printexc.to_string exn)

(* True while the cluster-change watcher thread is running. Using an atomic
   boolean (rather than a mutex-guarded [Thread.t option ref]) lets
   [create_as_necessary] claim thread creation with a single
   compare-and-set, so at most one watcher thread exists at a time. *)
let cluster_change_watcher : bool Atomic.t = Atomic.make false

(* Interval, in seconds, between polls of the clustering daemon for updates;
   also used as the retry delay after a failed poll. *)
let cluster_change_interval = 3.

(* NOTE(review): this span is a unified-diff fragment — it contains both the
   pre-change lines (literal 3. timeouts, mutex-guarded option ref) and the
   post-change lines (named [cluster_change_interval], [Atomic] flag), and
   its interior is elided at the "Expand All" marker below. Annotations only;
   code left byte-identical. *)
(* Poll the clustering daemon for cluster-host updates in a loop until the
   clustering daemon is disabled, then clear the watcher flag so a future
   [create_as_necessary] can start a new watcher. *)
let watch_cluster_change ~__context ~host =
while !Daemon.enabled do
let m =
Cluster_client.LocalClient.UPDATES.get (rpc ~__context)
(* pre-change line: hard-coded timeout *)
"call cluster watcher" 3.
(* post-change line: named constant for the same timeout *)
"call cluster watcher" cluster_change_interval
in
match Idl.IdM.run @@ Cluster_client.IDL.T.get m with
| Ok updates -> (
Expand All @@ -550,30 +563,26 @@ module Watcher = struct
| exception exn ->
warn "%s: Got exception %s while query cluster host updates, retrying"
__FUNCTION__ (Printexc.to_string exn) ;
(* pre-change line: *)
Thread.delay 3.
(* post-change line: *)
Thread.delay cluster_change_interval
done ;
(* pre-change line: reset the mutex-guarded thread slot *)
Mutex.execute mu (fun () -> cluster_change_watcher := None)
(* post-change line: mark the watcher as no longer running *)
Atomic.set cluster_change_watcher false

(** [create_as_necessary ~__context ~host] creates the cluster watcher thread
    on the pool coordinator if it is not already created.
    There is no need to destroy it: once the clustering daemon is disabled,
    the thread exits by itself (see [watch_cluster_change]). *)
let create_as_necessary ~__context ~host =
  if Helpers.is_pool_master ~__context ~host then
    if Xapi_cluster_helpers.cluster_health_enabled ~__context then
      (* the compare-and-set guarantees that exactly one caller observes the
         [false -> true] transition, so at most one watcher thread is spawned *)
      if Atomic.compare_and_set cluster_change_watcher false true then (
        debug "%s: create watcher for corosync-notifyd on coordinator"
          __FUNCTION__ ;
        ignore
        @@ Thread.create (fun () -> watch_cluster_change ~__context ~host) ()
      ) else
        (* someone else must have won the compare-and-set race and created the
           thread before us, leave it to them *)
        debug
          "%s: not creating watcher for corosync-notifyd as it already exists"
          __FUNCTION__
end

0 comments on commit 4e46bd8

Please sign in to comment.