From f02c9e7cfe3e41feef5cfc3dbef77184d00696f8 Mon Sep 17 00:00:00 2001 From: Vincent Liu Date: Mon, 17 Jun 2024 17:10:49 +0100 Subject: [PATCH] CA-394109: Reduce number of alerts Currently there are two ways of raising an alert when a cluster host joins/leaves the cluster: 1. through the api call such as cluster-host-leave; 2. through the cluster watcher which checks for updates from `corosync-notifyd`. This will cause all alerts to be raised twice. This patch favours the second approach as it accounts for "unclean" leave of a cluster host. Moreover, a "clean" leave triggered by an API will cause a change in `corosync-notifyd` which will get detected by the watcher anyway. This solves the double alerting problem. Signed-off-by: Vincent Liu --- ocaml/xapi/xapi_cluster_host.ml | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/ocaml/xapi/xapi_cluster_host.ml b/ocaml/xapi/xapi_cluster_host.ml index de14b266a96..11730e32e72 100644 --- a/ocaml/xapi/xapi_cluster_host.ml +++ b/ocaml/xapi/xapi_cluster_host.ml @@ -13,7 +13,6 @@ *) open Xapi_clustering -open Xapi_cluster_helpers open Ipaddr_rpc_type module D = Debug.Make (struct let name = "xapi_cluster_host" end) @@ -55,19 +54,6 @@ let call_api_function_with_alert ~__context ~msg ~cls ~obj_uuid ~body raise err ) -let alert_for_cluster_host ~__context ~cluster_host ~missing_hosts ~new_hosts = - let num_hosts = Db.Cluster_host.get_all ~__context |> List.length in - let cluster = Db.Cluster_host.get_cluster ~__context ~self:cluster_host in - let quorum = Db.Cluster.get_quorum ~__context ~self:cluster |> Int64.to_int in - maybe_generate_alert ~__context ~missing_hosts ~new_hosts ~num_hosts ~quorum - -let alert_for_cluster_host_leave ~__context ~cluster_host = - alert_for_cluster_host ~__context ~cluster_host ~missing_hosts:[cluster_host] - ~new_hosts:[] - -let alert_for_cluster_host_join ~__context ~cluster_host = - alert_for_cluster_host ~__context ~cluster_host ~missing_hosts:[] - ~new_hosts:[cluster_host] (* Create xapi db object for cluster_host, resync_host calls clusterd *) let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host = @@ -81,7 +67,6 @@ let create_internal ~__context ~cluster ~host ~pIF : API.ref_Cluster_host = ~enabled:false ~current_operations:[] ~allowed_operations:[] ~other_config:[] ~joined:false ~live:false ~last_update_live:API.Date.epoch ; - alert_for_cluster_host_join ~__context ~cluster_host:ref ; ref ) @@ -274,14 +259,12 @@ let destroy_op ~__context ~self ~force = let result = local_fn (rpc ~__context) dbg in match Idl.IdM.run @@ Cluster_client.IDL.T.get result with | Ok () -> - alert_for_cluster_host_leave ~__context ~cluster_host:self ; Db.Cluster_host.destroy ~__context ~self ; debug "Cluster_host.%s was successful" fn_str ; Xapi_clustering.Daemon.disable ~__context | Error error -> warn "Error occurred during Cluster_host.%s" fn_str ; if force then ( - alert_for_cluster_host_leave ~__context ~cluster_host:self ; let ref_str = Ref.string_of self in Db.Cluster_host.destroy ~__context ~self ; debug "Cluster_host %s force destroyed." ref_str @@ -329,7 +312,6 @@ let forget ~__context ~self = Db.Cluster.set_pending_forget ~__context ~self:cluster ~value:[] ; (* must not disable the daemon here, because we declared another unreachable node dead, * not the current one *) - alert_for_cluster_host_leave ~__context ~cluster_host:self ; Db.Cluster_host.destroy ~__context ~self ; debug "Cluster_host.forget was successful" | Error error ->