Skip to content

Commit

Permalink
CA-394343: After clock jump the xapi assumed the host is HOST_OFFLINE
Browse files Browse the repository at this point in the history
Prior to this commit, the xapi on the coordinator host records the
'Unix.gettimeofday' as the timestamps of the heartbeat with other pool
supporter hosts. When the system clock is updated with a huge jump
forward, the timestamps would be far back into the past. This would
cause the xapi assumes that the hosts are offline as long time no
heartbeats.

In this commit, the timestamps are changed to get from a monotonic
clock. In this way, the system clock changes will not impact the
heartbeats' timestamps any more.

Additionally, Host_metrics.last_updated is only set when the object is
created. It's useless in check_host_liveness at all.

Signed-off-by: Ming Lu <[email protected]>
  • Loading branch information
minglumlu committed Jul 9, 2024
1 parent 5519cf9 commit 9291f21
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 38 deletions.
56 changes: 21 additions & 35 deletions ocaml/xapi/db_gc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ let use_host_heartbeat_for_liveness = ref true

let use_host_heartbeat_for_liveness_m = Mutex.create ()

let host_heartbeat_table : (API.ref_host, float) Hashtbl.t = Hashtbl.create 16
let host_heartbeat_table : (API.ref_host, Clock.Timer.t) Hashtbl.t =
Hashtbl.create 16

let host_skew_table : (API.ref_host, float) Hashtbl.t = Hashtbl.create 16

Expand Down Expand Up @@ -77,45 +78,24 @@ let detect_clock_skew ~__context host skew =
(* Master compares the database with the in-memory host heartbeat table and sets the live flag accordingly.
Called with the use_host_heartbeat_for_liveness_m and use_host_heartbeat_for_liveness is true (ie non-HA mode) *)
let check_host_liveness ~__context =
(* Check for rolling upgrade mode - if so, use host metrics for liveness else use hashtbl *)
let rum =
try Helpers.rolling_upgrade_in_progress ~__context with _ -> false
in
(* CA-16351: when performing the initial GC pass on first boot there won't be a localhost *)
let localhost = try Helpers.get_localhost ~__context with _ -> Ref.null in
(* Look for "true->false" transition on Host_metrics.live *)
let check_host host =
if host <> localhost then
try
let hmetric = Db.Host.get_metrics ~__context ~self:host in
let live = Db.Host_metrics.get_live ~__context ~self:hmetric in
(* See if the host is using the new HB mechanism, if so we'll use that *)
let new_heartbeat_time =
let timer =
with_lock host_table_m (fun () ->
Option.value
(Hashtbl.find_opt host_heartbeat_table host)
~default:Clock.Date.(epoch |> to_unix_time)
match Hashtbl.find_opt host_heartbeat_table host with
| Some x ->
x
| None ->
Clock.Timer.start
~duration:!Xapi_globs.host_assumed_dead_interval
)
in
let old_heartbeat_time =
if
rum
&& Xapi_version.platform_version ()
<> Helpers.version_string_of ~__context (Helpers.LocalObject host)
then (
debug
"Host %s considering using metrics last update time as heartbeat"
(Ref.string_of host) ;
Date.to_float
(Db.Host_metrics.get_last_updated ~__context ~self:hmetric)
) else
0.0
in
(* Use whichever value is the most recent to determine host liveness *)
let host_time = max old_heartbeat_time new_heartbeat_time in
let now = Unix.gettimeofday () in
(* we can now compare 'host_time' with 'now' *)
if now -. host_time < !Xapi_globs.host_assumed_dead_interval then
if not (Clock.Timer.has_expired timer) then
(* From the heartbeat PoV the host looks alive. We try to (i) minimise database sets; and (ii)
avoid toggling the host back to live if it has been marked as shutting_down. *)
with_lock Xapi_globs.hosts_which_are_shutting_down_m (fun () ->
Expand All @@ -131,10 +111,14 @@ let check_host_liveness ~__context =
)
)
else if live then (
let host_name_label = Db.Host.get_name_label ~__context ~self:host in
let host_uuid = Db.Host.get_uuid ~__context ~self:host in
let elapsed = Clock.Timer.elapsed timer in
debug
"Assuming host is offline since the heartbeat/metrics haven't been \
updated for %.2f seconds; setting live to false"
(now -. host_time) ;
"Assuming host '%s' (%s) is offline since the heartbeat hasn't \
been updated for %s seconds; setting live to false"
host_name_label host_uuid
(Clock.Timer.span_to_s elapsed |> string_of_float) ;
Db.Host_metrics.set_live ~__context ~self:hmetric ~value:false ;
Xapi_host_helpers.update_allowed_operations ~__context ~self:host
) ;
Expand Down Expand Up @@ -252,16 +236,18 @@ let tickle_heartbeat ~__context host stuff =
let reason = Xapi_hooks.reason__clean_shutdown in
if use_host_heartbeat_for_liveness then
Xapi_host_helpers.mark_host_as_dead ~__context ~host ~reason
) else
) else (
Hashtbl.replace host_heartbeat_table host
(Clock.Timer.start ~duration:!Xapi_globs.host_assumed_dead_interval) ;
let now = Unix.gettimeofday () in
Hashtbl.replace host_heartbeat_table host now ;
(* compute the clock skew for later analysis *)
if List.mem_assoc _time stuff then
try
let slave = float_of_string (List.assoc _time stuff) in
let skew = abs_float (now -. slave) in
Hashtbl.replace host_skew_table host skew
with _ -> ()
)
) ;
[]

Expand Down
6 changes: 4 additions & 2 deletions ocaml/xapi/xapi_globs.ml
Original file line number Diff line number Diff line change
Expand Up @@ -707,7 +707,7 @@ let snapshot_with_quiesce_timeout = ref 600.
let host_heartbeat_interval = ref 30.

(* If we haven't heard a heartbeat from a host for this interval then the host is assumed dead *)
let host_assumed_dead_interval = ref 600.0
let host_assumed_dead_interval = ref Mtime.Span.(10 * min)

(* If a session has a last_active older than this we delete it *)
let inactive_session_timeout = ref 86400. (* 24 hrs in seconds *)
Expand Down Expand Up @@ -1070,7 +1070,9 @@ let xapi_globs_spec =
; ("wait_memory_target_timeout", Float wait_memory_target_timeout)
; ("snapshot_with_quiesce_timeout", Float snapshot_with_quiesce_timeout)
; ("host_heartbeat_interval", Float host_heartbeat_interval)
; ("host_assumed_dead_interval", Float host_assumed_dead_interval)
; ( "host_assumed_dead_interval"
, LongDurationFromSeconds host_assumed_dead_interval
)
; ("fuse_time", Float Constants.fuse_time)
; ("db_restore_fuse_time", Float Constants.db_restore_fuse_time)
; ("inactive_session_timeout", Float inactive_session_timeout)
Expand Down
2 changes: 1 addition & 1 deletion ocaml/xapi/xapi_ha.ml
Original file line number Diff line number Diff line change
Expand Up @@ -837,7 +837,7 @@ module Monitor = struct
(ExnHelper.string_of_exn e) ;
Thread.delay !Xapi_globs.ha_monitor_interval
done ;
debug "Re-enabling old Host_metrics.live heartbeat" ;
debug "Re-enabling host heartbeat" ;
with_lock Db_gc.use_host_heartbeat_for_liveness_m (fun () ->
Db_gc.use_host_heartbeat_for_liveness := true
) ;
Expand Down

0 comments on commit 9291f21

Please sign in to comment.