Skip to content

Commit

Permalink
Merge pull request #5980 from Vincent-lau/private/shul2/sr-health-handle
Browse files Browse the repository at this point in the history
CP-49448: Add handling logic for SR health state
  • Loading branch information
Vincent-lau authored Sep 11, 2024
2 parents 1e924cb + 9b5d66f commit 0438aee
Show file tree
Hide file tree
Showing 7 changed files with 113 additions and 77 deletions.
9 changes: 7 additions & 2 deletions ocaml/idl/datamodel.ml
Original file line number Diff line number Diff line change
Expand Up @@ -2791,8 +2791,13 @@ module Sr_stat = struct
, [
("healthy", "Storage is fully available")
; ("recovering", "Storage is busy recovering, e.g. rebuilding mirrors.")
; ("unreachable", "Storage is unreachable")
; ("unavailable", "Storage is unavailable")
; ( "unreachable"
, "Storage is unreachable but may be recoverable with admin \
intervention"
)
; ( "unavailable"
, "Storage is unavailable, a host reboot will be required"
)
]
)

Expand Down
3 changes: 3 additions & 0 deletions ocaml/idl/datamodel_errors.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1256,6 +1256,9 @@ let _ =
() ;
error Api_errors.sr_is_cache_sr ["host"]
~doc:"The SR is currently being used as a local cache SR." () ;
error Api_errors.sr_unhealthy ["sr"; "health"; "fix"]
~doc:"The SR is currently unhealthy. See the suggestion on how to fix it."
() ;
error Api_errors.clustered_sr_degraded ["sr"]
~doc:
"An SR is using clustered local storage. It is not safe to reboot a host \
Expand Down
2 changes: 1 addition & 1 deletion ocaml/idl/schematest.ml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ let hash x = Digest.string x |> Digest.to_hex
(* BEWARE: if this changes, check that schema has been bumped accordingly in
ocaml/idl/datamodel_common.ml, usually schema_minor_vsn *)

let last_known_schema_hash = "ce370e3b85178acfbcfce4963c4f8534"
let last_known_schema_hash = "428caff23cdb969c59a9960beefd7bb6"

let current_schema_hash : string =
let open Datamodel_types in
Expand Down
2 changes: 2 additions & 0 deletions ocaml/xapi-consts/api_errors.ml
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,8 @@ let sr_requires_upgrade = add_error "SR_REQUIRES_UPGRADE"

let sr_is_cache_sr = add_error "SR_IS_CACHE_SR"

(* Error code used when an operation hits an SR that is not healthy. The
   datamodel declares it with three parameters: [sr], [health] and [fix]. *)
let sr_unhealthy = add_error "SR_UNHEALTHY"

let vdi_in_use = add_error "VDI_IN_USE"

let vdi_is_sharable = add_error "VDI_IS_SHARABLE"
Expand Down
2 changes: 1 addition & 1 deletion ocaml/xapi-idl/storage/storage_interface.ml
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ module Errors = struct
| Cancelled of string
| Redirect of string option
| Sr_attached of string
| Sr_unhealthy of sr_health
| Sr_unhealthy of string * sr_health
| Unimplemented of string
| Activated_on_another_host of uuid
| Duplicated_key of string
Expand Down
138 changes: 73 additions & 65 deletions ocaml/xapi-storage-script/main.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1208,71 +1208,79 @@ let bind ~volume_script_dir =
in
S.SR.scan sr_scan_impl ;
let sr_scan2_impl dbg sr =
Attached_SRs.find sr
>>>= (fun sr ->
return_volume_rpc (fun () -> Sr_client.stat (volume_rpc ~dbg) dbg sr)
>>>= fun response ->
Deferred.Result.return
{
Storage_interface.sr_uuid= response.Xapi_storage.Control.uuid
; name_label= response.Xapi_storage.Control.name
; name_description= response.Xapi_storage.Control.description
; total_space= response.Xapi_storage.Control.total_space
; free_space= response.Xapi_storage.Control.free_space
; clustered= response.Xapi_storage.Control.clustered
; health=
( match response.Xapi_storage.Control.health with
| Xapi_storage.Control.Healthy _ ->
Healthy
| Xapi_storage.Control.Recovering _ ->
Recovering
| Xapi_storage.Control.Unreachable _ ->
Unreachable
| Xapi_storage.Control.Unavailable _ ->
Unavailable
)
}
>>>= fun sr_info ->
match sr_info.health with
| Healthy ->
return_volume_rpc (fun () ->
Sr_client.ls
(volume_rpc ~dbg ~compat_out:Compat.compat_out_volumes)
dbg sr
)
>>>= fun response ->
let response = Array.to_list response in
(* Filter out volumes which are clone-on-boot transients *)
let transients =
List.fold
~f:(fun set x ->
match
List.Assoc.find x.Xapi_storage.Control.keys
_clone_on_boot_key ~equal:String.equal
with
| None ->
set
| Some transient ->
Set.add set transient
)
~init:Core.String.Set.empty response
in
let response =
List.filter
~f:(fun x ->
not (Set.mem transients x.Xapi_storage.Control.key)
)
response
in
Deferred.Result.return
(List.map ~f:vdi_of_volume response, sr_info)
| health ->
debug "%s: sr unhealthy %s" __FUNCTION__
(Storage_interface.show_sr_health health) ;
Deferred.Result.fail
Storage_interface.(Errors.Sr_unhealthy health)
)
|> wrap
let sr_uuid = Storage_interface.Sr.string_of sr in
let get_sr_info sr =
  (* Ask the volume plugin for the SR's current statistics and repackage the
     answer as a [Storage_interface] SR-info record. *)
  let module C = Xapi_storage.Control in
  (* Map the plugin's health report onto the storage interface's health type.
     Whatever payload the plugin constructor carries is discarded. *)
  let health_of_stat = function
    | C.Healthy _ ->
        Storage_interface.Healthy
    | C.Recovering _ ->
        Storage_interface.Recovering
    | C.Unreachable _ ->
        Storage_interface.Unreachable
    | C.Unavailable _ ->
        Storage_interface.Unavailable
  in
  return_volume_rpc (fun () -> Sr_client.stat (volume_rpc ~dbg) dbg sr)
  >>>= fun stat ->
  let sr_info =
    {
      Storage_interface.sr_uuid= stat.C.uuid
    ; name_label= stat.C.name
    ; name_description= stat.C.description
    ; total_space= stat.C.total_space
    ; free_space= stat.C.free_space
    ; clustered= stat.C.clustered
    ; health= health_of_stat stat.C.health
    }
  in
  Deferred.Result.return sr_info
in
let get_volume_info sr sr_info =
  (* List the SR's volumes through the volume plugin and return them as VDIs,
     paired with the [sr_info] that was passed in (returned unchanged). *)
  return_volume_rpc (fun () ->
      Sr_client.ls
        (volume_rpc ~dbg ~compat_out:Compat.compat_out_volumes)
        dbg sr
  )
  >>>= fun volumes ->
  let volumes = Array.to_list volumes in
  (* Filter out volumes which are clone-on-boot transients: gather every value
     stored under the clone-on-boot key, then drop any volume whose own key is
     one of those values. *)
  let transient_keys =
    volumes
    |> List.filter_map ~f:(fun volume ->
           List.Assoc.find volume.Xapi_storage.Control.keys _clone_on_boot_key
             ~equal:String.equal
       )
    |> Core.String.Set.of_list
  in
  let is_transient volume =
    Set.mem transient_keys volume.Xapi_storage.Control.key
  in
  let vdis =
    volumes
    |> List.filter ~f:(fun volume -> not (is_transient volume))
    |> List.map ~f:vdi_of_volume
  in
  Deferred.Result.return (vdis, sr_info)
in
let rec stat_with_retry ?(times = 3) sr =
  (* Stat the SR and, if it is healthy, list its volumes.

     [times] is the number of retries still allowed (default 3). While the SR
     reports [Unreachable] and retries remain, wait for [Time.Span.second]
     and stat again with one fewer retry. Any other non-healthy state, or
     [Unreachable] once the retries are used up, fails with
     [Errors.Sr_unhealthy] carrying the SR uuid and the last health seen.

     Every log line uses the same "<function>: sr <uuid> ..." shape so that
     entries from different SRs can be told apart. *)
  get_sr_info sr >>>= fun sr_info ->
  match sr_info.health with
  | Healthy ->
      debug "%s: sr %s is healthy" __FUNCTION__ sr_uuid ;
      get_volume_info sr sr_info
  | Unreachable when times > 0 ->
      (* [times] is logged before it is decremented, so it counts the retry
         that is about to happen. *)
      debug "%s: sr %s is unreachable, remaining %d retries" __FUNCTION__
        sr_uuid times ;
      Clock.after Time.Span.second >>= fun () ->
      stat_with_retry ~times:(times - 1) sr
  | health ->
      debug "%s: sr %s unhealthy because it is %s" __FUNCTION__ sr_uuid
        (Storage_interface.show_sr_health health) ;
      Deferred.Result.fail
        Storage_interface.(Errors.Sr_unhealthy (sr_uuid, health))
in
Attached_SRs.find sr >>>= stat_with_retry |> wrap
in
S.SR.scan2 sr_scan2_impl ;
let vdi_create_impl dbg sr (vdi_info : Storage_interface.vdi_info) =
Expand Down
34 changes: 26 additions & 8 deletions ocaml/xapi/storage_access.ml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ let s_of_vdi = Vdi.string_of
let s_of_sr = Sr.string_of

let transform_storage_exn f =
let get_sr_ref sr_uuid =
  (* Look up the database reference of the SR with uuid [sr_uuid]. The lookup
     is handed to [Server_helpers.exec_with_new_task] under the task name
     "transform_storage_exn", which supplies the [__context] it needs. *)
  let lookup __context = Db.SR.get_by_uuid ~__context ~uuid:sr_uuid in
  Server_helpers.exec_with_new_task "transform_storage_exn" lookup
in
try f () with
| Storage_error (Backend_error (code, params)) as e ->
Backtrace.reraise e (Api_errors.Server_error (code, params))
Expand All @@ -39,17 +44,30 @@ let transform_storage_exn f =
let backtrace = Backtrace.Interop.of_json "SM" backtrace in
Backtrace.add e backtrace ;
Backtrace.reraise e (Api_errors.Server_error (code, params))
| Storage_error (Sr_unhealthy (sr, health)) as e ->
let advice =
match health with
| Unavailable ->
"try reboot"
| Unreachable ->
"try again later"
| _health ->
""
in
let sr = get_sr_ref sr in
Backtrace.reraise e
(Api_errors.Server_error
( Api_errors.sr_unhealthy
, [Ref.string_of sr; Storage_interface.show_sr_health health; advice]
)
)
| Api_errors.Server_error _ as e ->
raise e
| Storage_error (No_storage_plugin_for_sr sr) as e ->
Server_helpers.exec_with_new_task "transform_storage_exn"
(fun __context ->
let sr = Db.SR.get_by_uuid ~__context ~uuid:sr in
Backtrace.reraise e
(Api_errors.Server_error
(Api_errors.sr_not_attached, [Ref.string_of sr])
)
)
let sr = get_sr_ref sr in
Backtrace.reraise e
(Api_errors.Server_error (Api_errors.sr_not_attached, [Ref.string_of sr])
)
| e ->
Backtrace.reraise e
(Api_errors.Server_error
Expand Down

0 comments on commit 0438aee

Please sign in to comment.