From e84d92d25dcea22fda5963b7f86b9723cb70ed20 Mon Sep 17 00:00:00 2001 From: Vincent Liu Date: Mon, 9 Sep 2024 17:00:58 +0100 Subject: [PATCH 1/4] Add sr to the Sr_unhealthy error constructor Where the first parameter represents the uuid of the SR, later used in generating the API error. Signed-off-by: Vincent Liu --- ocaml/xapi-idl/storage/storage_interface.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/xapi-idl/storage/storage_interface.ml b/ocaml/xapi-idl/storage/storage_interface.ml index 02513ae4936..4b3e03e48e9 100644 --- a/ocaml/xapi-idl/storage/storage_interface.ml +++ b/ocaml/xapi-idl/storage/storage_interface.ml @@ -354,7 +354,7 @@ module Errors = struct | Cancelled of string | Redirect of string option | Sr_attached of string - | Sr_unhealthy of sr_health + | Sr_unhealthy of string * sr_health | Unimplemented of string | Activated_on_another_host of uuid | Duplicated_key of string From b9e93695e4e39852fc5b9f1e1e734073b0b7cc04 Mon Sep 17 00:00:00 2001 From: Vincent Liu Date: Wed, 21 Aug 2024 11:21:30 +0100 Subject: [PATCH 2/4] Add more description on sr health Aligned with https://xapi-project.github.io/xapi-storage/#volume-type-definitions Signed-off-by: Vincent Liu --- ocaml/idl/datamodel.ml | 9 +++++++-- ocaml/idl/schematest.ml | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ocaml/idl/datamodel.ml b/ocaml/idl/datamodel.ml index 4c014ca939c..2efca2c2ce8 100644 --- a/ocaml/idl/datamodel.ml +++ b/ocaml/idl/datamodel.ml @@ -2791,8 +2791,13 @@ module Sr_stat = struct , [ ("healthy", "Storage is fully available") ; ("recovering", "Storage is busy recovering, e.g. rebuilding mirrors.") - ; ("unreachable", "Storage is unreachable") - ; ("unavailable", "Storage is unavailable") + ; ( "unreachable" + , "Storage is unreachable but may be recoverable with admin \ + intervention" + ) + ; ( "unavailable" + , "Storage is unavailable, a host reboot will be required" + ) ] ) diff --git a/ocaml/idl/schematest.ml b/ocaml/idl/schematest.ml index 9b25ca48aee..c375a909149 100644 --- a/ocaml/idl/schematest.ml +++ b/ocaml/idl/schematest.ml @@ -3,7 +3,7 @@ let hash x = Digest.string x |> Digest.to_hex (* BEWARE: if this changes, check that schema has been bumped accordingly in ocaml/idl/datamodel_common.ml, usually schema_minor_vsn *) -let last_known_schema_hash = "ce370e3b85178acfbcfce4963c4f8534" +let last_known_schema_hash = "428caff23cdb969c59a9960beefd7bb6" let current_schema_hash : string = let open Datamodel_types in From 0b22e4f9b235bb697a7c6c68f6bbec0117f96910 Mon Sep 17 00:00:00 2001 From: Vincent Liu Date: Mon, 9 Sep 2024 14:54:35 +0100 Subject: [PATCH 3/4] Refactor sr_scan2_impl This is to help handling of retry logic in the next commit. It also de-nest the code (a bit). Signed-off-by: Vincent Liu --- ocaml/xapi-storage-script/main.ml | 116 ++++++++++++++---------------- 1 file changed, 55 insertions(+), 61 deletions(-) diff --git a/ocaml/xapi-storage-script/main.ml b/ocaml/xapi-storage-script/main.ml index 196e3edfc07..b7bf12d13cc 100644 --- a/ocaml/xapi-storage-script/main.ml +++ b/ocaml/xapi-storage-script/main.ml @@ -1208,69 +1208,63 @@ let bind ~volume_script_dir = in S.SR.scan sr_scan_impl ; let sr_scan2_impl dbg sr = + let get_sr_info sr = + return_volume_rpc (fun () -> Sr_client.stat (volume_rpc ~dbg) dbg sr) + >>>= fun response -> + Deferred.Result.return + { + Storage_interface.sr_uuid= response.Xapi_storage.Control.uuid + ; name_label= response.Xapi_storage.Control.name + ; name_description= response.Xapi_storage.Control.description + ; total_space= response.Xapi_storage.Control.total_space + ; free_space= response.Xapi_storage.Control.free_space + ; clustered= response.Xapi_storage.Control.clustered + ; health= + ( match response.Xapi_storage.Control.health with + | Xapi_storage.Control.Healthy _ -> + Healthy + | Xapi_storage.Control.Recovering _ -> + Recovering + | Xapi_storage.Control.Unreachable _ -> + Unreachable + | Xapi_storage.Control.Unavailable _ -> + Unavailable + ) + } + in + let get_volume_info sr sr_info = + return_volume_rpc (fun () -> + Sr_client.ls + (volume_rpc ~dbg ~compat_out:Compat.compat_out_volumes) + dbg sr + ) + >>>= fun response -> + let response = Array.to_list response in + (* Filter out volumes which are clone-on-boot transients *) + let transients = + List.fold + ~f:(fun set x -> + match + List.Assoc.find x.Xapi_storage.Control.keys _clone_on_boot_key + ~equal:String.equal + with + | None -> + set + | Some transient -> + Set.add set transient + ) + ~init:Core.String.Set.empty response + in + let response = + List.filter + ~f:(fun x -> not (Set.mem transients x.Xapi_storage.Control.key)) + response + in + Deferred.Result.return (List.map ~f:vdi_of_volume response, sr_info) + in Attached_SRs.find sr >>>= (fun sr -> - return_volume_rpc (fun () -> Sr_client.stat (volume_rpc ~dbg) dbg sr) - >>>= fun response -> - Deferred.Result.return - { - Storage_interface.sr_uuid= response.Xapi_storage.Control.uuid - ; name_label= response.Xapi_storage.Control.name - ; name_description= response.Xapi_storage.Control.description - ; total_space= response.Xapi_storage.Control.total_space - ; free_space= response.Xapi_storage.Control.free_space - ; clustered= response.Xapi_storage.Control.clustered - ; health= - ( match response.Xapi_storage.Control.health with - | Xapi_storage.Control.Healthy _ -> - Healthy - | Xapi_storage.Control.Recovering _ -> - Recovering - | Xapi_storage.Control.Unreachable _ -> - Unreachable - | Xapi_storage.Control.Unavailable _ -> - Unavailable - ) - } - >>>= fun sr_info -> - match sr_info.health with - | Healthy -> - return_volume_rpc (fun () -> - Sr_client.ls - (volume_rpc ~dbg ~compat_out:Compat.compat_out_volumes) - dbg sr - ) - >>>= fun response -> - let response = Array.to_list response in - (* Filter out volumes which are clone-on-boot transients *) - let transients = - List.fold - ~f:(fun set x -> - match - List.Assoc.find x.Xapi_storage.Control.keys - _clone_on_boot_key ~equal:String.equal - with - | None -> - set - | Some transient -> - Set.add set transient - ) - ~init:Core.String.Set.empty response - in - let response = - List.filter - ~f:(fun x -> - not (Set.mem transients x.Xapi_storage.Control.key) - ) - response - in - Deferred.Result.return - (List.map ~f:vdi_of_volume response, sr_info) - | health -> - debug "%s: sr unhealthy %s" __FUNCTION__ - (Storage_interface.show_sr_health health) ; - Deferred.Result.fail - Storage_interface.(Errors.Sr_unhealthy health) + get_sr_info sr >>>= fun sr_info -> get_volume_info sr sr_info ) |> wrap in From 9b5d66f9094d6e7f7e8569bf16c77962d4e01222 Mon Sep 17 00:00:00 2001 From: Vincent Liu Date: Mon, 9 Sep 2024 14:57:07 +0100 Subject: [PATCH 4/4] CP-49448: Add handling logic for SR health state For SMAPIv3 SRs, we add additional logic to handle the health state returned by SR.stat in sr_scan2_impl. This takes advantage of the additional information returned by the storage layer. Specifically, two new health states are handled as follows: - Unreachable: retry a few times before sending an error message to the user, suggesting retry later. - Unavailable: return an error message immediately and tell the user to reboot in order to recover from this error. Signed-off-by: Vincent Liu --- ocaml/idl/datamodel_errors.ml | 3 +++ ocaml/xapi-consts/api_errors.ml | 2 ++ ocaml/xapi-storage-script/main.ml | 24 +++++++++++++++++----- ocaml/xapi/storage_access.ml | 34 +++++++++++++++++++++++-------- 4 files changed, 50 insertions(+), 13 deletions(-) diff --git a/ocaml/idl/datamodel_errors.ml b/ocaml/idl/datamodel_errors.ml index 3071a4add47..aead3e0abc4 100644 --- a/ocaml/idl/datamodel_errors.ml +++ b/ocaml/idl/datamodel_errors.ml @@ -1256,6 +1256,9 @@ let _ = () ; error Api_errors.sr_is_cache_sr ["host"] ~doc:"The SR is currently being used as a local cache SR." () ; + error Api_errors.sr_unhealthy ["sr"; "health"; "fix"] + ~doc:"The SR is currently unhealthy. See the suggestion on how to fix it." + () ; error Api_errors.clustered_sr_degraded ["sr"] ~doc: "An SR is using clustered local storage. It is not safe to reboot a host \ diff --git a/ocaml/xapi-consts/api_errors.ml b/ocaml/xapi-consts/api_errors.ml index 53d9684561f..97880cde57a 100644 --- a/ocaml/xapi-consts/api_errors.ml +++ b/ocaml/xapi-consts/api_errors.ml @@ -512,6 +512,8 @@ let sr_requires_upgrade = add_error "SR_REQUIRES_UPGRADE" let sr_is_cache_sr = add_error "SR_IS_CACHE_SR" +let sr_unhealthy = add_error "SR_UNHEALTHY" + let vdi_in_use = add_error "VDI_IN_USE" let vdi_is_sharable = add_error "VDI_IS_SHARABLE" diff --git a/ocaml/xapi-storage-script/main.ml b/ocaml/xapi-storage-script/main.ml index b7bf12d13cc..74ea3bb8d9f 100644 --- a/ocaml/xapi-storage-script/main.ml +++ b/ocaml/xapi-storage-script/main.ml @@ -1208,6 +1208,7 @@ let bind ~volume_script_dir = in S.SR.scan sr_scan_impl ; let sr_scan2_impl dbg sr = + let sr_uuid = Storage_interface.Sr.string_of sr in let get_sr_info sr = return_volume_rpc (fun () -> Sr_client.stat (volume_rpc ~dbg) dbg sr) >>>= fun response -> @@ -1262,11 +1263,24 @@ let bind ~volume_script_dir = in Deferred.Result.return (List.map ~f:vdi_of_volume response, sr_info) in - Attached_SRs.find sr - >>>= (fun sr -> - get_sr_info sr >>>= fun sr_info -> get_volume_info sr sr_info - ) - |> wrap + let rec stat_with_retry ?(times = 3) sr = + get_sr_info sr >>>= fun sr_info -> + match sr_info.health with + | Healthy -> + debug "%s sr %s is healthy" __FUNCTION__ sr_uuid ; + get_volume_info sr sr_info + | Unreachable when times > 0 -> + debug "%s: sr %s is unreachable, remaining %d retries" __FUNCTION__ + sr_uuid times ; + Clock.after Time.Span.second >>= fun () -> + stat_with_retry ~times:(times - 1) sr + | health -> + debug "%s: sr unhealthy because it is %s" __FUNCTION__ + (Storage_interface.show_sr_health health) ; + Deferred.Result.fail + Storage_interface.(Errors.Sr_unhealthy (sr_uuid, health)) + in + Attached_SRs.find sr >>>= stat_with_retry |> wrap in S.SR.scan2 sr_scan2_impl ; let vdi_create_impl dbg sr (vdi_info : Storage_interface.vdi_info) = diff --git a/ocaml/xapi/storage_access.ml b/ocaml/xapi/storage_access.ml index a307eb48bdd..c92651bc576 100644 --- a/ocaml/xapi/storage_access.ml +++ b/ocaml/xapi/storage_access.ml @@ -31,6 +31,11 @@ let s_of_vdi = Vdi.string_of let s_of_sr = Sr.string_of let transform_storage_exn f = + let get_sr_ref sr_uuid = + Server_helpers.exec_with_new_task "transform_storage_exn" (fun __context -> + Db.SR.get_by_uuid ~__context ~uuid:sr_uuid + ) + in try f () with | Storage_error (Backend_error (code, params)) as e -> Backtrace.reraise e (Api_errors.Server_error (code, params)) @@ -39,17 +44,30 @@ let transform_storage_exn f = let backtrace = Backtrace.Interop.of_json "SM" backtrace in Backtrace.add e backtrace ; Backtrace.reraise e (Api_errors.Server_error (code, params)) + | Storage_error (Sr_unhealthy (sr, health)) as e -> + let advice = + match health with + | Unavailable -> + "try reboot" + | Unreachable -> + "try again later" + | _health -> + "" + in + let sr = get_sr_ref sr in + Backtrace.reraise e + (Api_errors.Server_error + ( Api_errors.sr_unhealthy + , [Ref.string_of sr; Storage_interface.show_sr_health health; advice] + ) + ) | Api_errors.Server_error _ as e -> raise e | Storage_error (No_storage_plugin_for_sr sr) as e -> - Server_helpers.exec_with_new_task "transform_storage_exn" - (fun __context -> - let sr = Db.SR.get_by_uuid ~__context ~uuid:sr in - Backtrace.reraise e - (Api_errors.Server_error - (Api_errors.sr_not_attached, [Ref.string_of sr]) - ) - ) + let sr = get_sr_ref sr in + Backtrace.reraise e + (Api_errors.Server_error (Api_errors.sr_not_attached, [Ref.string_of sr]) + ) | e -> Backtrace.reraise e (Api_errors.Server_error