Skip to content

Mux mirror failure check for SXM #6439

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions ocaml/xapi-idl/storage/storage_interface.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1179,6 +1179,27 @@ module StorageAPI (R : RPC) = struct
(** RPC declaration for ["DATA.MIRROR.receive_cancel2"]: parameters [dbg_p],
    [id_p], [url_p] and [verify_dest_p]; result [unit_p] or error [err]. *)
let receive_cancel2 =
  let signature =
    dbg_p @-> id_p @-> url_p @-> verify_dest_p @-> returning unit_p err
  in
  declare "DATA.MIRROR.receive_cancel2" [] signature

(** RPC declaration for ["DATA.MIRROR.pre_deactivate_hook"]: parameters
    [dbg_p], [dp_p], [sr_p] and [vdi_p]; result [unit_p] or error [err]. *)
let pre_deactivate_hook =
  let signature = dbg_p @-> dp_p @-> sr_p @-> vdi_p @-> returning unit_p err in
  declare "DATA.MIRROR.pre_deactivate_hook" [] signature

(** RPC declaration for ["DATA.MIRROR.is_mirror_failed"]: parameters [dbg_p],
    [id_p] and [sr_p]; the result is a boolean parameter named
    ["mirror_failed_p"], or error [err]. *)
let is_mirror_failed =
  let result_p =
    Param.mk ~name:"mirror_failed_p" ~description:[] Types.bool
  in
  let signature = dbg_p @-> id_p @-> sr_p @-> returning result_p err in
  declare "DATA.MIRROR.is_mirror_failed" [] signature

(** RPC declaration for ["DATA.MIRROR.list"]: the only parameter is [dbg_p];
    the result, named ["mirrors"], is a list of [(Mirror.id, Mirror.t)] pairs,
    or error [err]. *)
let list =
  let mirrors_p =
    Param.mk ~name:"mirrors"
      (TypeCombinators.list (TypeCombinators.pair (Mirror.id, Mirror.t)))
  in
  declare "DATA.MIRROR.list" [] (dbg_p @-> returning mirrors_p err)

(** RPC declaration for ["DATA.MIRROR.stat"]: parameters [dbg_p] and [id_p];
    the result, named ["result"], is a [Mirror.t], or error [err]. *)
let stat =
  let mirror_p = Param.mk ~name:"result" Mirror.t in
  let signature = dbg_p @-> id_p @-> returning mirror_p err in
  declare "DATA.MIRROR.stat" [] signature
end
end

Expand Down Expand Up @@ -1285,6 +1306,16 @@ module type MIRROR = sig
-> url:string
-> verify_dest:bool
-> unit

(** [pre_deactivate_hook ctx ~dbg ~dp ~sr ~vdi] takes a datapath, an SR and a
    VDI and returns unit.
    NOTE(review): judging by the name this is meant to run before the VDI is
    deactivated - confirm against the callers. *)
val pre_deactivate_hook :
context -> dbg:debug_info -> dp:dp -> sr:sr -> vdi:vdi -> unit

(** [is_mirror_failed ctx ~dbg ~mirror_id ~sr] returns a boolean for the
    mirror [mirror_id] on [sr].
    NOTE(review): presumably [true] means the mirror has failed - confirm
    against the implementations. *)
val is_mirror_failed :
context -> dbg:debug_info -> mirror_id:Mirror.id -> sr:Sr.t -> bool

(** [list ctx ~dbg] returns a list of [(Mirror.id, Mirror.t)] pairs. *)
val list : context -> dbg:debug_info -> (Mirror.id * Mirror.t) list

(** [stat ctx ~dbg ~id] returns the [Mirror.t] for the mirror [id]. *)
val stat : context -> dbg:debug_info -> id:Mirror.id -> Mirror.t
end

module type Server_impl = sig
Expand Down Expand Up @@ -1759,6 +1790,14 @@ module Server (Impl : Server_impl) () = struct
Impl.DATA.MIRROR.receive_finalize2 () ~dbg ~mirror_id ~sr ~url
~verify_dest
) ;
(* Register the DATA.MIRROR handlers. Each callback receives the RPC
   arguments positionally and forwards them as labelled arguments to the
   matching [Impl.DATA.MIRROR] function, with [()] as the context. *)
S.DATA.MIRROR.pre_deactivate_hook (fun dbg dp sr vdi ->
Impl.DATA.MIRROR.pre_deactivate_hook () ~dbg ~dp ~sr ~vdi
) ;
S.DATA.MIRROR.is_mirror_failed (fun dbg mirror_id sr ->
Impl.DATA.MIRROR.is_mirror_failed () ~dbg ~mirror_id ~sr
) ;
S.DATA.MIRROR.list (fun dbg -> Impl.DATA.MIRROR.list () ~dbg) ;
S.DATA.MIRROR.stat (fun dbg id -> Impl.DATA.MIRROR.stat () ~dbg ~id) ;
(* Same forwarding pattern for DATA.import_activate *)
S.DATA.import_activate (fun dbg dp sr vdi vm ->
Impl.DATA.import_activate () ~dbg ~dp ~sr ~vdi ~vm
) ;
Expand Down
10 changes: 10 additions & 0 deletions ocaml/xapi-idl/storage/storage_skeleton.ml
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,16 @@ module DATA = struct

(* Skeleton stub: ignores its arguments and calls [u] with the RPC name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let receive_cancel2 ctx ~dbg ~mirror_id ~url ~verify_dest =
u "DATA.MIRROR.receive_cancel2"

(* Skeleton stub: ignores its arguments and calls [u] with the RPC name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let pre_deactivate_hook ctx ~dbg ~dp ~sr ~vdi =
u "DATA.MIRROR.pre_deactivate_hook"

(* Skeleton stub: ignores its arguments and calls [u] with the RPC name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let is_mirror_failed ctx ~dbg ~mirror_id ~sr =
u "DATA.MIRROR.is_mirror_failed"

(* Skeleton stub: ignores its arguments and calls [u] with the RPC name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let list ctx ~dbg = u "DATA.MIRROR.list"

(* Skeleton stub: ignores its arguments and calls [u] with the RPC name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let stat ctx ~dbg ~id = u "DATA.MIRROR.stat"
end
end

Expand Down
2 changes: 1 addition & 1 deletion ocaml/xapi-storage-cli/main.ml
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ let string_of_file filename =

let mirror_list common_opts =
wrap common_opts (fun () ->
let list = Storage_migrate.list ~dbg in
let list = Client.DATA.MIRROR.list dbg in
List.iter
(fun (id, status) -> Printf.printf "%s" (string_of_mirror id status))
list
Expand Down
4 changes: 4 additions & 0 deletions ocaml/xapi-storage-script/main.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1927,6 +1927,10 @@ let bind ~volume_script_dir =
(* Each of these entry points is bound to [u <rpc name>].
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
S.DATA.MIRROR.receive_finalize2 (u "DATA.MIRROR.receive_finalize2") ;
S.DATA.MIRROR.receive_cancel (u "DATA.MIRROR.receive_cancel") ;
S.DATA.MIRROR.receive_cancel2 (u "DATA.MIRROR.receive_cancel2") ;
S.DATA.MIRROR.pre_deactivate_hook (u "DATA.MIRROR.pre_deactivate_hook") ;
S.DATA.MIRROR.is_mirror_failed (u "DATA.MIRROR.is_mirror_failed") ;
S.DATA.MIRROR.list (u "DATA.MIRROR.list") ;
S.DATA.MIRROR.stat (u "DATA.MIRROR.stat") ;
S.DP.create (u "DP.create") ;
S.TASK.cancel (u "TASK.cancel") ;
S.TASK.list (u "TASK.list") ;
Expand Down
2 changes: 1 addition & 1 deletion ocaml/xapi/storage_access.ml
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,7 @@ let update_task ~__context id =
let update_mirror ~__context id =
try
let dbg = Context.string_of_task __context in
let m = Storage_migrate.stat ~dbg ~id in
let m = Client.DATA.MIRROR.stat dbg id in
if m.Mirror.failed then
debug "Mirror %s has failed" id ;
let task = get_mirror_task id in
Expand Down
73 changes: 8 additions & 65 deletions ocaml/xapi/storage_migrate.ml
Original file line number Diff line number Diff line change
Expand Up @@ -209,28 +209,20 @@ module MigrateLocal = struct
stop ~dbg ~id:mirror_id ;
raise e

let stat ~dbg:_ ~id =
let stat ~dbg ~id =
let recv_opt = State.find_active_receive_mirror id in
let send_opt = State.find_active_local_mirror id in
let copy_opt = State.find_active_copy id in
let sr, _vdi = State.of_mirror_id id in
let open State in
let failed =
match send_opt with
| Some send_state ->
let (module Migrate_Backend) = choose_backend dbg sr in
let failed =
match send_state.Send_state.tapdev with
| Some tapdev -> (
try
let stats = Tapctl.stats (Tapctl.create ()) tapdev in
stats.Tapctl.Stats.nbd_mirror_failed = 1
with _ ->
debug "Using cached copy of failure status" ;
send_state.Send_state.failed
)
| None ->
false
Migrate_Backend.is_mirror_failed () ~dbg ~mirror_id:id ~sr
in
send_state.Send_state.failed <- failed ;
send_state.failed <- failed ;
failed
| None ->
false
Expand Down Expand Up @@ -325,58 +317,9 @@ module MigrateLocal = struct
State.clear ()
end

(* Raised when waiting for tapdisk's outstanding requests takes longer than
   [reqs_outstanding_timeout]; carries the elapsed time. *)
exception Timeout of Mtime.Span.t

(* Tapdisk should time out after 2 mins. We can wait a little longer *)
let reqs_outstanding_timeout = Mtime.Span.(150 * s)

(* Formats a time span as a string; shaped so it can be passed to [%a] in
   the logging calls of [pre_deactivate_hook]. *)
let pp_time () = Fmt.str "%a" Mtime.Span.pp


(** [pre_deactivate_hook ~dbg ~dp ~sr ~vdi] performs a final check of the
    active local mirror identified by [(sr, vdi)], if one exists.

    It polls the tapdisk statistics, sleeping one second between polls, until
    no requests are outstanding, then marks the mirror's send state as failed
    if tapdisk reports [nbd_mirror_failed = 1]. The send state is also marked
    as failed when the wait exceeds [reqs_outstanding_timeout] or when any
    other exception is raised during the check; such failures are logged and
    recorded rather than propagated. Nothing is done when there is no active
    local mirror or when it has no tapdev. [dbg] and [dp] are ignored. *)
let pre_deactivate_hook ~dbg:_ ~dp:_ ~sr ~vdi =
  let open State.Send_state in
  let id = State.mirror_id_of (sr, vdi) in
  let start = Mtime_clock.counter () in
  State.find_active_local_mirror id
  |> Option.iter (fun s ->
         (* We used to pause here and then check the nbd_mirror_failed key.
            Now, we poll until the number of outstanding requests has gone to
            zero, then check the status. This avoids confusing the backend
            (CA-128460) *)
         try
           match s.tapdev with
           | None ->
               ()
           | Some tapdev ->
               let open Tapctl in
               let ctx = create () in
               (* Poll until idle; the elapsed time is measured from [start],
                  i.e. from the moment the hook was entered *)
               let rec wait () =
                 let elapsed = Mtime_clock.count start in
                 if Mtime.Span.compare elapsed reqs_outstanding_timeout > 0 then
                   raise (Timeout elapsed) ;
                 let st = stats ctx tapdev in
                 if st.Stats.reqs_outstanding > 0 then (
                   Thread.delay 1.0 ; wait ()
                 ) else
                   (st, elapsed)
               in
               let st, elapsed = wait () in
               debug "Got final stats after waiting %a" pp_time elapsed ;
               if st.Stats.nbd_mirror_failed = 1 then (
                 error "tapdisk reports mirroring failed" ;
                 s.failed <- true
               )
         with
         | Timeout elapsed ->
             error
               "Timed out after %a waiting for tapdisk to complete all \
                outstanding requests"
               pp_time elapsed ;
             s.failed <- true
         | e ->
             error "Caught exception while finally checking mirror state: %s"
               (Printexc.to_string e) ;
             s.failed <- true
     )
(** [pre_deactivate_hook ~dbg ~dp ~sr ~vdi] picks a backend module with
    [choose_backend dbg sr] and forwards the call to that backend's
    [pre_deactivate_hook], passing [()] as the context. *)
let pre_deactivate_hook ~dbg ~dp ~sr ~vdi =
  let (module Backend) = choose_backend dbg sr in
  Backend.pre_deactivate_hook () ~dbg ~dp ~sr ~vdi

let post_deactivate_hook ~sr ~vdi ~dp:_ =
let open State.Send_state in
Expand Down
16 changes: 16 additions & 0 deletions ocaml/xapi/storage_mux.ml
Original file line number Diff line number Diff line change
Expand Up @@ -857,6 +857,22 @@ module Mux = struct

(* Stub: ignores every argument and calls [u] with this function's name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let receive_cancel2 () ~dbg:_ ~mirror_id:_ ~url:_ ~verify_dest:_ =
u __FUNCTION__

(* Stub: ignores every argument and calls [u] with the RPC name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let pre_deactivate_hook _ctx ~dbg:_ ~dp:_ ~sr:_ ~vdi:_ =
u "DATA.MIRROR.pre_deactivate_hook"

(* Stub: ignores every argument and calls [u] with the RPC name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let is_mirror_failed _ctx ~dbg:_ ~mirror_id:_ ~sr:_ =
u "DATA.MIRROR.is_mirror_failed"

(** [list () ~dbg] logs the call and returns the result of
    [Storage_migrate.list], all run inside [with_dbg] under the name
    ["DATA.MIRROR.list"]. *)
let list () ~dbg =
  with_dbg ~name:"DATA.MIRROR.list" ~dbg @@ fun di ->
  (* Log [di.log], as the sibling [stat] does, rather than the raw [dbg]
     argument, so both entry points print the same form of the debug info *)
  info "%s dbg: %s" __FUNCTION__ di.log ;
  Storage_migrate.list ~dbg:di.log

(** [stat () ~dbg ~id] logs the call together with the mirror id and returns
    the result of [Storage_migrate.stat] for [id], all run inside [with_dbg]
    under the name ["DATA.MIRROR.stat"]. *)
let stat () ~dbg ~id =
  with_dbg ~name:"DATA.MIRROR.stat" ~dbg (fun di ->
      let log = di.log in
      info "%s dbg: %s mirror_id: %s" __FUNCTION__ log id ;
      Storage_migrate.stat ~dbg:log ~id
  )
end
end

Expand Down
8 changes: 8 additions & 0 deletions ocaml/xapi/storage_smapiv1.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1157,6 +1157,14 @@ module SMAPIv1 : Server_impl = struct

(* Placeholder: ignores every argument; calling it raises [Assert_failure]
   because the body is [assert false]. *)
let receive_cancel2 _context ~dbg:_ ~mirror_id:_ ~url:_ ~verify_dest:_ =
assert false

(* Placeholder: ignores every argument; calling it raises [Assert_failure]
   because the body is [assert false]. *)
let pre_deactivate_hook _context ~dbg:_ ~dp:_ ~sr:_ ~vdi:_ = assert false

(* Placeholder: ignores every argument; calling it raises [Assert_failure]
   because the body is [assert false]. *)
let is_mirror_failed _context ~dbg:_ ~mirror_id:_ ~sr:_ = assert false

(* Placeholder: ignores every argument; calling it raises [Assert_failure]
   because the body is [assert false]. *)
let list _context ~dbg:_ = assert false

(* Placeholder: ignores every argument; calling it raises [Assert_failure]
   because the body is [assert false]. *)
let stat _context ~dbg:_ ~id:_ = assert false
end
end

Expand Down
72 changes: 72 additions & 0 deletions ocaml/xapi/storage_smapiv1_migrate.ml
Original file line number Diff line number Diff line change
Expand Up @@ -757,4 +757,76 @@ module MIRROR : SMAPIv2_MIRROR = struct
(* Stub: ignores every argument and calls [u] with this function's name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let receive_cancel2 _ctx ~dbg:_ ~mirror_id:_ ~url:_ ~verify_dest:_ =
(* see Storage_migrate.receive_cancel2 *)
u __FUNCTION__

(* Raised when waiting for tapdisk's outstanding requests takes longer than
   [reqs_outstanding_timeout]; carries the elapsed time. *)
exception Timeout of Mtime.Span.t

(* Tapdisk should time out after 2 mins. We can wait a little longer *)
let reqs_outstanding_timeout = Mtime.Span.(150 * s)

(* Formats a time span as a string; shaped so it can be passed to [%a] in
   the logging calls of [pre_deactivate_hook]. *)
let pp_time () = Fmt.str "%a" Mtime.Span.pp


(** [pre_deactivate_hook ctx ~dbg ~dp ~sr ~vdi] performs a final check of the
    active local mirror identified by [(sr, vdi)], if one exists.

    It polls the tapdisk statistics, sleeping one second between polls, until
    no requests are outstanding, then marks the mirror's send state as failed
    if tapdisk reports [nbd_mirror_failed = 1]. The send state is also marked
    as failed when the wait exceeds [reqs_outstanding_timeout] or when any
    other exception is raised during the check; such failures are logged and
    recorded rather than propagated. Nothing is done when there is no active
    local mirror or when it has no tapdev. [ctx], [dbg] and [dp] are
    ignored. *)
let pre_deactivate_hook _ctx ~dbg:_ ~dp:_ ~sr ~vdi =
  let open State.Send_state in
  let id = State.mirror_id_of (sr, vdi) in
  let start = Mtime_clock.counter () in
  State.find_active_local_mirror id
  |> Option.iter (fun s ->
         (* We used to pause here and then check the nbd_mirror_failed key.
            Now, we poll until the number of outstanding requests has gone to
            zero, then check the status. This avoids confusing the backend
            (CA-128460) *)
         try
           match s.tapdev with
           | None ->
               ()
           | Some tapdev ->
               let open Tapctl in
               let ctx = create () in
               (* Poll until idle; the elapsed time is measured from [start],
                  i.e. from the moment the hook was entered *)
               let rec wait () =
                 let elapsed = Mtime_clock.count start in
                 if Mtime.Span.compare elapsed reqs_outstanding_timeout > 0
                 then
                   raise (Timeout elapsed) ;
                 let st = stats ctx tapdev in
                 if st.Stats.reqs_outstanding > 0 then (
                   Thread.delay 1.0 ; wait ()
                 ) else
                   (st, elapsed)
               in
               let st, elapsed = wait () in
               D.debug "Got final stats after waiting %a" pp_time elapsed ;
               if st.Stats.nbd_mirror_failed = 1 then (
                 D.error "tapdisk reports mirroring failed" ;
                 s.failed <- true
               )
         with
         | Timeout elapsed ->
             D.error
               "Timed out after %a waiting for tapdisk to complete all \
                outstanding requests"
               pp_time elapsed ;
             s.failed <- true
         | e ->
             D.error
               "Caught exception while finally checking mirror state: %s"
               (Printexc.to_string e) ;
             s.failed <- true
     )

(** [is_mirror_failed ctx ~dbg ~mirror_id ~sr] reports whether the active
    local mirror [mirror_id] has failed.

    When the mirror has a tapdev, tapdisk is queried and the result is
    [nbd_mirror_failed = 1]; if that query raises, the exception is logged
    and the [failed] flag cached in the send state is returned instead.
    Returns [false] when there is no active local mirror with that id or
    when it has no tapdev. [ctx], [dbg] and [sr] are ignored. *)
let is_mirror_failed _ctx ~dbg:_ ~mirror_id ~sr:_ =
  match State.find_active_local_mirror mirror_id with
  | Some {tapdev= Some tapdev; failed; _} -> (
    try
      let stats = Tapctl.stats (Tapctl.create ()) tapdev in
      stats.Tapctl.Stats.nbd_mirror_failed = 1
    with e ->
      (* Best effort: still fall back to the cached status, but record why
         the live query failed so the cause is not silently lost *)
      D.debug "Using cached copy of failure status (%s)"
        (Printexc.to_string e) ;
      failed
  )
  | Some {tapdev= None; _} | None ->
      false

(* Stub: ignores the context and calls [u] with this function's name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let list _ctx = u __FUNCTION__

(* Stub: ignores the context and calls [u] with this function's name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let stat _ctx = u __FUNCTION__
end
9 changes: 9 additions & 0 deletions ocaml/xapi/storage_smapiv1_wrapper.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1219,6 +1219,15 @@ functor

(* Stub: ignores every argument and calls [u] with this function's name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let receive_cancel2 _context ~dbg:_ ~mirror_id:_ ~url:_ ~verify_dest:_ =
u __FUNCTION__

(* Stub: ignores every argument and calls [u] with this function's name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let pre_deactivate_hook _context ~dbg:_ ~dp:_ ~sr:_ ~vdi:_ =
u __FUNCTION__

(* Stub: ignores every argument and calls [u] with this function's name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let is_mirror_failed _context ~dbg:_ ~mirror_id:_ ~sr:_ = u __FUNCTION__

(* Stub: ignores every argument and calls [u] with this function's name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let list _context ~dbg:_ = u __FUNCTION__

(* Stub: ignores every argument and calls [u] with this function's name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let stat _context ~dbg:_ ~id:_ = u __FUNCTION__
end
end

Expand Down
8 changes: 8 additions & 0 deletions ocaml/xapi/storage_smapiv3_migrate.ml
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,12 @@ module MIRROR : SMAPIv2_MIRROR = struct
(* Stub: ignores the context and calls [u] with this function's name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let receive_cancel _ctx = u __FUNCTION__

(* Stub: ignores the context and calls [u] with this function's name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let receive_cancel2 _ctx = u __FUNCTION__

(* Stub: ignores the context and calls [u] with this function's name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let is_mirror_failed _ctx = u __FUNCTION__

(* Stub: ignores the context and calls [u] with this function's name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let pre_deactivate_hook _ctx = u __FUNCTION__

(* Stub: ignores the context and calls [u] with this function's name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let list _ctx = u __FUNCTION__

(* Stub: ignores the context and calls [u] with this function's name.
   NOTE(review): [u] is defined outside this excerpt; presumably it reports
   the call as unimplemented - confirm. *)
let stat _ctx = u __FUNCTION__
end
4 changes: 2 additions & 2 deletions ocaml/xapi/xapi_vm_migrate.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1077,7 +1077,7 @@ let vdi_copy_fun __context dbg vdi_map remote is_intra_pool remote_vdis so_far
(None, vdi.vdi)
) else
let mirrorid = task_result |> mirror_of_task dbg in
let m = Storage_migrate.stat ~dbg ~id:mirrorid in
let m = SMAPI.DATA.MIRROR.stat dbg mirrorid in
(Some mirrorid, m.Mirror.dest_vdi)
in
so_far := Int64.add !so_far vconf.size ;
Expand Down Expand Up @@ -1106,7 +1106,7 @@ let vdi_copy_fun __context dbg vdi_map remote is_intra_pool remote_vdis so_far
match mirror_id with
| Some mid ->
ignore (Storage_access.unregister_mirror mid) ;
let m = Storage_migrate.stat ~dbg ~id:mid in
let m = SMAPI.DATA.MIRROR.stat dbg mid in
(try Storage_migrate.stop ~dbg ~id:mid with _ -> ()) ;
m.Mirror.failed
| None ->
Expand Down
Loading