From 7ada7349447cd4e46d3a1954356c7241d0716a9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Wed, 7 Aug 2024 17:11:32 +0100 Subject: [PATCH 1/4] CA-388210: factor out computing the domain parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We are going to change how we compute it, so factor it out into a function. No functional change. Signed-off-by: Edwin Török --- ocaml/xapi-storage-script/main.ml | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/ocaml/xapi-storage-script/main.ml b/ocaml/xapi-storage-script/main.ml index c69d28847d..a3457ae309 100644 --- a/ocaml/xapi-storage-script/main.ml +++ b/ocaml/xapi-storage-script/main.ml @@ -66,6 +66,8 @@ let backend_backtrace_error name args backtrace = let missing_uri () = backend_error "MISSING_URI" ["Please include a URI in the device-config"] +let domain_of ~dp:_ ~vm' = Storage_interface.Vm.string_of vm' + (** Functions to wrap calls to the above client modules and convert their exceptions and errors into SMAPIv2 errors of type [Storage_interface.Exception.exnty]. The above client modules should only @@ -1432,9 +1434,9 @@ let bind ~volume_script_dir = |> wrap in S.VDI.introduce vdi_introduce_impl ; - let vdi_attach3_impl dbg _dp sr vdi' vm' _readwrite = + let vdi_attach3_impl dbg dp sr vdi' vm' _readwrite = (let vdi = Storage_interface.Vdi.string_of vdi' in - let domain = Storage_interface.Vm.string_of vm' in + let domain = domain_of ~dp ~vm' in vdi_attach_common dbg sr vdi domain >>>= fun response -> let convert_implementation = function | Xapi_storage.Data.XenDisk {params; extra; backend_type} -> @@ -1456,9 +1458,9 @@ let bind ~volume_script_dir = |> wrap in S.VDI.attach3 vdi_attach3_impl ; - let vdi_activate_common dbg sr vdi' vm' readonly = + let vdi_activate_common dbg dp sr vdi' vm' readonly = (let vdi = Storage_interface.Vdi.string_of vdi' in - let domain = Storage_interface.Vm.string_of vm' in + let domain = domain_of ~dp ~vm' in Attached_SRs.find sr >>>= fun sr -> (* Discover the URIs using Volume.stat *) stat ~dbg ~sr ~vdi >>>= fun response -> @@ -1483,17 +1485,17 @@ let bind ~volume_script_dir = ) |> wrap in - let vdi_activate3_impl dbg _dp sr vdi' vm' = - vdi_activate_common dbg sr vdi' vm' false + let vdi_activate3_impl dbg dp sr vdi' vm' = + vdi_activate_common dbg dp sr vdi' vm' false in S.VDI.activate3 vdi_activate3_impl ; - let vdi_activate_readonly_impl dbg _dp sr vdi' vm' = - vdi_activate_common dbg sr vdi' vm' true + let vdi_activate_readonly_impl dbg dp sr vdi' vm' = + vdi_activate_common dbg dp sr vdi' vm' true in S.VDI.activate_readonly vdi_activate_readonly_impl ; - let vdi_deactivate_impl dbg _dp sr vdi' vm' = + let vdi_deactivate_impl dbg dp sr vdi' vm' = (let vdi = Storage_interface.Vdi.string_of vdi' in - let domain = Storage_interface.Vm.string_of vm' in + let domain = domain_of ~dp ~vm' in Attached_SRs.find sr >>>= fun sr -> (* Discover the URIs using Volume.stat *) stat ~dbg ~sr ~vdi >>>= fun response -> @@ -1514,9 +1516,9 @@ let bind ~volume_script_dir = |> wrap in S.VDI.deactivate vdi_deactivate_impl ; - let vdi_detach_impl dbg _dp sr vdi' vm' = + let vdi_detach_impl dbg dp sr vdi' vm' = (let vdi = Storage_interface.Vdi.string_of vdi' in - let domain = Storage_interface.Vm.string_of vm' in + let domain = domain_of ~dp ~vm' in Attached_SRs.find sr >>>= fun sr -> (* Discover the URIs using Volume.stat *) stat ~dbg ~sr ~vdi >>>= fun response -> @@ -1627,9 +1629,9 @@ let bind ~volume_script_dir = S.VDI.epoch_end vdi_epoch_end_impl ; let vdi_set_persistent_impl _dbg _sr _vdi _persistent = return () |> wrap in S.VDI.set_persistent vdi_set_persistent_impl ; - let dp_destroy2 dbg _dp sr vdi' vm' _allow_leak = + let dp_destroy2 dbg dp sr vdi' vm' _allow_leak = (let vdi = Storage_interface.Vdi.string_of vdi' in - let domain = Storage_interface.Vm.string_of vm' in + let domain = domain_of ~dp ~vm' in Attached_SRs.find sr >>>= fun sr -> (* Discover the URIs using Volume.stat *) stat ~dbg ~sr ~vdi >>>= fun response -> From 864d73420e654b5395a6e98a60615ed292a6e1dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Wed, 7 Aug 2024 17:12:33 +0100 Subject: [PATCH 2/4] CA-388210: SMAPIv3 concurrency safety: send the (unique) datapath argument as domain for Dom0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dom0 allows multiple attaches of RO disks, used e.g. for VDI.copy. Send a unique value to SMAPIv3 plugins to avoid bugs to the lack of reference counting in those plugins. XAPI already sends a unique value here, either vbd/domid/device, or a fresh UUID (for storage migration). Signed-off-by: Edwin Török --- ocaml/xapi-storage-script/main.ml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/ocaml/xapi-storage-script/main.ml b/ocaml/xapi-storage-script/main.ml index a3457ae309..d76ff039e7 100644 --- a/ocaml/xapi-storage-script/main.ml +++ b/ocaml/xapi-storage-script/main.ml @@ -66,7 +66,22 @@ let backend_backtrace_error name args backtrace = let missing_uri () = backend_error "MISSING_URI" ["Please include a URI in the device-config"] -let domain_of ~dp:_ ~vm' = Storage_interface.Vm.string_of vm' +(** return a unique 'domain' string for Dom0, so that we can plug disks + multiple times (e.g. for copy). + + XAPI should give us a unique 'dp' (datapath) string, e.g. a UUID for storage migration, + or vbd/domid/device. + For regular guests keep the domain as passed by XAPI (an integer). + *) +let domain_of ~dp ~vm' = + let vm = Storage_interface.Vm.string_of vm' in + match vm with + | "0" -> + (* SM tries to use this in filesystem paths, so cannot have /, + and systemd might be a bit unhappy with - *) + "u0-" ^ dp |> String.map ~f:(function '/' | '-' -> '_' | c -> c) + | _ -> + vm (** Functions to wrap calls to the above client modules and convert their exceptions and errors into SMAPIv2 errors of type From d5cd034ad38e892831ae103b595455dcee9486af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Wed, 7 Aug 2024 14:24:58 +0100 Subject: [PATCH 3/4] CA-388210: SMAPIv3 debugging: log PID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Log PID on successful and failed operations, and log full cmdline for newly spawned processes. This can be used to debug stuck scripts, so that we know which invocation is the one that is stuck. Signed-off-by: Edwin Török --- ocaml/xapi-storage-script/lib.ml | 4 +++- ocaml/xapi-storage-script/lib.mli | 3 ++- ocaml/xapi-storage-script/main.ml | 28 ++++++++++++++++----------- ocaml/xapi-storage-script/test_lib.ml | 25 ++++++++++++++++++++---- 4 files changed, 43 insertions(+), 17 deletions(-) diff --git a/ocaml/xapi-storage-script/lib.ml b/ocaml/xapi-storage-script/lib.ml index 9c9059432b..a3beb9a800 100644 --- a/ocaml/xapi-storage-script/lib.ml +++ b/ocaml/xapi-storage-script/lib.ml @@ -131,6 +131,7 @@ module Process = struct type t = { exit_status: (unit, exit_or_signal) Result.t + ; pid: int ; stdout: string ; stderr: string } @@ -176,6 +177,7 @@ module Process = struct let run ~env ~prog ~args ~input = let ( let@ ) f x = f x in let@ p = with_process ~env ~prog ~args in + let pid = p#pid in let sender = send p#stdin input in let receiver_out = receive p#stdout in let receiver_err = receive p#stderr in @@ -185,7 +187,7 @@ module Process = struct Lwt.both sender receiver >>= fun ((), (stdout, stderr)) -> p#status >>= fun status -> let exit_status = Output.exit_or_signal_of_unix status in - Lwt.return {Output.exit_status; stdout; stderr} + Lwt.return {Output.exit_status; pid; stdout; stderr} ) (function | Lwt.Canceled as exn -> diff --git a/ocaml/xapi-storage-script/lib.mli b/ocaml/xapi-storage-script/lib.mli index a55c4b81fb..eae9183a17 100644 --- a/ocaml/xapi-storage-script/lib.mli +++ b/ocaml/xapi-storage-script/lib.mli @@ -65,6 +65,7 @@ module Process : sig type t = { exit_status: (unit, exit_or_signal) result + ; pid: int ; stdout: string ; stderr: string } @@ -78,7 +79,7 @@ module Process : sig -> Output.t Lwt.t (** Runs a cli program, writes [input] into its stdin, then closing the fd, and finally waits for the program to finish and returns the exit status, - its stdout and stderr. *) + the pid, and its stdout and stderr. *) end module DirWatcher : sig diff --git a/ocaml/xapi-storage-script/main.ml b/ocaml/xapi-storage-script/main.ml index d76ff039e7..a0b99ae91a 100644 --- a/ocaml/xapi-storage-script/main.ml +++ b/ocaml/xapi-storage-script/main.ml @@ -79,7 +79,7 @@ let domain_of ~dp ~vm' = | "0" -> (* SM tries to use this in filesystem paths, so cannot have /, and systemd might be a bit unhappy with - *) - "u0-" ^ dp |> String.map ~f:(function '/' | '-' -> '_' | c -> c) + "u0-" ^ dp |> String.map (function '/' | '-' -> '_' | c -> c) | _ -> vm @@ -477,6 +477,8 @@ let fork_exec_rpc : ) >>>= fun input -> let input = compat_in input |> Jsonrpc.to_string in + debug (fun m -> m "Running %s" @@ Filename.quote_command script_name args) + >>= fun () -> Process.run ~env ~prog:script_name ~args ~input >>= fun output -> let fail_because ~cause description = fail @@ -500,12 +502,13 @@ let fork_exec_rpc : with | Error _ -> error (fun m -> - m "%s failed and printed bad error json: %s" script_name - output.Process.Output.stdout + m "%s[%d] failed and printed bad error json: %s" script_name + output.pid output.Process.Output.stdout ) >>= fun () -> error (fun m -> - m "%s failed, stderr: %s" script_name output.Process.Output.stderr + m "%s[%d] failed, stderr: %s" script_name output.pid + output.Process.Output.stderr ) >>= fun () -> fail_because "non-zero exit and bad json on stdout" @@ -516,12 +519,12 @@ let fork_exec_rpc : with | Error _ -> error (fun m -> - m "%s failed and printed bad error json: %s" script_name - output.Process.Output.stdout + m "%s[%d] failed and printed bad error json: %s" script_name + output.pid output.Process.Output.stdout ) >>= fun () -> error (fun m -> - m "%s failed, stderr: %s" script_name + m "%s[%d] failed, stderr: %s" script_name output.pid output.Process.Output.stderr ) >>= fun () -> @@ -532,7 +535,9 @@ let fork_exec_rpc : ) ) | Error (Signal signal) -> - error (fun m -> m "%s caught a signal and failed" script_name) + error (fun m -> + m "%s[%d] caught a signal and failed" script_name output.pid + ) >>= fun () -> fail_because "signalled" ~cause:(Signal.to_string signal) | Ok () -> ( (* Parse the json on stdout. We get back a JSON-RPC @@ -544,8 +549,8 @@ let fork_exec_rpc : with | Error _ -> error (fun m -> - m "%s succeeded but printed bad json: %s" script_name - output.Process.Output.stdout + m "%s[%d] succeeded but printed bad json: %s" script_name + output.pid output.Process.Output.stdout ) >>= fun () -> fail @@ -554,7 +559,8 @@ let fork_exec_rpc : ) | Ok response -> info (fun m -> - m "%s succeeded: %s" script_name output.Process.Output.stdout + m "%s[%d] succeeded: %s" script_name output.pid + output.Process.Output.stdout ) >>= fun () -> let response = compat_out response in diff --git a/ocaml/xapi-storage-script/test_lib.ml b/ocaml/xapi-storage-script/test_lib.ml index e016d1368a..ca1d0a07a1 100644 --- a/ocaml/xapi-storage-script/test_lib.ml +++ b/ocaml/xapi-storage-script/test_lib.ml @@ -103,12 +103,20 @@ let test_run_status = let module P = Process in let test () = let* output = P.run ~prog:"true" ~args:[] ~input:"" ~env:[] in - let expected = P.Output.{exit_status= Ok (); stdout= ""; stderr= ""} in + let expected = + P.Output.{exit_status= Ok (); pid= output.pid; stdout= ""; stderr= ""} + in Alcotest.(check output_c) "Exit status is correct" expected output ; let* output = P.run ~prog:"false" ~args:[] ~input:"" ~env:[] in let expected = - P.Output.{exit_status= Error (Exit_non_zero 1); stdout= ""; stderr= ""} + P.Output. + { + exit_status= Error (Exit_non_zero 1) + ; pid= output.pid + ; stdout= "" + ; stderr= "" + } in Alcotest.(check output_c) "Exit status is correct" expected output ; @@ -121,7 +129,10 @@ let test_run_output = let test () = let content = "@@@@@@" in let* output = P.run ~prog:"cat" ~args:["-"] ~input:content ~env:[] in - let expected = P.Output.{exit_status= Ok (); stdout= content; stderr= ""} in + let expected = + P.Output. + {exit_status= Ok (); pid= output.pid; stdout= content; stderr= ""} + in Alcotest.(check output_c) "Stdout is correct" expected output ; let* output = P.run ~prog:"cat" ~args:[content] ~input:content ~env:[] in @@ -129,7 +140,13 @@ let test_run_output = Printf.sprintf "cat: %s: No such file or directory\n" content in let expected = - P.Output.{exit_status= Error (Exit_non_zero 1); stdout= ""; stderr} + P.Output. + { + exit_status= Error (Exit_non_zero 1) + ; pid= output.pid + ; stdout= "" + ; stderr + } in Alcotest.(check output_c) "Stderr is correct" expected output ; Lwt.return () From ce82302b5244f248a1dd736421b506499a5d402d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Wed, 7 Aug 2024 14:24:58 +0100 Subject: [PATCH 4/4] CA-388210: SMAPIv3 concurrency: turn on concurrent operations by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is believed that the deadlocks in SMAPIv3 were caused by VDI.copy operations that attach the same disk RO multiple times in Dom0. With the previous commits we now use a unique identifier and spawn a separate qemu-dp process in the SMAPIv3 plugins, which should prevent the deadlocks and IO errors due to lack of refcounting. Signed-off-by: Edwin Török --- ocaml/xapi-storage-script/main.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocaml/xapi-storage-script/main.ml b/ocaml/xapi-storage-script/main.ml index a0b99ae91a..e29f4937ab 100644 --- a/ocaml/xapi-storage-script/main.ml +++ b/ocaml/xapi-storage-script/main.ml @@ -1796,7 +1796,7 @@ let rec diff a b = (* default false due to bugs in SMAPIv3 plugins, once they are fixed this should be set to true *) -let concurrent = ref false +let concurrent = ref true type reload = All | Files of string list | Nothing