From a84fb5c63b3fc5376a271423b6bf2887c414282b Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Mon, 30 Sep 2024 22:04:33 +0000 Subject: [PATCH 1/2] better error message when bin doesn't exist --- xprof/xprof.rb.in | 49 +++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index 9b115c23..404987f3 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -307,12 +307,12 @@ class Sync_daemon # we always call clean-up the daemon def self.open yield f = new - rescue StandardError - raise ensure - return unless f - f.global_barrier - f.finalize + # https://www.rubydoc.info/gems/rubocop/RuboCop/Cop/Lint/EnsureReturn + if f + f.global_barrier + f.finalize + end end end @@ -346,8 +346,7 @@ def env_tracers %w[ze ze libze_loader libTracerZE], %w[cuda cuda libcuda libTracerCUDA], %w[hip hip libamdhip64 libTracerHIP], - %w[mpi mpi libmpi libTracerMPI], - ].each do |name, bt_name, lib, libtracer| + %w[mpi mpi libmpi libTracerMPI]].each do |name, bt_name, lib, libtracer| # Backend requested, skip omp. It will be handled in a custom case bellow next unless OPTIONS[:'backend-names'].include?(bt_name) @@ -430,6 +429,9 @@ def launch_usr_bin(env, cmd) LOGGER.warn { 'Application Exited' } rescue Interrupt LOGGER.warn { 'Application Received Interrupt Signal' } + rescue Errno::ENOENT + warn("#{__FILE__}: Can't find executable #{cmd.first}") + raise Errno::ENOENT end end @@ -570,11 +572,13 @@ end def lm_lttng_teardown_session raise unless mpi_local_master? + exec("lttng destroy #{lttng_session_uuid}") end def lm_lttng_kill_sessiond raise unless mpi_local_master? + # Need to kill the sessiond Daemon. It's safe because each job has their own # # In theory, opening the lttng-sessiond.pid file is racy. @@ -650,7 +654,19 @@ end # Start, Stop lttng, amd do the on-node analsysis def trace_and_on_node_processing(usr_argv) - # Global barrier at exit + def teardown_lttng(syncd) + # We need to be sure that all the local ranks are finished + # before the local master stops the lttng session + syncd.local_barrier('waiting_for_application_ending') + return unless mpi_local_master? + + # Stop Lttng session + lm_lttng_teardown_session + # Lttng session is finished, + # we can kill the session daemon + lm_lttng_kill_sessiond + end + Sync_daemon.open do |syncd| # Load Tracers and APILoaders Lib backends, h = env_tracers @@ -661,19 +677,18 @@ def trace_and_on_node_processing(usr_argv) # Only local master spawn LTTNG daemon and start session lm_setup_lttng(backends) if mpi_local_master? syncd.local_barrier('waiting_for_lttng_setup') + # Launch User Command - launch_usr_bin(h, usr_argv) + begin + launch_usr_bin(h, usr_argv) + rescue Errno::ENOENT + teardown_lttng(syncd) + exit(1) + end - # We need to be sure that all the local ranks are finished - # before the local master stops the lttng session - syncd.local_barrier('waiting_for_application_ending') + teardown_lttng(syncd) return unless mpi_local_master? - # Stop Lttng session - lm_lttng_teardown_session - # Lttng session is finished, - # we can kill the session daemon - lm_lttng_kill_sessiond # Preprocess trace lm_babeltrace(backends) lm_move_to_shared From 19c3b4ddb7f39777df402d91b34f975f54b04ada Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Mon, 30 Sep 2024 22:54:19 +0000 Subject: [PATCH 2/2] fix daemon not run --- xprof/xprof.rb.in | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index 404987f3..0a2ed09a 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -307,6 +307,8 @@ class Sync_daemon # we always call clean-up the daemon def self.open yield f = new + rescue Errno::ENOENT + exit(1) ensure # https://www.rubydoc.info/gems/rubocop/RuboCop/Cop/Lint/EnsureReturn if f @@ -683,7 +685,7 @@ def trace_and_on_node_processing(usr_argv) launch_usr_bin(h, usr_argv) rescue Errno::ENOENT teardown_lttng(syncd) - exit(1) + raise end teardown_lttng(syncd)