diff --git a/integration_tests/general.bats b/integration_tests/general.bats index 2279905e..e3aeed4a 100644 --- a/integration_tests/general.bats +++ b/integration_tests/general.bats @@ -71,3 +71,11 @@ teardown_file() { [ "$status" != 0 ] rm out.pftrace } + +@test "exit_code_propagated" { + run $IPROF -- bash -c "exit 55" + [ "$status" == 55 ] + + run $IPROF --no-analysis -- bash -c "exit 55" + [ "$status" == 55 ] +} diff --git a/xprof/sync_daemon_fs b/xprof/sync_daemon_fs index dea845b7..c04c0d10 100755 --- a/xprof/sync_daemon_fs +++ b/xprof/sync_daemon_fs @@ -61,19 +61,19 @@ global_handle = nil parent_pid = nil # Set trap -Signal.trap(Sync_daemon::RT_SIGNAL_GLOBAL_BARRIER) do +Signal.trap(SyncDaemon::RT_SIGNAL_GLOBAL_BARRIER) do global_barrier(global_handle) - Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid) + Process.kill(SyncDaemon::RT_SIGNAL_READY, parent_pid) end local_barier_count = 0 -Signal.trap(Sync_daemon::RT_SIGNAL_LOCAL_BARRIER) do +Signal.trap(SyncDaemon::RT_SIGNAL_LOCAL_BARRIER) do local_barier(local_barier_count.to_s) local_barier_count += 1 - Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid) + Process.kill(SyncDaemon::RT_SIGNAL_READY, parent_pid) end -Signal.trap(Sync_daemon::RT_SIGNAL_FINISH) do +Signal.trap(SyncDaemon::RT_SIGNAL_FINISH) do # We cannot delete SHARED_LOCAL_FILESYSTEM # Some rank can exit the `global_barier` (hence calling this function) # when others ranks are still in the `local_barrier` @@ -83,12 +83,12 @@ Signal.trap(Sync_daemon::RT_SIGNAL_FINISH) do # is to make all ranks busy_wait in the `global_barrier`. # This will ensure that every-one exited the `local_barrier`. # but given the poor performance of our FS, we will avoid that for now... 
- Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid) + Process.kill(SyncDaemon::RT_SIGNAL_READY, parent_pid) exit end # Init global barrier global_handle = init_global_barrier parent_pid = ARGV[0].to_i -Process.kill(Sync_daemon::RT_SIGNAL_READY, parent_pid) +Process.kill(SyncDaemon::RT_SIGNAL_READY, parent_pid) sleep diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index 75a52100..0ca9e03f 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -18,6 +18,18 @@ LTTNG_ARCHIVE_TIMER = '60s' LTTNG_DIRWATCH_SIZE = '500' # In MiB LTTNG_DIRWATCH_LOCK_RETRY_DELAY = 0.1 +class XprofExitCode + @@exit_code = 0 + def self.update(exit_code) + # Keep only the first error + @@exit_code = exit_code if @@exit_code == 0 + end + + def self.get + @@exit_code + end +end + $LOAD_PATH.unshift(DATADIR) if File.directory?(DATADIR) require 'open3' require 'fileutils' @@ -238,7 +250,7 @@ end # |_) _. ._ ._ o _ ._ # |_) (_| | | | (/_ | # -class Sync_daemon +class SyncDaemon SIGRTMIN = 34 RT_SIGNAL_READY = SIGRTMIN RT_SIGNAL_GLOBAL_BARRIER = SIGRTMIN + 1 @@ -285,13 +297,13 @@ class Sync_daemon end LOGGER.debug { "spawn(#{daemon} #{Process.pid})" } - lazy_exec("Initialize Sync_daemon #{daemon_type}") do + lazy_exec("Initialize SyncDaemon #{daemon_type}") do @pid = spawn("#{daemon} #{Process.pid}") end end def finalize - lazy_exec('Finalize Sync_daemon') do + lazy_exec('Finalize SyncDaemon') do `kill -#{RT_SIGNAL_FINISH} #{@pid}` end end @@ -311,14 +323,15 @@ class Sync_daemon # Context manager, ensure that when the block yield is exited # we always call clean-up the daemon def self.open - yield f = new - rescue StandardError - raise + yield syncd = new + rescue Errno::ENOENT + exit(1) ensure - return unless f - - f.global_barrier - f.finalize + # https://www.rubydoc.info/gems/rubocop/RuboCop/Cop/Lint/EnsureReturn + if syncd + syncd.global_barrier + syncd.finalize + end end end @@ -348,11 +361,10 @@ def env_tracers backends = [] [%w[opencl cl libOpenCL libTracerOpenCL], - %w[ze ze 
libze_loader libTracerZE], + %w[ze ze libze_loader libze_loader], %w[cuda cuda libcuda libTracerCUDA], %w[hip hip libamdhip64 libTracerHIP], - %w[mpi mpi libmpi libTracerMPI], - ].each do |name, bt_name, lib, libtracer| + %w[mpi mpi libmpi libTracerMPI]].each do |name, bt_name, lib, libtracer| # Backend requested, skip omp. It will be handled in a custom case bellow next unless OPTIONS[:'backend-names'].include?(bt_name) @@ -393,7 +405,7 @@ def env_tracers h['LTTNG_UST_SAMPLING'] = 1 h['LTTNG_UST_SAMPLING_ENERGY'] = 1 # The current only reliable way to use zes api - # is to call zesInit and set ZES_ENABLE_SYSMAN to 0 + # is to call zesInit and set ZES_ENABLE_SYSMAN to 0 h['ZES_ENABLE_SYSMAN'] = 0 if OPTIONS[:'backend-names'].include?('ze') end @@ -432,14 +444,20 @@ def launch_usr_bin(env, cmd) begin PTY.spawn(bash_env, *cmd) do |stdout, _stdin, _pid| + # Reading stdout will trigger Errno::EIO stdout.each { |line| print line } rescue Errno::EIO + # Wait for the PTY to finish, to set $? + Process.wait(_pid) + return $?.exitstatus end - # Not sure how this exception can be triggered - rescue PTY::ChildExited - LOGGER.warn { 'Application Exited' } rescue Interrupt LOGGER.warn { 'Application Received Interrupt Signal' } + # SigINT is 2 + 2 + rescue Errno::ENOENT + warn("#{__FILE__}: Can't find executable #{cmd.first}") + raise end end @@ -693,29 +711,8 @@ end # Start, Stop lttng, amd do the on-node analsysis def trace_and_on_node_processing(usr_argv) - # Global barrier at exit - Sync_daemon.open do |syncd| - # Load Tracers and APILoaders Lib - backends, h = env_tracers - - # All ranks need to set the LLTTNG_HOME env - # so they can have access to the daemon - ENV['LTTNG_HOME'] = lttng_home_dir - LOGGER.debug("LTTNG_HOME = #{ENV.fetch('LTTNG_HOME', nil)}") - - # Only local master spawn daemons (lttng, and babeltrace) - # and the start the lttng-session - pids = if mpi_local_master? 
- lm_setup_lttng(backends) - lm_babeltrace(backends) if OPTIONS[:archive] - end - # Other local node cannot start before lttng and the daemon - syncd.local_barrier('waiting_for_lttng_setup') - # Launch User Command - launch_usr_bin(h, usr_argv) - # We need to ensure that all the local ranks have finished - # running the user application - # before the local master stops the lttng session + def teardown_lttng(syncd, pids) + # We need to be sure that all the local ranks are finished syncd.local_barrier('waiting_for_application_ending') # Everything from now on, is some local-master processing @@ -729,11 +726,40 @@ def trace_and_on_node_processing(usr_argv) LOGGER.debug("Waiting for babeltrace_thapi and dirwatch (#{pids}) to finish") pids.each do |pid| Process.wait(pid) - raise "#{pid} failed" unless $?.success? + XprofExitCode.update($?.exitstatus) + LOGGER.warn("#{pid} failed") unless $?.success? end end # we can kill the session daemon lm_lttng_kill_sessiond + end + + SyncDaemon.open do |syncd| + # Load Tracers and APILoaders Lib + backends, h = env_tracers + + # All ranks need to set the LTTNG_HOME env + # so they can have access to the daemon + ENV['LTTNG_HOME'] = lttng_home_dir + # Only local master spawn LTTNG daemon and start session + pids = if mpi_local_master? + lm_setup_lttng(backends) + lm_babeltrace(backends) if OPTIONS[:archive] + end + + syncd.local_barrier('waiting_for_lttng_setup') + + # Launch User Command + begin + XprofExitCode.update(launch_usr_bin(h, usr_argv)) + rescue Errno::ENOENT + teardown_lttng(syncd, pids) + raise + end + + teardown_lttng(syncd, pids) + return unless mpi_local_master? + # Preprocess trace lm_babeltrace(backends) unless OPTIONS[:archive] lm_move_to_shared @@ -784,7 +810,7 @@ def gm_processing(folder) fo.close end - exit(1) unless $?.success? + $?.exitstatus end # @@ -906,8 +932,12 @@ if __FILE__ == $PROGRAM_NAME # Right now, `replay` means no tracing. 
# But we don't have a way of disabling post-processing folder = OPTIONS.include?(:replay) ? OPTIONS[:replay] || last_trace_saved : trace_and_on_node_processing(ARGV) + if mpi_master? warn("THAPI: Trace location: #{folder}") - gm_processing(folder) if OPTIONS[:analysis] + XprofExitCode.update(gm_processing(folder)) if OPTIONS[:analysis] end + + exit(XprofExitCode.get) + end diff --git a/ze/Makefile.am b/ze/Makefile.am index 3e06da70..60f88f30 100644 --- a/ze/Makefile.am +++ b/ze/Makefile.am @@ -172,21 +172,22 @@ libzetracepoints_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/inclu libzetracepoints_la_CFLAGS = -fPIC -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Wno-sign-compare $(WERROR) $(LTTNG_UST_CFLAGS) libzetracepoints_la_LDFLAGS = $(LTTNG_UST_LIBS) -lib_LTLIBRARIES = libTracerZE.la libZEInterval.la +lib_LTLIBRARIES = libze_loader.la libZEInterval.la -nodist_libTracerZE_la_SOURCES = \ +nodist_libze_loader_la_SOURCES = \ $(ZE_PROBES_INCL) \ $(ZE_STATIC_PROBES_INCL) \ tracer_ze.c -libTracerZE_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(top_srcdir)/sampling -I$(srcdir)/include -I$(top_srcdir)/utils -I./ -libTracerZE_la_CFLAGS = -Wall -Wextra $(WERROR) $(LIBFFI_CFLAGS) $(LTTNG_UST_CFLAGS) -libTracerZE_la_LDFLAGS = $(LTTNG_UST_LIBS) -ldl -lpthread $(LIBFFI_LIBS) ../sampling/libThapiSampling.la -libTracerZE_la_LIBADD = libzetracepoints.la +libze_loader_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(top_srcdir)/sampling -I$(srcdir)/include -I$(top_srcdir)/utils -I./ +libze_loader_la_CFLAGS = -Wall -Wextra $(WERROR) $(LIBFFI_CFLAGS) $(LTTNG_UST_CFLAGS) +libze_loader_la_LDFLAGS = $(LTTNG_UST_LIBS) -ldl -lpthread $(LIBFFI_LIBS) ../sampling/libThapiSampling.la +libze_loader_la_LDFLAGS += -version-info 1:0:0 +libze_loader_la_LIBADD = libzetracepoints.la install-exec-hook: $(MKDIR_P) $(DESTDIR)$(pkglibdir)/ze - $(LN_S) -f $(DESTDIR)$(libdir)/libTracerZE.so.0.0.0 $(DESTDIR)$(pkglibdir)/ze/libze_loader.so.1 + 
$(LN_S) -f $(DESTDIR)$(libdir)/libze_loader.so.1.0.0 $(DESTDIR)$(pkglibdir)/ze/libze_loader.so.1 $(LN_S) -f $(DESTDIR)$(pkglibdir)/ze/libze_loader.so.1 $(DESTDIR)$(pkglibdir)/ze/libze_loader.so $(MKDIR_P) $(DESTDIR)$(pkglibdir)/bt2 $(LN) -f $(DESTDIR)$(libdir)/libZEInterval.so $(DESTDIR)$(pkglibdir)/bt2/libZEInterval.so diff --git a/ze/tracer_ze.sh.in b/ze/tracer_ze.sh.in index 42263c7e..0185ac43 100644 --- a/ze/tracer_ze.sh.in +++ b/ze/tracer_ze.sh.in @@ -126,7 +126,7 @@ then else export LD_LIBRARY_PATH=$pkglibdir/ze:$LD_LIBRARY_PATH fi -export LD_PRELOAD=$libdir/libTracerZE.so:$LD_PRELOAD +export LD_PRELOAD=$libdir/libze_loader.so:$LD_PRELOAD export LTTNG_UST_ALLOW_BLOCKING=1 export LTTNG_UST_ZE_VERBOSE=1 lttng start diff --git a/ze/tracer_ze_helpers.include.c b/ze/tracer_ze_helpers.include.c index e58f9f0e..b6b6c113 100644 --- a/ze/tracer_ze_helpers.include.c +++ b/ze/tracer_ze_helpers.include.c @@ -1112,6 +1112,7 @@ static int initializeHandles() { _sampling_hSubDevices[driverIdx] = (ze_device_handle_t **)calloc( _sampling_deviceCount[driverIdx], sizeof(ze_device_handle_t *)); for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { + zes_device_properties_t deviceProps = {0}; deviceProps.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES; deviceProps.pNext = NULL; @@ -1127,7 +1128,7 @@ static int initializeHandles() { (ze_device_handle_t)_sampling_hDevices[driverIdx][deviceIdx], &_sampling_subDeviceCount[driverIdx][deviceIdx], NULL); if (res != ZE_RESULT_SUCCESS) { - _ZE_ERROR_MSG("ZE_DEVICE_GET_SUB_DEVICES_PTR", res); + _ZE_ERROR_MSG("ZES_DEVICE_GET_PROPERTIES_PTR", res); _sampling_subDeviceCount[driverIdx][deviceIdx] = 0; } if (_sampling_subDeviceCount[driverIdx][deviceIdx] > 0) {