From 152b480ac5401d8000cdddfb7bd9d6df63d6a8f1 Mon Sep 17 00:00:00 2001 From: Thomas Applencourt Date: Wed, 4 Sep 2024 15:36:13 -0500 Subject: [PATCH] Add archive (#287) Enable usage of session rotation for lossless online trace consumption. --------- Co-authored-by: Thomas Applencourt --- .github/workflows/presubmit.yml | 8 ++-- integration_tests/general.bats | 4 ++ utils/babeltrace_thapi.in | 9 +++++ xprof/xprof.rb.in | 71 ++++++++++++++++++++++++++++----- 4 files changed, 79 insertions(+), 13 deletions(-) diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml index 6ec61195..58eaa44b 100644 --- a/.github/workflows/presubmit.yml +++ b/.github/workflows/presubmit.yml @@ -190,7 +190,7 @@ jobs: run: tar -xvf thapi.tar - run: sudo apt update; sudo apt install -y $APT_PACKAGE bats coreutils libpocl2 clinfo - run: sudo gem install babeltrace2 opencl_ruby_ffi - - name: Load Babeltrace2 + - name: Load Efficios Dependencies run: | echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV @@ -214,7 +214,7 @@ jobs: key: ${{ runner.os }}-build-${{ env.cache-name }} - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - - name: Load Babeltrace2 + - name: Load Efficios Dependencies run: | echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV @@ -252,7 +252,7 @@ jobs: key: ${{ runner.os }}-build-${{ env.cache-name }} - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - - name: Load Babeltrace2 + - name: Load Efficios Dependencies run: | echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV @@ -284,7 +284,7 @@ jobs: key: ${{ runner.os }}-build-${{ 
env.cache-name }} - run: sudo apt update; sudo apt install -y $APT_PACKAGE - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - - name: Load Babeltrace2 + - name: Load Efficios Dependencies run: | echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV diff --git a/integration_tests/general.bats b/integration_tests/general.bats index b4784770..e3aeed4a 100644 --- a/integration_tests/general.bats +++ b/integration_tests/general.bats @@ -22,6 +22,10 @@ teardown_file() { rm out.pftrace } +@test "archive_summary" { + $IPROF --archive $THAPI_TEST_BIN +} + @test "replay_summary" { $IPROF $THAPI_TEST_BIN $IPROF -r diff --git a/utils/babeltrace_thapi.in b/utils/babeltrace_thapi.in index fe344861..08125a0a 100755 --- a/utils/babeltrace_thapi.in +++ b/utils/babeltrace_thapi.in @@ -130,6 +130,7 @@ def get_components(names) components_classes = { 'source.ctf.fs' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('fs'), 'source.ctf.lttng_live' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('lttng-live'), + 'source.ctf.lttng_archive' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('lttng-archive'), 'filter.utils.muxer' => BT2::BTPlugin.find('utils').get_filter_component_class_by_name('muxer'), 'sink.text.pretty' => BT2::BTPlugin.find('text').get_sink_component_class_by_name('pretty'), 'sink.ctf.fs' => BT2::BTPlugin.find('ctf').get_sink_component_class_by_name('fs'), @@ -200,6 +201,10 @@ def get_and_add_components(graph, names, l_inputs) graph.add(comp, 'source_live', params: { 'inputs' => $options[:inputs], 'session-not-found-action' => 'end' }) + when 'source.ctf.lttng_archive' + graph.add(comp, 'source_archive', + params: { 'session-name' => $options[:archive], + 'session-found-file-path' => $options[:'archive-session-found-file-path'] }) when 'source.ctf.fs' s = Find.find(*l_inputs) .reject { |path| 
FileTest.directory?(path) } @@ -281,6 +286,8 @@ def bt_graphs(inputs) @bt_graphs[inputs] ||= begin g_comps = [if $options[:live] 'source.ctf.lttng_live' + elsif $options[:archive] + 'source.ctf.lttng_archive' else 'source.ctf.fs' end] @@ -354,6 +361,8 @@ class BabeltraceParserThapi < OptionParserWithDefaultAndValidation 'Format: backend_name[:backend_level],...', default: ['mpi:3', 'omp:2', 'cl:1', 'ze:1', 'cuda:1', 'hip:1']) on('--debug', default: false) + on('--archive SESSION-NAME') + on('--archive-session-found-file-path PATH') on('--[no-]muxer') on('-v', '--version', 'Print the version string') do puts File.read(File.join(DATADIR, 'version')) diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index 75d0ac78..982b5259 100755 --- a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -20,6 +20,11 @@ PREFIX = '@prefix@' DATAROOTDIR = File.join(PREFIX, 'share') DATADIR = DATAROOTDIR +LTTNG_ARCHIVE_SIZE = '50M' +LTTNG_ARCHIVE_TIMER = '60s' +LTTNG_DIRWATCH_SIZE = '500' # In MiB +LTTNG_DIRWATCH_LOCK_RETRY_DELAY = 0.1 + class XprofExitCode @@exit_code = 0 def self.update(status, name) @@ -607,12 +612,18 @@ def lm_setup_lttng(backends) end end + # This is required to force the creation of a trace, + # so that dirwatch doesn't complain about an empty trace + if OPTIONS[:archive] + exec("lttng enable-rotation --session=#{lttng_session_uuid} --size=#{LTTNG_ARCHIVE_SIZE} --timer=#{LTTNG_ARCHIVE_TIMER}") + end exec("lttng start #{lttng_session_uuid}") end def lm_lttng_teardown_session raise unless mpi_local_master? 
+ exec("lttng rotate #{lttng_session_uuid}") if OPTIONS[:archive] exec("lttng destroy #{lttng_session_uuid}") end @@ -645,7 +656,27 @@ def lm_babeltrace(backends) opts << "--output #{thapi_trace_dir_tmp}" opts << "--backends #{backends.join(',')}" opts << '--no-discard-metadata' if type == 'aggreg' && OPTIONS.include?(:'kernel-verbose') - exec("#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}") + + if OPTIONS[:archive] + read_file = File.join(lttng_trace_dir_tmp, 'bt_archive_ready') + opts << "--archive #{lttng_session_uuid} --archive-session-found-file-path=#{read_file}" + cmd = "#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}" + LOGGER.debug(cmd) + pid_bt = spawn(cmd) + + cmd = "dirwatch.py --log-level=CRITICAL #{lttng_session_uuid} #{LTTNG_DIRWATCH_SIZE}" + LOGGER.debug(cmd) + pid_dirwatch = spawn(cmd) + + until File.exist?(read_file) + # Ensure that dirwatch.py didn't crash, and deadlock + Process.wait(pid_dirwatch, Process::WNOHANG) + sleep(LTTNG_DIRWATCH_LOCK_RETRY_DELAY) + end + [pid_bt, pid_dirwatch] + else + exec("#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}") + end end # _ @@ -663,6 +694,11 @@ def lm_move_to_shared if OPTIONS.include?(:trace) || !OPTIONS[:analysis] # The Apps finished, lttng finished, need to move to the shared tmp folder FileUtils.mkdir_p(File.dirname(thapi_trace_dir_tmp)) + # NOTE: I don't understand `mv` + # File.mv(a, b) will put a into b (aka a/b) + # FileUtils.rename(a,b) will move a as b, but may + # raise Invalid cross-device error. 
+ # So we use `exec(mv -T a b)`, this has the added benefit of logging exec("mv #{lttng_trace_dir_tmp} #{thapi_trace_dir_tmp}") else # `lm_babeltrace` finished, can remove `tmp` folder @@ -694,15 +730,25 @@ end # Start, Stop lttng, amd do the on-node analsysis def trace_and_on_node_processing(usr_argv) - def teardown_lttng(syncd) + def teardown_lttng(syncd, pids) # We need to be sure that all the local ranks are finished - # before the local master stops the lttng session syncd.local_barrier('waiting_for_application_ending') + + # Everything from now on, is some local-master processing + # The `Sync_daemon` context will handle the call to the global barrier + # for the early exiting ranks return unless mpi_local_master? - # Stop Lttng session + # Stop Lttng session and babeltrace daemons lm_lttng_teardown_session - # Lttng session is finished, + if OPTIONS[:archive] + LOGGER.debug("Waiting for babeltrace_thapi and dirwatch (#{pids}) to finish") + pids.each do |pid| + Process.wait(pid) + XprofExitCode.update($?.exitstatus) + Logger.warn("#{pid} failed") unless $?.success? + end + end # we can kill the session daemon lm_lttng_kill_sessiond end @@ -715,24 +761,30 @@ def trace_and_on_node_processing(usr_argv) # so they can have access to the daemon ENV['LTTNG_HOME'] = lttng_home_dir # Only local master spawn LTTNG daemon and start session - lm_setup_lttng(backends) if mpi_local_master? + pids = if mpi_local_master? + lm_setup_lttng(backends) + lm_babeltrace(backends) if OPTIONS[:archive] + end + syncd.local_barrier('waiting_for_lttng_setup') # Launch User Command begin XprofExitCode.update(launch_usr_bin(h, usr_argv), usr_argv.join(' ')) rescue Errno::ENOENT - teardown_lttng(syncd) + teardown_lttng(syncd, pids) raise end - teardown_lttng(syncd) + teardown_lttng(syncd, pids) return unless mpi_local_master? 
# Preprocess trace - lm_babeltrace(backends) + lm_babeltrace(backends) unless OPTIONS[:archive] lm_move_to_shared end + # Global master rename the unique trace folder to a more + # human friendly name gm_rename_folder if mpi_master? end @@ -826,6 +878,7 @@ if __FILE__ == $PROGRAM_NAME parser.on('-b', '--backends BACKENDS', Array, "Select which and how backends' need to handled.", 'Format: backend_name[:backend_level],...', default: ['mpi:3', 'omp:2', 'cl:1', 'ze:1', 'cuda:1', 'hip:1']) + parser.on('--[no-]archive', 'Trigger for ardhive support', default: false) # Analysis parser.on('-r', '--replay [PATH]', 'Replay traces for post-mortem analysis.',