Skip to content

Commit

Permalink
Add archive (#287)
Browse files Browse the repository at this point in the history
Enable usage of session rotation for lossless online trace consumption.

---------

Co-authored-by: Thomas Applencourt <[email protected]>
  • Loading branch information
2 people authored and Thomas Applencourt committed Jan 6, 2025
1 parent 7d206e5 commit 152b480
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 13 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/presubmit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ jobs:
run: tar -xvf thapi.tar
- run: sudo apt update; sudo apt install -y $APT_PACKAGE bats coreutils libpocl2 clinfo
- run: sudo gem install babeltrace2 opencl_ruby_ffi
- name: Load Babeltrace2
- name: Load Efficios Dependencies
run: |
echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH
echo "LD_LIBRARY_PATH=$HOME/efficios_dep/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV
Expand All @@ -214,7 +214,7 @@ jobs:
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y $APT_PACKAGE
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
- name: Load Efficios Dependencies
run: |
echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH
echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV
Expand Down Expand Up @@ -252,7 +252,7 @@ jobs:
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y $APT_PACKAGE
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
- name: Load Efficios Dependencies
run: |
echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH
echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV
Expand Down Expand Up @@ -284,7 +284,7 @@ jobs:
key: ${{ runner.os }}-build-${{ env.cache-name }}
- run: sudo apt update; sudo apt install -y $APT_PACKAGE
- run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel
- name: Load Babeltrace2
- name: Load Efficios Dependencies
run: |
echo "$HOME/efficios_dep/bin" >> $GITHUB_PATH
echo "PKG_CONFIG_PATH=$HOME/efficios_dep/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV
Expand Down
4 changes: 4 additions & 0 deletions integration_tests/general.bats
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ teardown_file() {
rm out.pftrace
}

# Smoke test for archive mode: runs $IPROF with `--archive` on $THAPI_TEST_BIN.
@test "archive_summary" {
$IPROF --archive $THAPI_TEST_BIN
}

@test "replay_summary" {
$IPROF $THAPI_TEST_BIN
$IPROF -r
Expand Down
9 changes: 9 additions & 0 deletions utils/babeltrace_thapi.in
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ def get_components(names)
components_classes = {
'source.ctf.fs' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('fs'),
'source.ctf.lttng_live' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('lttng-live'),
'source.ctf.lttng_archive' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('lttng-archive'),
'filter.utils.muxer' => BT2::BTPlugin.find('utils').get_filter_component_class_by_name('muxer'),
'sink.text.pretty' => BT2::BTPlugin.find('text').get_sink_component_class_by_name('pretty'),
'sink.ctf.fs' => BT2::BTPlugin.find('ctf').get_sink_component_class_by_name('fs'),
Expand Down Expand Up @@ -200,6 +201,10 @@ def get_and_add_components(graph, names, l_inputs)
graph.add(comp, 'source_live',
params: { 'inputs' => $options[:inputs],
'session-not-found-action' => 'end' })
when 'source.ctf.lttng_archive'
graph.add(comp, 'source_archive',
params: { 'session-name' => $options[:archive],
'session-found-file-path' => $options[:'archive-session-found-file-path'] })
when 'source.ctf.fs'
s = Find.find(*l_inputs)
.reject { |path| FileTest.directory?(path) }
Expand Down Expand Up @@ -281,6 +286,8 @@ def bt_graphs(inputs)
@bt_graphs[inputs] ||= begin
g_comps = [if $options[:live]
'source.ctf.lttng_live'
elsif $options[:archive]
'source.ctf.lttng_archive'
else
'source.ctf.fs'
end]
Expand Down Expand Up @@ -354,6 +361,8 @@ class BabeltraceParserThapi < OptionParserWithDefaultAndValidation
'Format: backend_name[:backend_level],...',
default: ['mpi:3', 'omp:2', 'cl:1', 'ze:1', 'cuda:1', 'hip:1'])
on('--debug', default: false)
on('--archive SESSION-NAME')
on('--archive-session-found-file-path PATH')
on('--[no-]muxer')
on('-v', '--version', 'Print the version string') do
puts File.read(File.join(DATADIR, 'version'))
Expand Down
71 changes: 62 additions & 9 deletions xprof/xprof.rb.in
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ PREFIX = '@prefix@'
DATAROOTDIR = File.join(PREFIX, 'share')
DATADIR = DATAROOTDIR

# Settings used when the `--archive` option is enabled.
# Passed to `lttng enable-rotation` as `--size` and `--timer` respectively.
LTTNG_ARCHIVE_SIZE = '50M'
LTTNG_ARCHIVE_TIMER = '60s'
# Second positional argument given to `dirwatch.py`.
LTTNG_DIRWATCH_SIZE = '500' # In MiB
# Delay (in seconds, passed to `sleep`) between two checks for the file
# that signals that the archive consumer is ready.
LTTNG_DIRWATCH_LOCK_RETRY_DELAY = 0.1

class XprofExitCode
@@exit_code = 0
def self.update(status, name)
Expand Down Expand Up @@ -607,12 +612,18 @@ def lm_setup_lttng(backends)
end

end
# This is required to force the creation of a trace,
# so that dirwatch doesn't complain about an empty trace
if OPTIONS[:archive]
exec("lttng enable-rotation --session=#{lttng_session_uuid} --size=#{LTTNG_ARCHIVE_SIZE} --timer=#{LTTNG_ARCHIVE_TIMER}")
end
exec("lttng start #{lttng_session_uuid}")
end

# Tear down the LTTng session named by `lttng_session_uuid`.
# Local-master only: raises when called from any other rank.
# In archive mode a `lttng rotate` is issued before the session is destroyed.
def lm_lttng_teardown_session
  raise unless mpi_local_master?

  lttng_verbs = OPTIONS[:archive] ? %w[rotate destroy] : %w[destroy]
  lttng_verbs.each do |verb|
    exec("lttng #{verb} #{lttng_session_uuid}")
  end
end

Expand Down Expand Up @@ -645,7 +656,27 @@ def lm_babeltrace(backends)
opts << "--output #{thapi_trace_dir_tmp}"
opts << "--backends #{backends.join(',')}"
opts << '--no-discard-metadata' if type == 'aggreg' && OPTIONS.include?(:'kernel-verbose')
exec("#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}")

if OPTIONS[:archive]
read_file = File.join(lttng_trace_dir_tmp, 'bt_archive_ready')
opts << "--archive #{lttng_session_uuid} --archive-session-found-file-path=#{read_file}"
cmd = "#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}"
LOGGER.debug(cmd)
pid_bt = spawn(cmd)

cmd = "dirwatch.py --log-level=CRITICAL #{lttng_session_uuid} #{LTTNG_DIRWATCH_SIZE}"
LOGGER.debug(cmd)
pid_dirwatch = spawn(cmd)

until File.exist?(read_file)
# Ensure that dirwatch.py didn't crash, which would otherwise leave us deadlocked waiting for the ready file
Process.wait(pid_dirwatch, Process::WNOHANG)
sleep(LTTNG_DIRWATCH_LOCK_RETRY_DELAY)
end
[pid_bt, pid_dirwatch]
else
exec("#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}")
end
end

# _
Expand All @@ -663,6 +694,11 @@ def lm_move_to_shared
if OPTIONS.include?(:trace) || !OPTIONS[:analysis]
# The Apps finished, lttng finished, need to move to the shared tmp folder
FileUtils.mkdir_p(File.dirname(thapi_trace_dir_tmp))
# NOTE: I don't understand `mv`
# File.mv(a, b) will put a into b (aka a/b)
# FileUtils.rename(a,b) will move a as b, but may
# raise Invalid cross-device error.
# So we use `exec(mv -T a b)`, which has the added benefit of logging
exec("mv #{lttng_trace_dir_tmp} #{thapi_trace_dir_tmp}")
else
# `lm_babeltrace` finished, can remove `tmp` folder
Expand Down Expand Up @@ -694,15 +730,25 @@ end

# Start, Stop lttng, and do the on-node analysis
def trace_and_on_node_processing(usr_argv)
def teardown_lttng(syncd)
# Stop the LTTng session once every local rank has finished and, in archive
# mode, wait for the background processes started alongside the session.
#
# syncd - object responding to `local_barrier(name)`.
# pids  - pids to wait for; only read when OPTIONS[:archive] is set
#         (may be nil otherwise).
def teardown_lttng(syncd, pids)
  # We need to be sure that all the local ranks are finished
  # before the local master stops the lttng session
  syncd.local_barrier('waiting_for_application_ending')

  # Everything from now on, is some local-master processing
  # The `Sync_daemon` context will handle the call to the global barrier
  # for the early exiting ranks
  return unless mpi_local_master?

  # Stop Lttng session and babeltrace daemons
  lm_lttng_teardown_session
  if OPTIONS[:archive]
    LOGGER.debug("Waiting for babeltrace_thapi and dirwatch (#{pids}) to finish")
    Array(pids).each do |pid|
      # Keep the status in a local instead of re-reading the global `$?`
      _, status = Process.wait2(pid)
      # `XprofExitCode.update` takes (status, name): the name was missing.
      # NOTE(review): the first argument is kept as the integer exit status,
      # as before; confirm this is what `XprofExitCode.update` expects.
      XprofExitCode.update(status.exitstatus, "pid #{pid}")
      # Use the file's `LOGGER` constant (the `Logger` class has no `warn` class method)
      LOGGER.warn("#{pid} failed") unless status.success?
    end
  end
  # Lttng session is finished, we can kill the session daemon
  lm_lttng_kill_sessiond
end
Expand All @@ -715,24 +761,30 @@ def trace_and_on_node_processing(usr_argv)
# so they can have access to the daemon
ENV['LTTNG_HOME'] = lttng_home_dir
# Only local master spawn LTTNG daemon and start session
lm_setup_lttng(backends) if mpi_local_master?
pids = if mpi_local_master?
lm_setup_lttng(backends)
lm_babeltrace(backends) if OPTIONS[:archive]
end

syncd.local_barrier('waiting_for_lttng_setup')

# Launch User Command
begin
XprofExitCode.update(launch_usr_bin(h, usr_argv), usr_argv.join(' '))
rescue Errno::ENOENT
teardown_lttng(syncd)
teardown_lttng(syncd, pids)
raise
end

teardown_lttng(syncd)
teardown_lttng(syncd, pids)
return unless mpi_local_master?

# Preprocess trace
lm_babeltrace(backends)
lm_babeltrace(backends) unless OPTIONS[:archive]
lm_move_to_shared
end
# Global master rename the unique trace folder to a more
# human friendly name
gm_rename_folder if mpi_master?
end

Expand Down Expand Up @@ -826,6 +878,7 @@ if __FILE__ == $PROGRAM_NAME
parser.on('-b', '--backends BACKENDS', Array, "Select which and how backends' need to handled.",
'Format: backend_name[:backend_level],...',
default: ['mpi:3', 'omp:2', 'cl:1', 'ze:1', 'cuda:1', 'hip:1'])
# Opt-in flag (off by default) enabling the archive code path.
parser.on('--[no-]archive', 'Trigger for archive support', default: false)

# Analysis
parser.on('-r', '--replay [PATH]', 'Replay traces for post-mortem analysis.',
Expand Down

0 comments on commit 152b480

Please sign in to comment.