diff --git a/.github/workflows/bt_makefile.patch b/.github/workflows/bt_makefile.patch new file mode 100644 index 00000000..e31f07e3 --- /dev/null +++ b/.github/workflows/bt_makefile.patch @@ -0,0 +1,29 @@ +From 41115106fc5f1f677e42a51929771995f53eed07 Mon Sep 17 00:00:00 2001 +From: Simon Marchi +Date: Wed, 28 Aug 2024 11:22:21 -0400 +Subject: [PATCH] Use LTTNGCTL_CFLAGS + +Change-Id: I5c50340ada0e7942ac24df1f2deba1f27bf04132 +--- + src/Makefile.am | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/src/Makefile.am b/src/Makefile.am +index c6146cec5b24..32b553eb38f3 100644 +--- a/src/Makefile.am ++++ b/src/Makefile.am +@@ -767,6 +767,10 @@ plugins_ctf_babeltrace_plugin_ctf_la_SOURCES = \ + plugins/ctf/lttng-live/viewer-connection.hpp \ + plugins/ctf/plugin.cpp + ++plugins_ctf_babeltrace_plugin_ctf_la_CXXFLAGS = \ ++ $(AM_CXXFLAGS) \ ++ $(LTTNGCTL_CFLAGS) ++ + plugins_ctf_babeltrace_plugin_ctf_la_LDFLAGS = \ + $(AM_LDFLAGS) \ + $(LT_NO_UNDEFINED) \ + +base-commit: 5d357b9284e77562cbe7d5397def89f686704422 +-- +2.46.0 diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml index 0901bb69..033a263d 100644 --- a/.github/workflows/presubmit.yml +++ b/.github/workflows/presubmit.yml @@ -18,7 +18,7 @@ jobs: babeltrace2: needs: pre_job if: ${{ needs.pre_job.outputs.should_skip != 'true' }} - name: Build and cache Babeltrace2 + name: Build urcu and lttng-tools runs-on: ubuntu-24.04 steps: - uses: actions/cache@v4 @@ -28,28 +28,54 @@ jobs: with: path: ~/babeltrace2/2.0.5 key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler libglib2.0-dev - if: steps.babeltrace2.outputs.cache-hit != 'true' - - run: wget https://www.efficios.com/files/babeltrace/babeltrace2-2.0.5.tar.bz2 + - name: Load Env + run: | + echo "PKG_CONFIG_PATH=$HOME/babeltrace2/2.0.5/lib/pkgconfig:$PKG_CONFIG_PATH" >> $GITHUB_ENV 
+ - run: sudo apt update; sudo apt install -y gcc g++ libpopt-dev libnuma-dev liburcu-dev libc-dev libglib2.0-dev elfutils libelf-dev libdw-dev if: steps.babeltrace2.outputs.cache-hit != 'true' - - run: tar -xjvf babeltrace2-2.0.5.tar.bz2 + # lttng-ust + - run: git clone https://github.com/lttng/lttng-ust if: steps.babeltrace2.outputs.cache-hit != 'true' - run: | - wget https://github.com/argonne-lcf/THAPI-spack/raw/main/packages/babeltrace2/d2d2e6cc.patch - patch -p1 < d2d2e6cc.patch + # Avoid https://github.com/lttng/lttng-ust/commit/b187bcd5d99cde54dececee0e5028524d55aa314 which changes the signature of + # lttng_ust_ctl_recv_register_event used by lttng-tools anl-ms3 + git checkout 4f8afc535e77070f1ef00434674f0417c6f9ef69 + ./bootstrap + ./configure --disable-man-pages --prefix=$HOME/babeltrace2/2.0.5 + make -j$(nproc) + make install + working-directory: lttng-ust if: steps.babeltrace2.outputs.cache-hit != 'true' - working-directory: babeltrace2-2.0.5 - - run: mkdir -p babeltrace2-2.0.5/build + # lttng-tools needs lttng-ust 2.14+ + - run: git clone -b anl-ms3 git://git.efficios.com/deliverable/lttng-tools.git if: steps.babeltrace2.outputs.cache-hit != 'true' - - run: ../configure --prefix=$HOME/babeltrace2/2.0.5 + - run: | + ./bootstrap + ./configure --disable-man-pages --disable-bin-lttng-crash --prefix=$HOME/babeltrace2/2.0.5 + make -j$(nproc) + make install + # Put in path + echo "#!/usr/bin/env python"| cat - dirwatch.py > $HOME/babeltrace2/2.0.5/bin/dirwatch.py + chmod 755 $HOME/babeltrace2/2.0.5/bin/dirwatch.py + working-directory: lttng-tools if: steps.babeltrace2.outputs.cache-hit != 'true' - working-directory: babeltrace2-2.0.5/build - - run: make -j + # babeltrace + - run: git clone -b anl-ms3 git://git.efficios.com/deliverable/babeltrace.git if: steps.babeltrace2.outputs.cache-hit != 'true' - working-directory: babeltrace2-2.0.5/build - - run: make -j install + - run: | + wget 
https://raw.githubusercontent.com/argonne-lcf/THAPI/53262fcaaaf45d7d475884d7e63b69abe47e41d6/.github/workflows/str_nullptr.patch + patch -p1 < str_nullptr.patch + wget https://raw.githubusercontent.com/argonne-lcf/THAPI/4418916620496fd66cde0b3d5e241bed0a4c18a3/.github/workflows/bt_makefile.patch + patch -p1 < bt_makefile.patch + working-directory: babeltrace + if: steps.babeltrace2.outputs.cache-hit != 'true' + - run: | + ./bootstrap + ./configure --disable-man-pages --prefix=$HOME/babeltrace2/2.0.5 + make -j$(nproc) + make install + working-directory: babeltrace if: steps.babeltrace2.outputs.cache-hit != 'true' - working-directory: babeltrace2-2.0.5/build build-and-check: needs: [babeltrace2, pre_job] @@ -65,7 +91,7 @@ jobs: with: path: ~/babeltrace2/2.0.5 key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev + - run: sudo apt update; sudo apt install -y gcc g++ ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev libnuma-dev liburcu-dev - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | @@ -110,7 +136,7 @@ jobs: with: path: ~/babeltrace2/2.0.5 key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler libglib2.0-dev + - run: sudo apt update; sudo apt install -y gcc g++ ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler libglib2.0-dev libnuma-dev liburcu-dev - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | @@ -157,7 +183,7 @@ jobs: name: thapi-bin - name: Untar THAPI run: tar -xvf thapi.tar - - run: sudo apt update; sudo apt install 
-y lttng-tools liblttng-ust-dev ruby ruby-dev libprotobuf-dev libpocl2 clinfo bats coreutils libglib2.0-dev + - run: sudo apt update; sudo apt install -y ruby ruby-dev libprotobuf-dev libpocl2 clinfo bats coreutils libglib2.0-dev libnuma-dev liburcu-dev - run: sudo gem install babeltrace2 opencl_ruby_ffi - name: Load Babeltrace2 run: | @@ -181,7 +207,7 @@ jobs: with: path: ~/babeltrace2/2.0.5 key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev + - run: sudo apt update; sudo apt install -y gcc g++ ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev libnuma-dev liburcu-dev - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | @@ -219,7 +245,7 @@ jobs: with: path: ~/babeltrace2/2.0.5 key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev + - run: sudo apt update; sudo apt install -y gcc g++ ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev libnuma-dev liburcu-dev - run: sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | @@ -251,7 +277,7 @@ jobs: with: path: ~/babeltrace2/2.0.5 key: ${{ runner.os }}-build-${{ env.cache-name }} - - run: sudo apt update; sudo apt install -y gcc g++ lttng-tools liblttng-ust-dev ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev + - run: sudo apt update; sudo apt install -y gcc g++ ruby ruby-dev elfutils libelf-dev libdw-dev libprotobuf-dev protobuf-compiler valgrind libglib2.0-dev libnuma-dev liburcu-dev - run: 
sudo gem install cast-to-yaml nokogiri babeltrace2 opencl_ruby_ffi metababel - name: Load Babeltrace2 run: | diff --git a/.github/workflows/str_nullptr.patch b/.github/workflows/str_nullptr.patch new file mode 100644 index 00000000..a7e53631 --- /dev/null +++ b/.github/workflows/str_nullptr.patch @@ -0,0 +1,17 @@ +diff --git a/src/plugins/ctf/common/src/msg-iter.cpp b/src/plugins/ctf/common/src/msg-iter.cpp +index 36e2088b..eacce78a 100644 +--- a/src/plugins/ctf/common/src/msg-iter.cpp ++++ b/src/plugins/ctf/common/src/msg-iter.cpp +@@ -775,12 +775,6 @@ void MsgIter::_handleStrRawDataItem(const RawDataItem& item) + } + }); + +- if (afterNullCpIt) { +- /* Found U+0000 */ +- endIt = *afterNullCpIt; +- _mHaveNullChar = true; +- } +- + /* Append to current string buffer */ + _mStrBuf.insert(_mStrBuf.end(), item.data().begin(), endIt); + } diff --git a/integration_tests/general.bats b/integration_tests/general.bats index fbeac370..fe354f22 100644 --- a/integration_tests/general.bats +++ b/integration_tests/general.bats @@ -22,6 +22,10 @@ teardown_file() { rm out.pftrace } +@test "archive_summary" { + timeout 30s $IPROF --debug 0 --archive $THAPI_TEST_BIN +} + @test "replay_summary" { $IPROF $THAPI_TEST_BIN $IPROF -r @@ -29,10 +33,10 @@ teardown_file() { @test "no-analysis_all" { $IPROF --no-analysis -- $THAPI_TEST_BIN - $IPROF -r + $IPROF -r $IPROF -t -r | wc -l $IPROF -l -r - rm out.pftrace + rm out.pftrace } @test "trace-output_all" { diff --git a/integration_tests/light_iprof_only_sync.sh b/integration_tests/light_iprof_only_sync.sh index 2e5bd8d1..90442a05 100755 --- a/integration_tests/light_iprof_only_sync.sh +++ b/integration_tests/light_iprof_only_sync.sh @@ -15,7 +15,7 @@ RT_SIGNAL_FINISH=$((SIGRTMIN + 3)) # Signal handler for capturing signals handle_signal() { - echo "$PARENT_PID $(date) | Received signal $1 from sync_daemon" + echo "$PARENT_PID $(date) | Received signal $1 from mpi_daemon" if [ "$1" == "RT_SIGNAL_READY" ]; then SIGNAL_RECEIVED="true" fi diff 
--git a/utils/babeltrace_thapi.in b/utils/babeltrace_thapi.in index fe344861..08125a0a 100755 --- a/utils/babeltrace_thapi.in +++ b/utils/babeltrace_thapi.in @@ -130,6 +130,7 @@ def get_components(names) components_classes = { 'source.ctf.fs' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('fs'), 'source.ctf.lttng_live' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('lttng-live'), + 'source.ctf.lttng_archive' => BT2::BTPlugin.find('ctf').get_source_component_class_by_name('lttng-archive'), 'filter.utils.muxer' => BT2::BTPlugin.find('utils').get_filter_component_class_by_name('muxer'), 'sink.text.pretty' => BT2::BTPlugin.find('text').get_sink_component_class_by_name('pretty'), 'sink.ctf.fs' => BT2::BTPlugin.find('ctf').get_sink_component_class_by_name('fs'), @@ -200,6 +201,10 @@ def get_and_add_components(graph, names, l_inputs) graph.add(comp, 'source_live', params: { 'inputs' => $options[:inputs], 'session-not-found-action' => 'end' }) + when 'source.ctf.lttng_archive' + graph.add(comp, 'source_archive', + params: { 'session-name' => $options[:archive], + 'session-found-file-path' => $options[:'archive-session-found-file-path'] }) when 'source.ctf.fs' s = Find.find(*l_inputs) .reject { |path| FileTest.directory?(path) } @@ -281,6 +286,8 @@ def bt_graphs(inputs) @bt_graphs[inputs] ||= begin g_comps = [if $options[:live] 'source.ctf.lttng_live' + elsif $options[:archive] + 'source.ctf.lttng_archive' else 'source.ctf.fs' end] @@ -354,6 +361,8 @@ class BabeltraceParserThapi < OptionParserWithDefaultAndValidation 'Format: backend_name[:backend_level],...', default: ['mpi:3', 'omp:2', 'cl:1', 'ze:1', 'cuda:1', 'hip:1']) on('--debug', default: false) + on('--archive SESSION-NAME') + on('--archive-session-found-file-path PATH') on('--[no-]muxer') on('-v', '--version', 'Print the version string') do puts File.read(File.join(DATADIR, 'version')) diff --git a/xprof/xprof.rb.in b/xprof/xprof.rb.in index e2aed6d5..527d6b3c 100755 --- 
a/xprof/xprof.rb.in +++ b/xprof/xprof.rb.in @@ -13,6 +13,11 @@ PREFIX = '@prefix@' DATAROOTDIR = File.join(PREFIX, 'share') DATADIR = DATAROOTDIR +LTTNG_ARCHIVE_SIZE = '50M' +LTTNG_ARCHIVE_TIMER = '60s' +LTTNG_DIRWATCH_SIZE = '500' # In MiB +LTTNG_DIRWATCH_LOCK_RETRY_DELAY = 0.1 + $LOAD_PATH.unshift(DATADIR) if File.directory?(DATADIR) require 'open3' require 'fileutils' @@ -534,8 +539,8 @@ def lm_setup_lttng(backends) FileUtils.mkdir_p(lttng_home_dir) FileUtils.mkdir_p(lttng_trace_dir_tmp) exec('lttng-sessiond --daemonize') - exec("lttng create #{lttng_session_uuid} -o #{lttng_trace_dir_tmp}") + exec("lttng create #{lttng_session_uuid} -o #{lttng_trace_dir_tmp}") File.write(File.join(lttng_trace_dir_tmp, 'thapi_metadata.yaml'), { type: 'lttng' }.to_yaml) channel_name = 'blocking-channel' @@ -560,11 +565,16 @@ def lm_setup_lttng(backends) end end + + if OPTIONS[:archive] + exec("lttng enable-rotation --session=#{lttng_session_uuid} --size=#{LTTNG_ARCHIVE_SIZE} --timer=#{LTTNG_ARCHIVE_TIMER}") + end exec("lttng start #{lttng_session_uuid}") end def lm_lttng_teardown_session raise unless mpi_local_master? 
+ exec("lttng rotate #{lttng_session_uuid}") if OPTIONS[:archive] exec("lttng destroy #{lttng_session_uuid}") end @@ -596,7 +606,23 @@ def lm_babeltrace(backends) opts << "--output #{thapi_trace_dir_tmp}" opts << "--backends #{backends.join(',')}" opts << '--no-discard-metadata' if type == 'aggreg' && OPTIONS.include?(:'kernel-verbose') - exec("#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}") + + if OPTIONS[:archive] + read_file = File.join(lttng_trace_dir_tmp, 'bt_archive_ready') + opts << "--archive #{lttng_session_uuid} --archive-session-found-file-path=#{read_file}" + cmd = "#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}" + LOGGER.debug(cmd) + pid_bt = spawn(cmd) + + cmd = "dirwatch.py --log-level=CRITICAL #{lttng_session_uuid} #{LTTNG_DIRWATCH_SIZE}" + LOGGER.debug(cmd) + pid_laurence = spawn(cmd) + + sleep(LTTNG_DIRWATCH_LOCK_RETRY_DELAY) until File.exist?(read_file) + [pid_bt, pid_laurence] + else + exec("#{BINDIR}/babeltrace_thapi #{opts.join(' ')} -- #{lttng_trace_dir_tmp}") + end end # _ @@ -610,10 +636,14 @@ end def lm_move_to_shared raise unless mpi_local_master? - if OPTIONS.include?(:trace) || !OPTIONS[:analysis] # The Apps finished, lttng finished, need to move to the shared tmp folder FileUtils.mkdir_p(File.dirname(thapi_trace_dir_tmp)) + # NOTE: I don't understand `mv` + # FileUtils.mv(a, b) will put a into b (aka b/a) + # File.rename(a, b) will move a as b, but may + # raise an Invalid cross-device link error. + # So we use `exec(mv -T a b)`; this has the added benefit of logging exec("mv #{lttng_trace_dir_tmp} #{thapi_trace_dir_tmp}") else # `lm_babeltrace` finished, can remove `tmp` folder @@ -623,7 +653,6 @@ end def gm_rename_folder raise unless mpi_master? - # All process have put their file into `thapi_trace_dir_tmp/hostname`. # `thapi_trace_dir_tmp` is using the MPI_JOB_ID # Replace it with a better name, and update the root metadata. 
@@ -653,24 +682,35 @@ def trace_and_on_node_processing(usr_argv) # All ranks need to set the LLTTNG_HOME env # so they can have access to the daemon ENV['LTTNG_HOME'] = lttng_home_dir - # Only local master spawn LTTNG daemon and start session - lm_setup_lttng(backends) if mpi_local_master? + LOGGER.debug("LTTNG_HOME = #{ENV.fetch('LTTNG_HOME', nil)}") + + # Only the local master spawns the LTTNG daemon, starts the session, and possibly the babeltrace archive + pids = if mpi_local_master? + lm_setup_lttng(backends) + lm_babeltrace(backends) if OPTIONS[:archive] + end + # Other local node cannot start before lttng and the daemon syncd.local_barrier('waiting_for_lttng_setup') # Launch User Command launch_usr_bin(h, usr_argv) - # We need to be sure that all the local ranks are finished # before the local master stops the lttng session syncd.local_barrier('waiting_for_application_ending') + + # Everything from now on is local master processing return unless mpi_local_master? # Stop Lttng session lm_lttng_teardown_session - # Lttng session is finished, + if OPTIONS[:archive] + LOGGER.debug("Waiting for babeltrace_thapi and laurence #{pids} to finish") + pids.each { |pid| Process.wait(pid) } + end + # Lttng session and babeltrace2 are finished, # we can kill the session daemon lm_lttng_kill_sessiond # Preprocess trace - lm_babeltrace(backends) + lm_babeltrace(backends) unless OPTIONS[:archive] lm_move_to_shared end gm_rename_folder if mpi_master? 
@@ -767,6 +807,7 @@ if __FILE__ == $PROGRAM_NAME parser.on('-b', '--backends BACKENDS', Array, "Select which and how backends' need to handled.", 'Format: backend_name[:backend_level],...', default: ['mpi:3', 'omp:2', 'cl:1', 'ze:1', 'cuda:1', 'hip:1']) + parser.on('--[no-]archive', 'Trigger for archive support', default: false) # Analysis parser.on('-r', '--replay [PATH]', 'Replay traces for post-mortem analysis.', diff --git a/ze/Makefile.am b/ze/Makefile.am index 20d14200..0cbc9430 100644 --- a/ze/Makefile.am +++ b/ze/Makefile.am @@ -1,7 +1,7 @@ .DELETE_ON_ERROR: if STRICT - WERROR = -Werror + WERROR = -Werror -Wno-error=nonnull else WERROR = endif diff --git a/ze/tracer_ze_helpers.include.c b/ze/tracer_ze_helpers.include.c index 9b1e5dea..292e993f 100644 --- a/ze/tracer_ze_helpers.include.c +++ b/ze/tracer_ze_helpers.include.c @@ -1053,9 +1053,9 @@ static void readCopyE(uint32_t driverIdx, uint32_t deviceIdx, copyEngineData *co } static void thapi_sampling_energy() { - uint64_t ts_us; - uint64_t energy_uj; - uint32_t frequency; + uint64_t ts_us = 0; + uint64_t energy_uj = 0; + uint32_t frequency = 0; for (uint32_t driverIdx = 0; driverIdx < _sampling_driverCount; driverIdx++) { for (uint32_t deviceIdx = 0; deviceIdx < _sampling_deviceCount[driverIdx]; deviceIdx++) { if (tracepoint_enabled(lttng_ust_ze_sampling, gpu_frequency)){