diff --git a/Makefile.am b/Makefile.am index 577082fb..bf43ba89 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,4 @@ ACLOCAL_AMFLAGS = -I m4 -SUBDIRS = utils opencl ze xprof cuda omp hip +SUBDIRS = utils xprof sampling opencl ze cuda omp hip EXTRA_DIST = autogen.sh README.md diff --git a/configure.ac b/configure.ac index 2a6c46ff..8977fa0c 100644 --- a/configure.ac +++ b/configure.ac @@ -98,6 +98,7 @@ AC_CHECK_FUNCS([atexit clock_gettime ftruncate memmove memset strdup strstr strt AC_CONFIG_FILES([Makefile utils/Makefile + sampling/Makefile xprof/xprof.sh.erb opencl/Makefile ze/Makefile @@ -113,6 +114,7 @@ AC_CONFIG_FILES([ze/test_wrapper.sh], [chmod +x ze/test_wrapper.sh]) AC_CONFIG_FILES([cuda/tracer_cuda.sh], [chmod +x cuda/tracer_cuda.sh]) AC_CONFIG_FILES([cuda/test_wrapper.sh], [chmod +x cuda/test_wrapper.sh]) AC_CONFIG_FILES([xprof/test_wrapper.sh], [chmod +x xprof/test_wrapper.sh]) +AC_CONFIG_FILES([utils/babeltrace_energy], [chmod +x utils/babeltrace_energy]) AC_CONFIG_FILES([utils/babeltrace_thapi], [chmod +x utils/babeltrace_thapi]) AC_CONFIG_FILES([omp/tracer_omp.sh], [chmod +x omp/tracer_omp.sh]) AC_CONFIG_FILES([hip/tracer_hip.sh], [chmod +x hip/tracer_hip.sh]) diff --git a/cuda/Makefile.am b/cuda/Makefile.am index dbf5ee0a..3bf8a07a 100644 --- a/cuda/Makefile.am +++ b/cuda/Makefile.am @@ -269,9 +269,9 @@ nodist_libTracerCUDA_la_SOURCES = \ $(CUDA_PROBES_INCL) \ tracer_cuda.c -libTracerCUDA_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(srcdir)/include -I../utils -I./ +libTracerCUDA_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(top_srcdir)/sampling -I$(srcdir)/include -I../utils -I./ libTracerCUDA_la_CFLAGS = -Wall -Wextra $(WERROR) $(LIBFFI_CFLAGS) $(LTTNG_UST_CFLAGS) -libTracerCUDA_la_LDFLAGS = $(LTTNG_UST_LIBS) -ldl -lpthread $(LIBFFI_LIBS) +libTracerCUDA_la_LDFLAGS = $(LTTNG_UST_LIBS) -ldl -lpthread $(LIBFFI_LIBS) ../sampling/libThapiSampling.la libTracerCUDA_la_LDFLAGS += -version-info 1:0:0 
libTracerCUDA_la_LIBADD = libcudatracepoints.la @@ -279,9 +279,9 @@ nodist_libTracerCUDART_la_SOURCES = \ $(CUDART_PROBES_INCL) \ tracer_cudart.c -libTracerCUDART_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(srcdir)/include -I../utils -I./ +libTracerCUDART_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(top_srcdir)/sampling -I$(srcdir)/include -I../utils -I./ libTracerCUDART_la_CFLAGS = -Wall -Wextra $(WERROR) $(LIBFFI_CFLAGS) $(LTTNG_UST_CFLAGS) -libTracerCUDART_la_LDFLAGS = $(LTTNG_UST_LIBS) -ldl -lpthread $(LIBFFI_LIBS) +libTracerCUDART_la_LDFLAGS = $(LTTNG_UST_LIBS) -ldl -lpthread $(LIBFFI_LIBS) ../sampling/libThapiSampling.la libTracerCUDART_la_LDFLAGS += -version-number 12:1:55 -Wl,--version-script,tracer_cudart.map libTracerCUDART_la_LIBADD = libcudarttracepoints.la diff --git a/cuda/tracer_cuda.sh.in b/cuda/tracer_cuda.sh.in index 4d3cd708..c0390577 100644 --- a/cuda/tracer_cuda.sh.in +++ b/cuda/tracer_cuda.sh.in @@ -43,6 +43,7 @@ display_help() { echo " -e, --exports Trace export functions" echo " -v, --visualize Visualize trace on thefly" echo " --properties Dump devices infos" + echo " --sample Sample performance counters" exit 1 } @@ -61,6 +62,7 @@ while true; do -e | --exports ) shift; exports=1;; -v | --visualize ) shift; lttng_view=1;; --properties ) shift; properties=1;; + --sample ) shift; sample=1;; -- ) shift; break ;; * ) break ;; esac @@ -101,6 +103,12 @@ if [ ! -z "$properties" ] then lttng enable-event --channel=blocking-channel --userspace lttng_ust_cuda_properties:* fi +if [ ! 
-z "$sample" ] +then + export LTTNG_UST_SAMPLING=1 + lttng enable-channel --userspace nonblocking-channel + lttng enable-event --channel=nonblocking-channel --userspace lttng_ust_sampling:* +fi if [ -z "$LTTNG_UST_CUDA_LIBCUDA" ] then LTTNG_UST_CUDA_LIBCUDA=$(whichlib64_head libcuda.so) diff --git a/cuda/tracer_cuda_helpers.include.c b/cuda/tracer_cuda_helpers.include.c index 1e4bf9db..167a3fb4 100644 --- a/cuda/tracer_cuda_helpers.include.c +++ b/cuda/tracer_cuda_helpers.include.c @@ -1,3 +1,5 @@ +#include "thapi_sampling.h" + //pthread_mutex_t cuda_closures_mutex = PTHREAD_MUTEX_INITIALIZER; // //struct cuda_closure { @@ -485,6 +487,8 @@ static void _load_tracer(void) { void *handle = NULL; int verbose = 0; + thapi_sampling_init(); + s = getenv("LTTNG_UST_CUDA_LIBCUDA"); if (s) handle = dlopen(s, RTLD_LAZY | RTLD_LOCAL | RTLD_DEEPBIND); diff --git a/cuda/tracer_cudart_helpers.include.c b/cuda/tracer_cudart_helpers.include.c index 181c771c..847510da 100644 --- a/cuda/tracer_cudart_helpers.include.c +++ b/cuda/tracer_cudart_helpers.include.c @@ -1,3 +1,5 @@ +#include "thapi_sampling.h" + static pthread_once_t _init = PTHREAD_ONCE_INIT; static __thread volatile int in_init = 0; static volatile int _initialized = 0; @@ -7,6 +9,8 @@ static void _load_tracer(void) { void *handle = NULL; int verbose = 0; + thapi_sampling_init(); + s = getenv("LTTNG_UST_CUDART_LIBCUDART"); if (s) handle = dlopen(s, RTLD_LAZY | RTLD_LOCAL); diff --git a/hip/Makefile.am b/hip/Makefile.am index 8713ea10..5df50ddb 100644 --- a/hip/Makefile.am +++ b/hip/Makefile.am @@ -277,9 +277,9 @@ nodist_libTracerHIP_la_SOURCES = \ $(HIP_PROBES_INCL) \ tracer_hip.c -libTracerHIP_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(srcdir)/include -I./utils -I./ +libTracerHIP_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(top_srcdir)/sampling -I$(srcdir)/include -I./utils -I./ libTracerHIP_la_CFLAGS = -Wall -Wextra $(WERROR) $(LIBFFI_CFLAGS) $(LTTNG_UST_CFLAGS) 
-libTracerHIP_la_LDFLAGS = $(LTTNG_UST_LIBS) -ldl -lpthread $(LIBFFI_LIBS) +libTracerHIP_la_LDFLAGS = $(LTTNG_UST_LIBS) -ldl -lpthread $(LIBFFI_LIBS) ../sampling/libThapiSampling.la libTracerHIP_la_LDFLAGS += -Wl,--version-script,$(srcdir)/hip.map -version-number 5:4:50400 libTracerHIP_la_DEPENDS = $(srcdir)/hip.map libTracerHIP_la_LIBADD = libhiptracepoints.la diff --git a/omp/Makefile.am b/omp/Makefile.am index 609fd0d1..f527efcd 100644 --- a/omp/Makefile.am +++ b/omp/Makefile.am @@ -129,9 +129,9 @@ nodist_libTracerOMPT_la_SOURCES = \ $(OMP_PROBES_INCL) \ tracer_ompt.c -libTracerOMPT_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I./modified_include -I../utils -I./ +libTracerOMPT_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(top_srcdir)/sampling -I./modified_include -I../utils -I./ libTracerOMPT_la_CFLAGS = -Wall -Wextra $(WERROR) $(LTTNG_UST_CFLAGS) -libTracerOMPT_la_LDFLAGS = $(LTTNG_UST_LIBS) -avoid-version -module +libTracerOMPT_la_LDFLAGS = $(LTTNG_UST_LIBS) -avoid-version -module ../sampling/libThapiSampling.la libTracerOMPT_la_LIBADD = libompttracepoints.la install-exec-hook: diff --git a/omp/tracer_omp.sh.in b/omp/tracer_omp.sh.in index 58c0e01d..7260d45f 100644 --- a/omp/tracer_omp.sh.in +++ b/omp/tracer_omp.sh.in @@ -12,6 +12,7 @@ display_help() { echo " --help Show this screen" echo " --version Print the version string" echo " --disable-intel-extensions Disable Intel extensions" + echo " --sample Sample performance counters" exit 1 } @@ -26,6 +27,7 @@ while true; do --help ) display_help; exit;; --version ) display_version; exit;; --disable-intel-extensions) intel_extensions=false shift;; + --sample ) shift; sample=1;; -- ) shift; break ;; * ) break ;; esac @@ -53,6 +55,12 @@ export LTTNG_UST_ALLOW_BLOCKING=1 if [ "$intel_extensions" = true ] ; then export LTTNG_UST_OMP_INTEL=1 fi +if [ ! 
-z "$sample" ] +then + export LTTNG_UST_SAMPLING=1 + lttng enable-channel --userspace nonblocking-channel + lttng enable-event --channel=nonblocking-channel --userspace lttng_ust_sampling:* +fi lttng start diff --git a/omp/tracer_ompt_helpers.include.c.erb b/omp/tracer_ompt_helpers.include.c.erb index df6e02b2..b1a03d9d 100644 --- a/omp/tracer_ompt_helpers.include.c.erb +++ b/omp/tracer_ompt_helpers.include.c.erb @@ -1,3 +1,5 @@ +#include "thapi_sampling.h" + <% require "yaml" %> #define _OMPT_SET_CALLBACK(value, name) \ @@ -33,6 +35,8 @@ static int _ompt_initialize(ompt_function_lookup_t lookup, int do_callbacks_intel = 0; int verbose = 0; + thapi_sampling_init(); + if (getenv("LTTNG_UST_OMP_INTEL")) do_callbacks_intel = 1; if (getenv("LTTNG_UST_OMP_VERBOSE")) diff --git a/opencl/Makefile.am b/opencl/Makefile.am index c545c170..ae235e8a 100644 --- a/opencl/Makefile.am +++ b/opencl/Makefile.am @@ -157,9 +157,9 @@ nodist_libTracerOpenCL_la_SOURCES = \ tracer_opencl.h \ tracer_opencl.c -libTracerOpenCL_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(srcdir)/include -I../utils -I./ +libTracerOpenCL_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(top_srcdir)/sampling -I$(srcdir)/include -I../utils -I./ libTracerOpenCL_la_CFLAGS = -Wall -Wextra -Wno-unused-parameter $(WERROR) $(LIBFFI_CFLAGS) $(LTTNG_UST_CFLAGS) -libTracerOpenCL_la_LDFLAGS = $(LTTNG_UST_LIBS) -ldl -lpthread $(LIBFFI_LIBS) +libTracerOpenCL_la_LDFLAGS = $(LTTNG_UST_LIBS) -ldl -lpthread $(LIBFFI_LIBS) ../sampling/libThapiSampling.la libTracerOpenCL_la_LDFLAGS += -Wl,--version-script,$(srcdir)/tracer_opencl.map -version-info 1:0:0 libTracerOpenCL_la_DEPENDS = $(srcdir)/tracer_opencl.map libTracerOpenCL_la_LIBADD = libtracepoints.la diff --git a/opencl/tracer_opencl.sh.in b/opencl/tracer_opencl.sh.in index eceb34a1..67a866e5 100644 --- a/opencl/tracer_opencl.sh.in +++ b/opencl/tracer_opencl.sh.in @@ -49,6 +49,7 @@ display_help() { echo " -e, --iteration-end VALUE 
Dump inputs and outputs for kernels until enqueue counter VALUE" echo " -v, --visualize Visualize trace on thefly" echo " --devices Dump devices information" + echo " --sample Sample performance counters" exit 1 } @@ -73,6 +74,7 @@ while true; do -e | --iteration-end ) shift; export LTTNG_UST_OPENCL_DUMP=1; export LTTNG_UST_OPENCL_DUMP_END=$1; shift ;; -v | --visualize) shift; lttng_view=1;; --devices ) shift; devices=1;; + --sample ) shift; sample=1;; -- ) shift; break ;; * ) break ;; esac @@ -117,6 +119,12 @@ if [ ! -z "$build" ] then lttng enable-event --channel=blocking-channel --userspace lttng_ust_opencl_build:* fi +if [ ! -z "$sample" ] +then + export LTTNG_UST_SAMPLING=1 + lttng enable-channel --userspace nonblocking-channel + lttng enable-event --channel=nonblocking-channel --userspace lttng_ust_sampling:* +fi if [ ! -z "$LTTNG_UST_OPENCL_DUMP" ] then lttng enable-event --channel=blocking-channel --userspace lttng_ust_opencl_dump:* diff --git a/opencl/tracer_opencl_helpers.include.c b/opencl/tracer_opencl_helpers.include.c index ad567538..ee9c715d 100644 --- a/opencl/tracer_opencl_helpers.include.c +++ b/opencl/tracer_opencl_helpers.include.c @@ -1,3 +1,5 @@ +#include "thapi_sampling.h" + void CL_CALLBACK event_notify (cl_event event, cl_int event_command_exec_status, void *user_data) { (void)user_data; if (tracepoint_enabled(lttng_ust_opencl_profiling, event_profiling_results)) { @@ -1180,6 +1182,8 @@ static void _load_tracer(void) { void * handle = NULL; int verbose = 0; + thapi_sampling_init(); + s = getenv("LTTNG_UST_OPENCL_LIBOPENCL"); if (s) handle = dlopen(s, RTLD_LAZY | RTLD_LOCAL | RTLD_DEEPBIND); diff --git a/sampling/Makefile.am b/sampling/Makefile.am new file mode 100644 index 00000000..00efa4d6 --- /dev/null +++ b/sampling/Makefile.am @@ -0,0 +1,59 @@ +if STRICT + WERROR = -Werror +else + WERROR = +endif +LTTNG_FLAGS = -fPIC -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Wno-sign-compare $(WERROR) -I$(top_srcdir)/utils 
-I$(top_srcdir)/utils/include -I$(srcdir)/include -I../utils -I./ + + +TRACEPOINT_GEN = \ + $(srcdir)/sampling_events.yaml + +SAMPLING_STATIC_PROBES = \ + sampling + +SAMPLING_STATIC_PROBES_TP = $(SAMPLING_STATIC_PROBES:=.tp) + +SAMPLING_STATIC_PROBES_INCL = $(SAMPLING_STATIC_PROBES:=.h) + +SAMPLING_STATIC_PROBES_SRC = $(SAMPLING_STATIC_PROBES:=.c) + +$(SAMPLING_STATIC_PROBES_TP): %.tp: $(srcdir)/gen_sampling_custom_probes.rb $(TRACEPOINT_GEN) + SRC_DIR=$(srcdir) $(RUBY) $< lttng_ust_$* > $@ + +%.h %.c: %.tp + $(LTTNG_GEN_TP) $< -o $*.c -o $*.h + +noinst_LTLIBRARIES = libtracepoints.la + +nodist_libtracepoints_la_SOURCES = \ + $(SAMPLING_STATIC_PROBES_INCL) \ + $(SAMPLING_STATIC_PROBES_SRC) + +libtracepoints_la_CFLAGS = $(LTTNG_FLAGS) $(LTTNG_UST_CFLAGS) +libtracepoints_la_LDFLAGS = $(LTTNG_UST_LIBS) + +EXTRA_DIST = \ + sampling_events.yaml \ + gen_sampling_custom_probes.rb + +CLEANFILES = \ + $(SAMPLING_STATIC_PROBES_INCL) \ + $(SAMPLING_STATIC_PROBES_TP) \ + $(SAMPLING_STATIC_PROBES_SRC) + +BUILT_SOURCES = \ + $(SAMPLING_STATIC_PROBES_INCL) + +nodist_libThapiSampling_la_SOURCES = \ + $(SAMPLING_STATIC_PROBES_INCL) + +libThapiSampling_la_SOURCES = \ + thapi_sampling.h \ + thapi_sampling.c + +libThapiSampling_la_CFLAGS = -Wall -Wextra -Wno-unused-parameter $(WERROR) -I$(top_srcdir)/utils/include +libThapiSampling_la_LDFLAGS = -lpthread -version-info 1:0:0 +libThapiSampling_la_LIBADD = libtracepoints.la + +lib_LTLIBRARIES = libThapiSampling.la diff --git a/sampling/gen_sampling_custom_probes.rb b/sampling/gen_sampling_custom_probes.rb new file mode 100644 index 00000000..c5fc181e --- /dev/null +++ b/sampling/gen_sampling_custom_probes.rb @@ -0,0 +1,32 @@ +require 'yaml' +require_relative '../utils/LTTng' + +if ENV["SRC_DIR"] + SRC_DIR = ENV["SRC_DIR"] +else + SRC_DIR = "." +end + +namespace = ARGV[0] + +raise "No namespace provided!" unless namespace + +h = YAML::load_file(File.join(SRC_DIR,"sampling_events.yaml"))[namespace] + +raise "Invalid namespace!" 
unless h
+
+puts <
+#include
+#include
+#include
+#include
+#include
+#include
+#include "thapi_sampling.h"
+#include "sampling.h"
+#include "utarray.h"
+
+/* One registered sampling callback and its schedule. */
+struct sampling_entry {
+  void (*pfn)(void);        /* callback run by the sampling thread */
+  struct timespec interval; /* period between two runs */
+  struct timespec next;     /* absolute CLOCK_REALTIME time of the next run */
+};
+
+
+/* Poll period used by the sampling thread while no callback is registered. */
+static const struct timespec thapi_sampling_idle = { .tv_sec = 0, .tv_nsec = 10000000 };
+
+/* Guards thapi_sampling_events and the entries it points to. */
+static pthread_mutex_t thapi_sampling_mutex = PTHREAD_MUTEX_INITIALIZER;
+/* Array of `struct sampling_entry *`, re-sorted by ascending `next` before the
+ * mutex is released. */
+static UT_array *thapi_sampling_events = NULL;
+
+static pthread_once_t thapi_init_once = PTHREAD_ONCE_INIT;
+/* Set to 1 by the cleanup handler to ask the sampling thread to exit. */
+static volatile int thapi_sampling_finished = 0;
+static pthread_t thapi_sampling_thread;
+
+/* atexit handler: asks the sampling thread to stop, joins it, then releases
+ * every entry and the array itself. */
+static void thapi_sampling_cleanup() {
+  thapi_sampling_finished = 1;
+  pthread_join(thapi_sampling_thread, NULL);
+  pthread_mutex_lock(&thapi_sampling_mutex);
+  struct sampling_entry **entry = NULL;
+  while ((entry = (struct sampling_entry **)utarray_next(thapi_sampling_events, entry)))
+    free(*entry);
+  utarray_free(thapi_sampling_events);
+  /* Reset the pointer so that a late thapi_register_sampling() call is
+   * rejected instead of pushing into freed memory. */
+  thapi_sampling_events = NULL;
+  pthread_mutex_unlock(&thapi_sampling_mutex);
+}
+
+/* Three-way comparison of two timespecs: -1 if t1 < t2, 1 if t1 > t2, else 0. */
+static inline int time_cmp(const struct timespec * t1, const struct timespec * t2) {
+  if (t1->tv_sec < t2->tv_sec)
+    return -1;
+  if (t1->tv_sec > t2->tv_sec)
+    return 1;
+  if (t1->tv_nsec < t2->tv_nsec)
+    return -1;
+  if (t1->tv_nsec > t2->tv_nsec)
+    return 1;
+  return 0;
+}
+
+/* Orders two `struct sampling_entry *` array slots by their `next` time. */
+static inline int sampling_entry_cmp(const struct sampling_entry **e1, const struct sampling_entry **e2) {
+  return time_cmp(&(*e1)->next, &(*e2)->next);
+}
+
+/* void-pointer wrapper around sampling_entry_cmp, as passed to utarray_sort. */
+static inline int sampling_entry_cmpw(const void * t1, const void * t2) {
+  return sampling_entry_cmp((const struct sampling_entry **)t1, (const struct sampling_entry **)t2);
+}
+
+/* dest = t + d, carrying tv_nsec overflow into tv_sec. dest may alias t. */
+static inline void time_add(struct timespec *dest, const struct timespec *t, const struct timespec *d) {
+  dest->tv_nsec = t->tv_nsec + d->tv_nsec;
+  dest->tv_sec = t->tv_sec + d->tv_sec;
+  while (dest->tv_nsec > 999999999) {
+    dest->tv_sec += 1;
+    dest->tv_nsec -= 1000000000;
+  }
+}
+
+/* Body of the sampling thread. Each pass runs every callback whose `next`
+ * time is already past, reschedules it, re-sorts the array and sleeps until
+ * the earliest `next`. Callbacks run with thapi_sampling_mutex held, so they
+ * must not call thapi_register_sampling(). */
+void * thapi_sampling_loop(void *args) {
+  (void)args;
+  while(!thapi_sampling_finished) {
+    struct timespec now;
+    struct timespec wakeup;
+    struct sampling_entry **entry = NULL;
+
+    pthread_mutex_lock(&thapi_sampling_mutex);
+    clock_gettime(CLOCK_REALTIME, &now);
+    while ((entry = (struct sampling_entry **)utarray_next(thapi_sampling_events, entry)) &&
+           time_cmp(&(*entry)->next, &now) < 0) {
+      (*entry)->pfn();
+      time_add(&(*entry)->next, &(*entry)->next, &(*entry)->interval);
+      /* If one interval is not enough to catch up, restart the schedule from now. */
+      if(time_cmp(&(*entry)->next, &now) < 0)
+        time_add(&(*entry)->next, &now, &(*entry)->interval);
+    }
+    utarray_sort(thapi_sampling_events, sampling_entry_cmpw);
+    entry = (struct sampling_entry **)utarray_front(thapi_sampling_events);
+    /* Copy the deadline while the lock is held: once it is released a
+     * concurrent thapi_register_sampling() may grow or re-sort the array, so
+     * `entry` must not be dereferenced afterwards. With nothing registered,
+     * poll again after a short idle period instead of spinning on the mutex. */
+    if (entry)
+      wakeup = (*entry)->next;
+    else
+      time_add(&wakeup, &now, &thapi_sampling_idle);
+    pthread_mutex_unlock(&thapi_sampling_mutex);
+    while (clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &wakeup, NULL) && !thapi_sampling_finished)
+      ;
+  }
+  return NULL;
+}
+
+/* Sampling callback emitting the lttng_ust_sampling:heartbeat tracepoint. */
+static void thapi_sampling_heartbeat() {
+  do_tracepoint(lttng_ust_sampling, heartbeat, 16);
+}
+
+/* Sampling callback emitting the lttng_ust_sampling:heartbeat2 tracepoint. */
+static void thapi_sampling_heartbeat2() {
+  do_tracepoint(lttng_ust_sampling, heartbeat2);
+}
+
+/* One-time setup run through pthread_once: creates the entry array, registers
+ * the heartbeat callbacks selected by LTTNG_UST_SAMPLING_HEARTBEAT (100 ms) and
+ * LTTNG_UST_SAMPLING_HEARTBEAT2 (30 ms), then starts the sampling thread and,
+ * if that succeeded, installs the atexit cleanup handler. */
+void thapi_sampling_init_once() {
+  struct timespec interval;
+  utarray_new(thapi_sampling_events, &ut_ptr_icd);
+  if (!thapi_sampling_events)
+    return;
+  if (getenv("LTTNG_UST_SAMPLING_HEARTBEAT")) {
+    interval.tv_sec = 0;
+    interval.tv_nsec = 100000000;
+    thapi_register_sampling(&thapi_sampling_heartbeat, &interval);
+  }
+  if (getenv("LTTNG_UST_SAMPLING_HEARTBEAT2")) {
+    interval.tv_sec = 0;
+    interval.tv_nsec = 30000000;
+    thapi_register_sampling(&thapi_sampling_heartbeat2, &interval);
+  }
+  if (!pthread_create(&thapi_sampling_thread, NULL, &thapi_sampling_loop, NULL))
+    atexit(&thapi_sampling_cleanup);
+}
+
+/* Starts sampling (at most once per process) when LTTNG_UST_SAMPLING is set
+ * in the environment. Always returns 1. */
+int thapi_sampling_init() {
+  if (getenv("LTTNG_UST_SAMPLING"))
+    pthread_once(&thapi_init_once, &thapi_sampling_init_once);
+  return 1;
+}
+
+/* Schedules `pfn` to be called by the sampling thread every `interval`, the
+ * first run being one interval from now. Silently does nothing when an
+ * argument is NULL, the clock cannot be read, sampling was never initialised
+ * (or was already cleaned up), or the entry cannot be allocated. */
+void thapi_register_sampling(void (*pfn)(void), struct timespec *interval) {
+  struct sampling_entry *entry = NULL;
+  struct timespec now, next;
+  /* A NULL callback would crash the sampling thread later; reject it here. */
+  if (!pfn || !interval)
+    return;
+  if(clock_gettime(CLOCK_REALTIME, &now))
+    return;
+  time_add(&next, &now, interval);
+
+  pthread_mutex_lock(&thapi_sampling_mutex);
+  if
(!thapi_sampling_events)
+    goto end;
+  entry = (struct sampling_entry *)malloc(sizeof(struct sampling_entry));
+  if (!entry)
+    goto end;
+  entry->pfn = pfn;
+  entry->interval = *interval;
+  entry->next = next;
+  utarray_push_back(thapi_sampling_events, &entry);
+  utarray_sort(thapi_sampling_events, sampling_entry_cmpw);
+end:
+  pthread_mutex_unlock(&thapi_sampling_mutex);
+}
diff --git a/sampling/thapi_sampling.h b/sampling/thapi_sampling.h
new file mode 100644
index 00000000..2dce7a46
--- /dev/null
+++ b/sampling/thapi_sampling.h
@@ -0,0 +1,5 @@
+#include <time.h>
+
+extern int thapi_sampling_init();
+
+extern void thapi_register_sampling(void (*pfn)(void), struct timespec *interval);
diff --git a/utils/Makefile.am b/utils/Makefile.am
index 00dbba4d..53ba6a9d 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -42,7 +42,9 @@ thapi_metadata_LDFLAGS = $(LTTNG_UST_LIBS)
 
 bin_PROGRAMS = thapi_metadata
 
-bin_SCRIPTS = babeltrace_thapi
+bin_SCRIPTS = \
+    babeltrace_thapi \
+    babeltrace_energy
 
 .PHONY: version
 
diff --git a/utils/babeltrace_energy.in b/utils/babeltrace_energy.in
new file mode 100755
index 00000000..8a7cabc4
--- /dev/null
+++ b/utils/babeltrace_energy.in
@@ -0,0 +1,134 @@
+#!/usr/bin/env ruby
+# TODO: merge into babeltrace_thapi at some point
+DATADIR = File.join("@prefix@", "share")
+$:.unshift(DATADIR) if File::directory?(DATADIR)
+require 'optparse'
+require 'babeltrace2'
+require 'find'
+require 'yaml'
+require 'pp'
+
+# Event name => lambda called with each event carrying that name.
+$event_lambdas = {}
+
+$options = {
+  context: false,
+  restrict: false,
+  live: false
+}
+
+OptionParser.new do |opts|
+  opts.banner = "Usage: babeltrace_energy [OPTIONS] target_trace_directory"
+
+  opts.on("-c", "--[no-]context", "Add context information") do |context|
+    $options[:context] = context
+  end
+
+  opts.on("-r", "--[no-]restrict", "Restrict output to recognized events") do |restrict|
+    $options[:restrict] = restrict
+  end
+
+  opts.on("-h", "--help", "Prints this help") do
+    puts opts
+    exit
+  end
+
+  opts.on("--live", "Enable live display of the trace") do
+    $options[:live] = true
+  end
+
+  opts.on("-v", "--version", "Print the version string") do
+    puts File.read(File.join(DATADIR, "version"))
+    exit
+  end
+
+end.parse!
+
+$restrict = $options[:restrict]
+$context = $options[:context]
+$live = $options[:live]
+
+ctf_fs = BT2::BTPlugin.find("ctf").get_source_component_class_by_name("fs")
+ctf_lttng_live = BT2::BTPlugin.find("ctf").get_source_component_class_by_name("lttng-live")
+utils_muxer = BT2::BTPlugin.find("utils").get_filter_component_class_by_name("muxer")
+text_pretty = BT2::BTPlugin.find("text").get_sink_component_class_by_name("pretty")
+
+# Offline mode: walk the given paths and keep every directory holding a
+# "metadata" file for which the ctf fs support-info query weighs above 0.5.
+# Live mode: the arguments are handed to the lttng-live source unchanged.
+if !$live
+  trace_locations = Find.find(*ARGV).reject { |path|
+    FileTest.directory?(path)
+  }.select { |path|
+    File.basename(path) == "metadata"
+  }.collect { |path|
+    File.dirname(path)
+  }.select { |path|
+    qe = BT2::BTQueryExecutor.new( component_class: ctf_fs, object_name: "babeltrace.support-info", params: { "input" => path, "type" => "directory" } )
+    qe.query.value["weight"] > 0.5
+  }
+else
+  trace_locations = ARGV
+end
+raise "Could not find lttng trace" if trace_locations.size == 0
+# Last [energy, timestamp] sample seen for each [device, domain] pair.
+$energies={}
+# Prints "<device>:<domain>: <energy delta / timestamp delta>" for every sample
+# after the first one of a given [device, domain] pair.
+$event_lambdas["lttng_ust_ze_sampling:gpu_energy"] = lambda { |event|
+  defi=event.payload_field.value
+  device = defi['hDevice']
+  domain = defi['domain']
+  energy = defi['energy']
+  timestamp = defi['timestamp']
+  key = device, domain
+  previous = $energies[key]
+  if previous
+    p_energy, p_timestamp = previous
+    elapsed = timestamp - p_timestamp
+    # Two samples carrying the same timestamp give no usable rate; skip them
+    # rather than printing Infinity/NaN.
+    puts "#{key[0]}:#{key[1]}: #{(energy - p_energy).to_f/elapsed}" unless elapsed == 0
+  end
+  $energies[key] = [energy, timestamp]
+}
+
+# Sink callback: hands each event message to the lambda registered under its name, if any.
+consume = lambda { |iterator, _|
+  mess = iterator.next_messages
+  mess.each { |m|
+    if m.type == :BT_MESSAGE_TYPE_EVENT
+
+      e = m.event
+      #puts e.name
+      l = $event_lambdas[e.name]
+      if l
+        l.call e
+      end
+    end
+  }
+}
+
+graph = BT2::BTGraph.new
+if !$live
+  comps = trace_locations.each_with_index.collect { |trace_location, i| graph.add_component(ctf_fs, "trace_#{i}", params: {"inputs" => [ trace_location ] }) }
+else
+  comps = trace_locations.each_with_index.collect { |trace_location, i| graph.add_component(ctf_lttng_live, "trace_#{i}", params: {"inputs" => [ trace_location ], "session-not-found-action" => "end" }) }
+end
+comp2 = graph.add_component(utils_muxer, "mux")
+comp3 = graph.add_simple_sink("babeltrace_thapi", consume)
+# Wire every output port of every source to its own muxer input port.
+i = 0
+comps.each { |comp|
+  ops = comp.output_ports
+  ops.each { |op|
+    ip = comp2.input_port(i)
+    i += 1
+    graph.connect_ports(op, ip)
+  }
+}
+
+op = comp2.output_port(0)
+ip = comp3.input_port(0)
+graph.connect_ports(op, ip)
+graph.run
diff --git a/utils/babeltrace_thapi.in b/utils/babeltrace_thapi.in
index 52ba9d22..794978e4 100755
--- a/utils/babeltrace_thapi.in
+++ b/utils/babeltrace_thapi.in
@@ -170,9 +170,8 @@ def get_components(names)
         str = Time.at(0, m.get_default_clock_snapshot.ns_from_origin, :nsec).strftime('%H:%M:%S.%9L').to_s
         if $options[:context]
           str << " - #{e.stream.trace.get_environment_entry_value_by_name('hostname')}"
-          str << ' - ' << e.get_common_context_field.value.collect do |k, v|
-            "#{k}: #{v}"
-          end.join(', ')
+          common_context_field = e.get_common_context_field
+          str << " - " << common_context_field.value.collect { |k, v| "#{k}: #{v}" }.join(", ") if common_context_field
         end
         str << " - #{e.name}: "
         str << (l ?
l.call(e.payload_field.value) : e.payload_field.to_s)
diff --git a/utils/xprof_utils.cpp b/utils/xprof_utils.cpp
index 07f141fc..5527e3eb 100644
--- a/utils/xprof_utils.cpp
+++ b/utils/xprof_utils.cpp
@@ -20,6 +20,75 @@ thread_id_t borrow_thread_id(const bt_event *event){
   return bt_field_integer_unsigned_get_value(field);
 }
 
+
+// Shared builder for the per-device counter events (power, frequency): fills the
+// five common-context members (hostname, pid, tid, ts, backend) and the three
+// payload members (device, domain, value) of a freshly created event message.
+static bt_message* create_counter_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id,
+    const uintptr_t hDevice, const uint32_t domain, const uint64_t value, const uint64_t ts,
+    bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend) {
+
+  /* Message creation */
+  bt_message *message = bt_message_event_create(
+      message_iterator, event_class, stream);
+
+  /* event */
+  bt_event *downstream_event = bt_message_event_borrow_event(message);
+
+  /* Common context */
+  bt_field *context_field = bt_event_borrow_common_context_field(downstream_event);
+
+  // Hostname
+  bt_field *hostname_msg_field = bt_field_structure_borrow_member_field_by_index(context_field,0);
+  bt_field_string_set_value(hostname_msg_field, hostname);
+  // pid
+  bt_field *vpid_field = bt_field_structure_borrow_member_field_by_index(context_field,1);
+  bt_field_integer_signed_set_value(vpid_field, process_id);
+  // vid
+  bt_field *vtid_field = bt_field_structure_borrow_member_field_by_index(context_field,2);
+  bt_field_integer_signed_set_value(vtid_field, thread_id);
+  // ts
+  bt_field *ts_field = bt_field_structure_borrow_member_field_by_index(context_field,3);
+  bt_field_integer_signed_set_value(ts_field, ts);
+  // backend
+  bt_field *backend_field = bt_field_structure_borrow_member_field_by_index(context_field,4);
+  bt_field_integer_signed_set_value(backend_field, backend);
+
+  /* Payload */
+  bt_field *payload_field = bt_event_borrow_payload_field(downstream_event);
+
+  // did
+  bt_field *device_id_field = bt_field_structure_borrow_member_field_by_index(payload_field,0);
+  bt_field_integer_unsigned_set_value(device_id_field, hDevice);
+
+  // domain
+  bt_field *domain_field = bt_field_structure_borrow_member_field_by_index(payload_field,1);
+  bt_field_integer_unsigned_set_value(domain_field, domain);
+
+  // value (power or frequency, depending on the caller)
+  bt_field *value_field = bt_field_structure_borrow_member_field_by_index(payload_field,2);
+  bt_field_integer_unsigned_set_value(value_field, value);
+
+  return message;
+}
+
+// Builds a power sample message: `power` is stored as the third payload member.
+bt_message* create_power_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id,
+    const uintptr_t hDevice, const uint32_t domain, const uint64_t power, const uint64_t ts,
+    bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend) {
+  return create_counter_message(hostname, process_id, thread_id, hDevice, domain, power, ts,
+                                event_class, message_iterator, stream, backend);
+}
+
+// Builds a frequency sample message. Mind the argument order: `ts` comes before
+// `frequency` here, whereas create_power_message takes `power` before `ts`.
+bt_message* create_frequency_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id,
+    const uintptr_t hDevice, const uint32_t domain, const uint64_t ts, const uint64_t frequency,
+    bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend) {
+  return create_counter_message(hostname, process_id, thread_id, hDevice, domain, frequency, ts,
+                                event_class, message_iterator, stream, backend);
+}
+
 bt_message* create_host_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id,
     const char* name, const uint64_t ts, const uint64_t duration, const bool err,
     bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend) {
diff --git a/utils/xprof_utils.hpp b/utils/xprof_utils.hpp
index cb65f43f..7136a95d 100644
--- a/utils/xprof_utils.hpp
+++ b/utils/xprof_utils.hpp
@@ -110,6 +110,15 @@ const char *borrow_hostname(const bt_event *);
 process_id_t borrow_process_id(const bt_event *);
 thread_id_t borrow_thread_id(const bt_event *);
 
+bt_message* create_power_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id,
+    const uintptr_t hDevice, const uint32_t domain, const uint64_t power, const uint64_t ts,
+    bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend = BACKEND_UNKNOWN);
+
+
+bt_message* create_frequency_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id,
+    const uintptr_t hDevice, const uint32_t domain, const uint64_t ts, const uint64_t frequency,
+    bt_event_class *event_class, bt_self_message_iterator *message_iterator, bt_stream *stream, backend_t backend = BACKEND_UNKNOWN);
+
 bt_message *create_host_message(const char *hostname, const process_id_t, const thread_id_t,
                                 const char *name, const uint64_t ts, const uint64_t duration, const
bool err, bt_event_class *, bt_self_message_iterator *, diff --git a/xprof/btx_interval_model.yaml b/xprof/btx_interval_model.yaml index 31b526b1..f62aac1a 100644 --- a/xprof/btx_interval_model.yaml +++ b/xprof/btx_interval_model.yaml @@ -89,3 +89,41 @@ :type: integer_unsigned :field_value_range: 64 :cast_type: uint64_t + - :name: lttng:frequency + :payload_field_class: + :type: structure + :members: + - :name: did + :field_class: + :type: integer_unsigned + :field_value_range: 64 + :cast_type: uint64_t + - :name: domain + :field_class: + :type: integer_unsigned + :field_value_range: 32 + :cast_type: uint32_t + - :name: frequency + :field_class: + :type: integer_unsigned + :field_value_range: 64 + :cast_type: uint64_t + - :name: lttng:power + :payload_field_class: + :type: structure + :members: + - :name: did + :field_class: + :type: integer_unsigned + :field_value_range: 64 + :cast_type: uint64_t + - :name: domain + :field_class: + :type: integer_unsigned + :field_value_range: 32 + :cast_type: uint32_t + - :name: power + :field_class: + :type: integer_unsigned + :field_value_range: 64 + :cast_type: uint64_t diff --git a/xprof/btx_timeline.cpp b/xprof/btx_timeline.cpp index 00a40aef..98d060ba 100644 --- a/xprof/btx_timeline.cpp +++ b/xprof/btx_timeline.cpp @@ -26,6 +26,12 @@ struct timeline_dispatch_s { std::unordered_map, std::map> track2lasts; + + std::unordered_map hp2frqtracks; + std::unordered_map hp2pwrtracks; + std::unordered_map hp_devs2frqtracks; + std::unordered_map hp_devs2pwrtracks; + perfetto_pruned::Trace trace; }; using timeline_dispatch_t = struct timeline_dispatch_s; @@ -36,6 +42,103 @@ static perfetto_uuid_t gen_perfetto_uuid() { return uuid++; } +static perfetto_uuid_t get_parent_counter_track_uuid(timeline_dispatch_t *dispatch, std::unordered_map &parent_tracks, + const std::string track_name, std::string hostname, uint64_t process_id) { + perfetto_uuid_t hp_uuid = 0; + auto [it, inserted] = parent_tracks.insert({{hostname, process_id}, 
hp_uuid}); + auto &potential_uuid = it->second; + // Exists + if (!inserted) + return potential_uuid; + + hp_uuid = gen_perfetto_uuid(); + potential_uuid = hp_uuid; + + // Create packet with track descriptor + auto *packet = dispatch->trace.add_packet(); + packet->set_trusted_packet_sequence_id(10000); + packet->set_timestamp(0); + // TODO: check if this is required + packet->set_previous_packet_dropped(true); + auto *track_descriptor = packet->mutable_track_descriptor(); + track_descriptor->set_uuid(hp_uuid); + auto *process = track_descriptor->mutable_process(); + process->set_pid(hp_uuid); + std::ostringstream oss; + oss << "Hostname " << hostname << " | Process " << process_id; + oss << " | " << track_name << " | uuid "; + process->set_process_name(oss.str()); + return hp_uuid; +} + +static perfetto_uuid_t get_counter_track_uuuid(timeline_dispatch_t *dispatch, std::unordered_map &parent_tracks, + std::unordered_map &counter_tracks, const std::string track_name, + std::string hostname, uint64_t process_id, thapi_device_id did) { + perfetto_uuid_t hp_dev_uuid = 0; + auto [it, inserted] = counter_tracks.insert({{hostname, process_id, did}, hp_dev_uuid}); + auto &potential_uuid = it->second; + // Exists + if (!inserted) + return potential_uuid; + + perfetto_uuid_t hp_uuid = get_parent_counter_track_uuid(dispatch, parent_tracks, track_name, hostname, process_id); + hp_dev_uuid = gen_perfetto_uuid(); + potential_uuid = hp_dev_uuid; + + // Create new track + auto *packet = dispatch->trace.add_packet(); + packet->set_timestamp(0); + packet->set_trusted_packet_sequence_id(10000); + auto *track_descriptor = packet->mutable_track_descriptor(); + track_descriptor->set_uuid(hp_dev_uuid); + track_descriptor->set_parent_uuid(hp_uuid); + std::ostringstream oss; + oss << "Device " << did; + track_descriptor->set_name(oss.str()); + track_descriptor->mutable_counter(); + return hp_dev_uuid; +} + +static perfetto_uuid_t get_frequency_track_uuuid(timeline_dispatch_t *dispatch, 
std::string hostname, + uint64_t process_id, thapi_device_id did) { + return get_counter_track_uuuid(dispatch, dispatch->hp2frqtracks, dispatch->hp_devs2frqtracks, "GPU Frequency", hostname, process_id, did); +} + +static perfetto_uuid_t get_power_track_uuuid(timeline_dispatch_t *dispatch, std::string hostname, + uint64_t process_id, thapi_device_id did) { + return get_counter_track_uuuid(dispatch, dispatch->hp2pwrtracks, dispatch->hp_devs2pwrtracks, "GPU Power", hostname, process_id, did); +} + +static void add_event_frequency(timeline_dispatch_t *dispatch, std::string hostname, + uint64_t process_id, uint64_t thread_id, uintptr_t did, + uint32_t domain, uint64_t timestamp, uint64_t frequency) { + (void)domain; + perfetto_uuid_t track_uuid = get_frequency_track_uuuid(dispatch, hostname, process_id, did); + auto *packet = dispatch->trace.add_packet(); + packet->set_trusted_packet_sequence_id(10000); + packet->set_timestamp(timestamp); + auto *track_event = packet->mutable_track_event(); + track_event->set_type(perfetto_pruned::TrackEvent::TYPE_COUNTER); + track_event->set_track_uuid(track_uuid); + track_event->set_name("Frequency"); + track_event->set_counter_value(frequency); +} + +static void add_event_power(timeline_dispatch_t *dispatch, std::string hostname, + uint64_t process_id, uint64_t thread_id, uintptr_t did, + uint32_t domain, uint64_t timestamp, uint64_t power) { + (void)domain; + perfetto_uuid_t track_uuid = get_power_track_uuuid(dispatch, hostname, process_id, did); + auto *packet = dispatch->trace.add_packet(); + packet->set_trusted_packet_sequence_id(10000); + packet->set_timestamp(timestamp); + auto *track_event = packet->mutable_track_event(); + track_event->set_type(perfetto_pruned::TrackEvent::TYPE_COUNTER); + track_event->set_track_uuid(track_uuid); + track_event->set_name("Power"); + track_event->set_counter_value(power); +} + static void add_event_begin(timeline_dispatch_t *dispatch, perfetto_uuid_t uuid, timestamp_t begin, std::string name) 
{ auto *packet = dispatch->trace.add_packet(); @@ -239,9 +342,25 @@ static void device_usr_callback(void *btx_handle, void *usr_data, const char *ho add_event_async(dispatch, hostname, vpid, vtid, did, sdid, name, ts, dur); } +static void frequency_usr_callback(void *btx_handle, void *usr_data, const char *hostname, + int64_t vpid, uint64_t vtid, int64_t ts, int64_t backend, + uint64_t did, uint32_t domain, uint64_t frequency) { + auto *dispatch = static_cast(usr_data); + add_event_frequency(dispatch, hostname, vpid, vtid, did, domain, ts, frequency); +} + +static void power_usr_callback(void *btx_handle, void *usr_data, const char *hostname, + int64_t vpid, uint64_t vtid, int64_t ts, int64_t backend, + uint64_t did, uint32_t domain, uint64_t power) { + auto *dispatch = static_cast(usr_data); + add_event_power(dispatch, hostname, vpid, vtid, did, domain, ts, power); +} + void btx_register_usr_callbacks(void *btx_handle) { btx_register_callbacks_lttng_host(btx_handle, &host_usr_callback); btx_register_callbacks_lttng_device(btx_handle, &device_usr_callback); + btx_register_callbacks_lttng_frequency(btx_handle, &frequency_usr_callback); + btx_register_callbacks_lttng_power(btx_handle, &power_usr_callback); btx_register_callbacks_initialize_usr_data(btx_handle, &btx_initialize_usr_data); btx_register_callbacks_finalize_usr_data(btx_handle, &btx_finalize_usr_data); } diff --git a/xprof/interval.c.erb b/xprof/interval.c.erb index 3412390f..8e09ef4d 100644 --- a/xprof/interval.c.erb +++ b/xprof/interval.c.erb @@ -97,6 +97,8 @@ bt_component_class_initialize_method_status <%= namespace %>_dispatch_initialize dispatch->device_event_class = create_lttng_device_event_class_message(trace_class, stream_class); dispatch->traffic_event_class = create_lttng_traffic_event_class_message(trace_class, stream_class); dispatch->device_name_event_class = create_lttng_device_name_event_class_message(trace_class, stream_class); + dispatch->frequency_event_class = 
create_lttng_frequency_event_class_message(trace_class, stream_class); + dispatch->power_event_class = create_lttng_power_event_class_message(trace_class, stream_class); /* Create a default trace from (instance of `trace_class`) */ bt_trace *trace = bt_trace_create(trace_class); diff --git a/xprof/interval.h.erb b/xprof/interval.h.erb index ce94241f..2ebdb184 100644 --- a/xprof/interval.h.erb +++ b/xprof/interval.h.erb @@ -56,7 +56,8 @@ struct <%= namespace %>_dispatch { bt_event_class *device_event_class; bt_event_class *traffic_event_class; bt_event_class *device_name_event_class; - + bt_event_class *frequency_event_class; + bt_event_class *power_event_class; /* Component's input port (weak) */ bt_self_component_port_input *in_port; }; diff --git a/xprof/interval_model.yaml b/xprof/interval_model.yaml index 098bb85e..0a9df04a 100644 --- a/xprof/interval_model.yaml +++ b/xprof/interval_model.yaml @@ -53,4 +53,28 @@ - :name: name :class: string - :name: size - :class: unsigned + :class: unsigned +- :name: lttng:frequency + :payload: + - :name: did + :class: unsigned + :class_properties: + :preferred_display_base: 16 + - :name: domain + :class: unsigned + :class_properties: + :field_value_range: 32 + - :name: frequency + :class: unsigned +- :name: lttng:power + :payload: + - :name: did + :class: unsigned + :class_properties: + :preferred_display_base: 16 + - :name: domain + :class: unsigned + :class_properties: + :field_value_range: 32 + - :name: power + :class: unsigned diff --git a/xprof/perfetto_prunned.proto b/xprof/perfetto_prunned.proto index 34719c2f..0f9de5e6 100644 --- a/xprof/perfetto_prunned.proto +++ b/xprof/perfetto_prunned.proto @@ -1,7 +1,32 @@ +// AUTOGENERATED - DO NOT EDIT +// --------------------------- +// This file has been generated by +// AOSP://external/perfetto/tools/gen_merged_protos +// merging the perfetto config protos. +// This fused proto is intended to be copied in: +// - Android tree, for statsd. +// - Google internal repos. 
+ syntax = "proto2"; -// "There Is a Light That Never Goes Out" -// https://android.googlesource.com/platform/external/perfetto/+/refs/heads/master/protos/perfetto/trace/ + package perfetto_pruned; +// Begin of protos/perfetto/common/track_event_descriptor.proto + +message TrackEventCategory { + optional string name = 1; + optional string description = 2; + repeated string tags = 3; +} + +message TrackEventDescriptor { + repeated TrackEventCategory available_categories = 1; +} + +message DataSourceDescriptor { + optional TrackEventDescriptor track_event_descriptor = 6 [lazy = true]; + +} +// Begin of protos/perfetto/common/builtin_clock.proto enum BuiltinClock { BUILTIN_CLOCK_UNKNOWN = 0; @@ -12,65 +37,655 @@ enum BuiltinClock { BUILTIN_CLOCK_MONOTONIC_RAW = 5; BUILTIN_CLOCK_BOOTTIME = 6; BUILTIN_CLOCK_MAX_ID = 63; - reserved 7, 8, 9; + + reserved 7, 8; + + // An internal CL (ag/16521245) has taken this for BUILTIN_CLOCK_TSC. + // That might get upstreamed later on. Avoid diverging on this ID in future. + reserved 9; } -message ProcessDescriptor { - optional int32 pid = 1; - optional string process_name = 6; +message TrackEventConfig { + // The following fields define the set of enabled trace categories. Each list + // item is a glob. + // + // Default: [] + repeated string disabled_categories = 1; + + // Default: [] + repeated string enabled_categories = 2; + + // Default: ["slow", "debug"] + repeated string disabled_tags = 3; + + // Default: [] + repeated string enabled_tags = 4; + + // Default: false (i.e. enabled by default) + optional bool disable_incremental_timestamps = 5; + + // Allows to specify a custom unit different than the default (ns). + // Also affects thread timestamps if enable_thread_time_sampling = true. + // A multiplier of 1000 means that a timestamp = 3 should be interpreted as + // 3000 ns = 3 us. + // Default: 1 (if unset, it should be read as 1). 
+ optional uint64 timestamp_unit_multiplier = 6; + +} + +// End of protos/perfetto/config/track_event/track_event_config.proto + +// Begin of protos/perfetto/config/data_source_config.proto + +// The configuration that is passed to each data source when starting tracing. +// Next id: 124 +message DataSourceConfig { + // Data source unique name, e.g., "linux.ftrace". This must match + // the name passed by the data source when it registers (see + // RegisterDataSource()). + optional string name = 1; + + // Data source name: track_event + optional TrackEventConfig track_event_config = 113 [lazy = true]; + // optional string legacy_config = 1000; + + // This field is only used for testing. + //optional TestConfig for_testing = 1001; + + // Was |for_testing|. Caused more problems then found. + reserved 268435455; +} + +// Next id: 38. +message TraceConfig { + message BufferConfig { + optional uint32 size_kb = 1; + + // |page_size|, now deprecated. + reserved 2; + + // |optimize_for|, now deprecated. + reserved 3; + + enum FillPolicy { + UNSPECIFIED = 0; + + // Default behavior. The buffer operates as a conventional ring buffer. + // If the writer is faster than the reader (or if the reader reads only + // after tracing is stopped) newly written packets will overwrite old + // packets. + RING_BUFFER = 1; + + // Behaves like RING_BUFFER as long as there is space in the buffer or + // the reader catches up with the writer. As soon as the writer hits + // an unread chunk, it stops accepting new data in the buffer. + DISCARD = 2; + } + optional FillPolicy fill_policy = 4; + } + repeated BufferConfig buffers = 1; + + optional int64 trace_uuid_msb = 27 [deprecated = true]; + optional int64 trace_uuid_lsb = 28 [deprecated = true]; + +} +// A snapshot of clock readings to allow for trace alignment. +message ClockSnapshot { + message Clock { + // DEPRECATED. This enum has moved to ../common/builtin_clock.proto. 
+ enum BuiltinClocks { + UNKNOWN = 0; + REALTIME = 1; + REALTIME_COARSE = 2; + MONOTONIC = 3; + MONOTONIC_COARSE = 4; + MONOTONIC_RAW = 5; + BOOTTIME = 6; + BUILTIN_CLOCK_MAX_ID = 63; + + reserved 7, 8; + } + + // Clock IDs have the following semantic: + // [1, 63]: Builtin types, see BuiltinClock from + // ../common/builtin_clock.proto. + // [64, 127]: User-defined clocks. These clocks are sequence-scoped. They + // are only valid within the same |trusted_packet_sequence_id| + // (i.e. only for TracePacket(s) emitted by the same TraceWriter + // that emitted the clock snapshot). + // [128, MAX]: Reserved for future use. The idea is to allow global clock + // IDs and setting this ID to hash(full_clock_name) & ~127. + optional uint32 clock_id = 1; + + // Absolute timestamp. Unit is ns unless specified otherwise by the + // unit_multiplier_ns field below. + optional uint64 timestamp = 2; + + // When true each TracePacket's timestamp should be interpreted as a delta + // from the last TracePacket's timestamp (referencing this clock) emitted by + // the same packet_sequence_id. Should only be used for user-defined + // sequence-local clocks. The first packet timestamp after each + // ClockSnapshot that contains this clock is relative to the |timestamp| in + // the ClockSnapshot. + optional bool is_incremental = 3; + + // Allows to specify a custom unit different than the default (ns) for this + // clock domain. A multiplier of 1000 means that a timestamp = 3 should be + // interpreted as 3000 ns = 3 us. All snapshots for the same clock within a + // trace need to use the same unit. + optional uint64 unit_multiplier_ns = 4; + } + repeated Clock clocks = 1; + + // The authoritative clock domain for the trace. Defaults to BOOTTIME, but can + // be overridden in TraceConfig's builtin_data_sources. Trace processor will + // attempt to translate packet/event timestamps from various data sources (and + // their chosen clock domains) to this domain during import. 
+ optional BuiltinClock primary_trace_clock = 2; +} +/* +message FieldOptions { + // The packed option can be enabled for repeated primitive fields to enable + // a more efficient representation on the wire. Rather than repeatedly + // writing the tag and type for each element, the entire array is encoded as + // a single length-delimited blob. In proto3, only explicit setting it to + // false will avoid using packed encoding. + optional bool packed = 2; +} +*/ +// Describes a oneof. +message OneofDescriptorProto { + optional string name = 1; + optional OneofOptions options = 2; +} + +// Describes an enum type. +message EnumDescriptorProto { + optional string name = 1; + + repeated EnumValueDescriptorProto value = 2; + + reserved 3; + reserved 4; + + // Reserved enum value names, which may not be reused. A given name may only + // be reserved once. + repeated string reserved_name = 5; +} + +// Describes a value within an enum. +message EnumValueDescriptorProto { + optional string name = 1; + optional int32 number = 2; + + reserved 3; +} + +message OneofOptions { + reserved 999; + + // Clients can define custom options in extensions of this message. See above. + extensions 1000 to max; +} +// Next reserved id: 13 (up to 15). Next id: 50. +message TrackEvent { + // Names of categories of the event. In the client library, categories are a + // way to turn groups of individual events on or off. + // interned EventCategoryName. + repeated uint64 category_iids = 3; + // non-interned variant. + repeated string categories = 22; + + // Optional name of the event for its display in trace viewer. May be left + // unspecified for events with typed arguments. + // + // Note that metrics should not rely on event names, as they are prone to + // changing. Instead, they should use typed arguments to identify the events + // they are interested in. + oneof name_field { + // interned EventName. + uint64 name_iid = 10; + // non-interned variant. 
+ string name = 23; + } + + // TODO(eseckler): Support using binary symbols for category/event names. + + // Type of the TrackEvent (required if |phase| in LegacyEvent is not set). + enum Type { + TYPE_UNSPECIFIED = 0; + + + TYPE_SLICE_BEGIN = 1; + TYPE_SLICE_END = 2; + + // Instant events are nestable events without duration. They can be children + // of slice events on the same track. + TYPE_INSTANT = 3; + + // Event that provides a value for a counter track. |track_uuid| should + // refer to a counter track and |counter_value| set to the new value. Note + // that most other TrackEvent fields (e.g. categories, name, ..) are not + // supported for TYPE_COUNTER events. See also CounterDescriptor. + TYPE_COUNTER = 4; + } + optional Type type = 9; + + // Identifies the track of the event. The default value may be overridden + // using TrackEventDefaults, e.g., to specify the track of the TraceWriter's + // sequence (in most cases sequence = one thread). If no value is specified + // here or in TrackEventDefaults, the TrackEvent will be associated with an + // implicit trace-global track (uuid 0). See TrackDescriptor::uuid. + optional uint64 track_uuid = 11; + + // A new value for a counter track. |track_uuid| should refer to a track with + // a CounterDescriptor, and |type| should be TYPE_COUNTER. For a more + // efficient encoding of counter values that are sampled at the beginning/end + // of a slice, see |extra_counter_values| and |extra_counter_track_uuids|. + // Counter values can optionally be encoded in as delta values (positive or + // negative) on each packet sequence (see CounterIncrementalBase). 
+ oneof counter_value_field { + int64 counter_value = 30; + double double_counter_value = 44; + } + + // --------------------------------------------------------------------------- + // TrackEvent arguments: + // --------------------------------------------------------------------------- + + // This field is used only if the source location represents the function that + // executes during this event. + + // Extension range for future use. + extensions 1000 to 9899; + // Reserved for Perfetto unit and integration tests. + extensions 9900 to 10000; + + // --------------------------------------------------------------------------- + // Deprecated / legacy event fields, which will be removed in the future: + // --------------------------------------------------------------------------- + + // Deprecated. Use the |timestamp| and |timestamp_clock_id| fields in + // TracePacket instead. + // +} + +// Default values for fields of all TrackEvents on the same packet sequence. +// Should be emitted as part of TracePacketDefaults whenever incremental state +// is cleared. It's defined here because field IDs should match those of the +// corresponding fields in TrackEvent. +message TrackEventDefaults { + optional uint64 track_uuid = 11; + repeated uint64 extra_counter_track_uuids = 31; + repeated uint64 extra_double_counter_track_uuids = 45; + + // TODO(eseckler): Support default values for more TrackEvent fields. +} + +// -------------------- +// Interned data types: +// -------------------- + +message EventCategory { + optional uint64 iid = 1; + optional string name = 2; +} + +message EventName { + optional uint64 iid = 1; + optional string name = 2; +} + +// Begin of protos/perfetto/trace/ps/process_tree.proto + +// Metadata about the processes and threads in the trace. +// Note: this proto was designed to be filled in by traced_probes and should +// only be populated with accurate information coming from the system. 
Other +// trace writers should prefer to fill ThreadDescriptor and ProcessDescriptor +// in TrackDescriptor. +message ProcessTree { + // Representation of a thread. + message Thread { + // The thread ID (as per gettid()) in the root PID namespace. + optional int32 tid = 1; + + // Thread group id (i.e. the PID of the process, == TID of the main thread) + optional int32 tgid = 3; + + // The name of the thread. + optional string name = 2; + + // The non-root-level thread IDs if the thread runs in a PID namespace. Read + // from the NSpid entry of /proc//status, with the first element (root- + // level thread ID) omitted. + repeated int32 nstid = 4; + } + + // Representation of a process. + message Process { + // The UNIX process ID, aka thread group ID (as per getpid()) in the root + // PID namespace. + optional int32 pid = 1; + + // The parent process ID, as per getppid(). + optional int32 ppid = 2; + + // The command line for the process, as per /proc/pid/cmdline. + // If it is a kernel thread there will only be one cmdline field + // and it will contain /proc/pid/comm. + // repeated string cmdline = 3; + + // No longer used as of Apr 2018, when the dedicated |threads| field was + // introduced in ProcessTree. + //repeated Thread threads_deprecated = 4 [deprecated = true]; + + // The uid for the process, as per /proc/pid/status. + optional int32 uid = 5; + + // The non-root-level process IDs if the process runs in a PID namespace. + // Read from the NSpid entry of /proc//status, with the first element ( + // root-level process ID) omitted. + repeated int32 nspid = 6; + } + + // List of processes and threads in the client. These lists are incremental + // and not exhaustive. A process and its threads might show up separately in + // different ProcessTree messages. 
A thread might event not show up at all, if + // no sched_switch activity was detected, for instance: + // #0 { processes: [{pid: 10, ...}], threads: [{pid: 11, tgid: 10}] } + // #1 { threads: [{pid: 12, tgid: 10}] } + // #2 { processes: [{pid: 20, ...}], threads: [{pid: 13, tgid: 10}] } + repeated Process processes = 1; + repeated Thread threads = 2; + + // The time at which we finish collecting this process tree; + // the top-level packet timestamp is the time at which + // we begin collection. + optional uint64 collection_end_timestamp = 3; } message TracePacketDefaults { optional uint32 timestamp_clock_id = 58; + + // Default values for TrackEvents (e.g. default track). + optional TrackEventDefaults track_event_defaults = 11; + + // Defaults for perf profiler packets (PerfSample). + // optional PerfSampleDefaults perf_sample_defaults = 12; } +// End of protos/perfetto/trace/trace_packet_defaults.proto +// Begin of protos/perfetto/trace/trace_uuid.proto + +// A random unique ID that identifies the trace. +// This message has been introduced in v32. Prior to that, the UUID was +// only (optionally) present in the TraceConfig.trace_uuid_msb/lsb fields. +// This has been moved to a standalone packet to deal with new use-cases for +// go/gapless-aot, where the same tracing session can be serialized several +// times, in which case the UUID is changed on each snapshot and does not match +// the one in the TraceConfig. +message TraceUuid { + optional int64 msb = 1; + optional int64 lsb = 2; +} + +// End of protos/perfetto/trace/trace_uuid.proto + +// Begin of protos/perfetto/trace/track_event/process_descriptor.proto + +// Describes a process's attributes. Emitted as part of a TrackDescriptor, +// usually by the process's main thread. +// +// Next id: 9. +message ProcessDescriptor { + optional int32 pid = 1; + repeated string cmdline = 2; + optional string process_name = 6; + + optional int32 process_priority = 5; + // Process start time in nanoseconds. 
+ // The timestamp refers to the trace clock by default. Other clock IDs + // provided in TracePacket are not supported. + optional int64 start_timestamp_ns = 7; + + // Labels can be used to further describe properties of the work performed by + // the process. For example, these can be used by Chrome renderer process to + // provide titles of frames being rendered. + repeated string process_labels = 8; +} + +// End of protos/perfetto/trace/track_event/process_descriptor.proto + +// Begin of protos/perfetto/trace/track_event/thread_descriptor.proto + +// Describes a thread's attributes. Emitted as part of a TrackDescriptor, +// usually by the thread's trace writer. +// +// Next id: 9. message ThreadDescriptor { optional int32 pid = 1; optional int32 tid = 2; + optional string thread_name = 5; + + // --------------------------------------------------------------------------- + // Deprecated / legacy fields, which will be removed in the future: + // --------------------------------------------------------------------------- + + optional int64 reference_timestamp_us = 6; + + // Absolute reference values. Clock values in subsequent TrackEvents can be + // encoded accumulatively and relative to these. This reduces their var-int + // encoding size. + // TODO(eseckler): Deprecated. Replace these with ClockSnapshot encoding. + optional int64 reference_thread_time_us = 7; + optional int64 reference_thread_instruction_count = 8; + + // To support old UI. New UI should determine default sorting by thread_type. + optional int32 legacy_sort_index = 3; } +message CounterDescriptor { + // Built-in counters, usually with special meaning in the client library, + // trace processor, legacy JSON format, or UI. Trace processor will infer a + // track name from the enum value if none is provided in TrackDescriptor. + enum BuiltinCounterType { + COUNTER_UNSPECIFIED = 0; + + // Thread-scoped counters. 
The thread's track should be specified via + // |parent_uuid| in the TrackDescriptor for such a counter. + + // implies UNIT_TIME_NS. + COUNTER_THREAD_TIME_NS = 1; + + // implies UNIT_COUNT. + COUNTER_THREAD_INSTRUCTION_COUNT = 2; + } + + // Type of the values for the counters - to supply lower granularity units, + // see also |unit_multiplier|. + enum Unit { + UNIT_UNSPECIFIED = 0; + UNIT_TIME_NS = 1; + UNIT_COUNT = 2; + UNIT_SIZE_BYTES = 3; + // TODO(eseckler): Support more units as necessary. + } + + // For built-in counters (e.g. thread time). Custom user-specified counters + // (e.g. those emitted by TRACE_COUNTER macros of the client library) + // shouldn't set this, and instead provide a counter name via TrackDescriptor. + optional BuiltinCounterType type = 1; + + // Names of categories of the counter (usually for user-specified counters). + // In the client library, categories are a way to turn groups of individual + // counters (or events) on or off. + repeated string categories = 2; + + // Type of the counter's values. Built-in counters imply a value for this + // field. + optional Unit unit = 3; + + // In order to use a unit not defined as a part of |Unit|, a free-form unit + // name can be used instead. + optional string unit_name = 6; + // TODO(eseckler): Support arguments describing the counter (?). + // repeated DebugAnnotation debug_annotations; +} +// As a fallback, TrackEvents emitted without an explicit track association will +// be associated with an implicit trace-global track (uuid = 0), see also +// |TrackEvent::track_uuid|. It is possible but not necessary to emit a +// TrackDescriptor for this implicit track. +// +// Next id: 10. message TrackDescriptor { + // Unique ID that identifies this track. This ID is global to the whole trace. + // Producers should ensure that it is unlikely to clash with IDs emitted by + // other producers. A value of 0 denotes the implicit trace-global track. 
+ // + // For example, legacy TRACE_EVENT macros may use a hash involving the async + // event id + id_scope, pid, and/or tid to compute this ID. optional uint64 uuid = 1; + + // A parent track reference can be used to describe relationships between + // tracks. For example, to define an asynchronous track which is scoped to a + // specific process, specify the uuid for that process's process track here. + // Similarly, to associate a COUNTER_THREAD_TIME_NS counter track with a + // thread, specify the uuid for that thread's thread track here. optional uint64 parent_uuid = 5; - optional string name = 2; - optional ProcessDescriptor process = 3; - optional ThreadDescriptor thread = 4; -} -message TrackEvent { + // Name of the track. Optional - if unspecified, it may be derived from the + // process/thread name (process/thread tracks), the first event's name (async + // tracks), or counter name (counter tracks). + optional string name = 2; - enum Type { - TYPE_SLICE_BEGIN = 1; - TYPE_SLICE_END = 2; - } + // Associate the track with a process, making it the process-global track. + // There should only be one such track per process (usually for instant + // events; trace processor uses this fact to detect pid reuse). If you need + // more (e.g. for asynchronous events), create child tracks using parent_uuid. + // + // Trace processor will merge events on a process track with slice-type events + // from other sources (e.g. ftrace) for the same process into a single + // timeline view. + optional ProcessDescriptor process = 3; + // optional ChromeProcessDescriptor chrome_process = 6; - optional Type type = 9; - optional uint64 track_uuid = 11; + // Associate the track with a thread, indicating that the track's events + // describe synchronous code execution on the thread. There should only be one + // such track per thread (trace processor uses this fact to detect tid reuse). 
+ // + // Trace processor will merge events on a thread track with slice-type events + // from other sources (e.g. ftrace) for the same thread into a single timeline + // view. + optional ThreadDescriptor thread = 4; + // optional ChromeThreadDescriptor chrome_thread = 7; - oneof name_field { - string name = 23; - } + // Descriptor for a counter track. If set, the track will only support + // TYPE_COUNTER TrackEvents (and values provided via TrackEvent's + // |extra_counter_values|). + optional CounterDescriptor counter = 8; + // If true, forces Trace Processor to use separate tracks for track events + // and system events for the same thread. + // Track events timestamps in Chrome have microsecond resolution, while + // system events use nanoseconds. It results in broken event nesting when + // track events and system events share a track. + optional bool disallow_merging_with_system_tracks = 9; } +// End of protos/perfetto/trace/track_event/track_descriptor.proto + +// Next id: 95. message TracePacket { + // The timestamp of the TracePacket. + // The clock domain definition in ClockSnapshot can also override: + // - The unit (default: 1ns). + // - The absolute vs delta encoding (default: absolute timestamp). optional uint64 timestamp = 8; + // Specifies the ID of the clock used for the TracePacket |timestamp|. Can be + // one of the built-in types from ClockSnapshot::BuiltinClocks, or a + // producer-defined clock id. + // If unspecified and if no default per-sequence value has been provided via + // TracePacketDefaults, it defaults to BuiltinClocks::BOOTTIME. + optional uint32 timestamp_clock_id = 58; + oneof data { +// ProcessTree process_tree = 2; + //ProcessStats process_stats = 9; + //InodeFileMap inode_file_map = 4; + //ChromeEventBundle chrome_events = 5; + // ClockSnapshot clock_snapshot = 6; + //SysStats sys_stats = 7; TrackEvent track_event = 11; + + // IDs up to 15 are reserved. 
They take only one byte to encode their + // preamble so should be used for frequent events. + + TraceUuid trace_uuid = 89; + TraceConfig trace_config = 33; + + // UiState ui_state = 78; + + + // Only used by TrackEvent. TrackDescriptor track_descriptor = 60; + + // Deprecated, use TrackDescriptor instead. + ProcessDescriptor process_descriptor = 43; + + // Deprecated, use TrackDescriptor instead. + ThreadDescriptor thread_descriptor = 44; + + // This field is emitted at periodic intervals (~10s) and + // contains always the binary representation of the UUID + // {82477a76-b28d-42ba-81dc-33326d57a079}. This is used to be able to + // efficiently partition long traces without having to fully parse them. + // bytes synchronization_marker = 36; + + //TestEvent for_testing = 900; + //gpu_freq.Packet gpu_freq_packet = 1001; } + // TODO(eseckler): Emit this field in a PacketSequenceDescriptor message + // instead. + oneof optional_trusted_uid { int32 trusted_uid = 3; }; + + // Service-assigned identifier of the packet sequence this packet belongs to. + // Uniquely identifies a producer + writer pair within the tracing session. A + // value of zero denotes an invalid ID. Keep in sync with + // TrustedPacket.trusted_packet_sequence_id. oneof optional_trusted_packet_sequence_id { uint32 trusted_packet_sequence_id = 10; } - required TracePacketDefaults trace_packet_defaults = 59; + // Trusted process id of the producer which generated this packet, written by + // the service. + optional int32 trusted_pid = 79; + + optional uint32 sequence_flags = 13; + + // TracePacketDefaults always override the global defaults for any future + // packet on this sequence (regardless of SEQ_NEEDS_INCREMENTAL_STATE). + optional TracePacketDefaults trace_packet_defaults = 59; + + // When packet loss occurs, incrementally emitted data (including interned + // data) on the sequence should be considered invalid up until the next packet + // with SEQ_INCREMENTAL_STATE_CLEARED set. 
optional bool previous_packet_dropped = 42; + // Given that older SDK versions do not support this flag, this flag not + // being present for a particular sequence does not necessarily imply data + // loss. + optional bool first_packet_on_sequence = 87; } +// End of protos/perfetto/trace/trace_packet.proto + +// Begin of protos/perfetto/trace/trace.proto + message Trace { repeated TracePacket packet = 1; -} + // Do NOT add any other field here. +} +// End of protos/perfetto/trace/trace.proto diff --git a/xprof/xprof.sh.erb.in b/xprof/xprof.sh.erb.in index 9cb63f46..61bfd64a 100644 --- a/xprof/xprof.sh.erb.in +++ b/xprof/xprof.sh.erb.in @@ -162,6 +162,7 @@ display_help() { echo " -v, --version Print the version string" echo " -r, --replay [path] will be treated as paths to traces folders ($HOME/lttng-traces/...)" echo " If no arguments are provided, will use the latest trace available" + echo " -s, --sample Activate sampling" echo echo " Example:" echo " $(basename $0) ./a.out" @@ -342,6 +343,18 @@ setup_lttng() { blocking-channel fi + #Activate sampling on non-blocking stream + if [ $sample == true ]; then + export LTTNG_UST_SAMPLING=1 + export LTTNG_UST_SAMPLING_ENERGY=1 + lttngq enable-channel --userspace nonblocking-channel + lttngq enable-event --channel=nonblocking-channel --userspace lttng_ust_sampling:* +<% if languages.include?("ze") %> + lttngq enable-event --channel=nonblocking-channel --userspace lttng_ust_ze_sampling:* +<% end %> + lttngq add-context --userspace --channel=nonblocking-channel -t vpid -t vtid + fi + <% if languages.include?("omp") %> enable_events_omp <% end %> @@ -574,6 +587,7 @@ asm=false replay=false cleanup=false profile=true +sample=false while (( "$#" )); do case "$1" in @@ -586,6 +600,7 @@ while (( "$#" )); do -l | --timeline) shift; mode="timeline" ;; -j | --json) shift; bt_tally_argv+=" --display_mode=json" ;; -m | --tracing-mode) shift; tracing_mode=$1; shift ;; + -s | --sample) shift; sample=true ;; --no-profile) shift; 
profile=false ;; --backend-level) shift; bt_tally_argv+=" --backend_level=$1"; shift ;; --no-save) shift; processing_mode="on-the-fly" ;; diff --git a/ze/Makefile.am b/ze/Makefile.am index 3d752e58..a114df5a 100644 --- a/ze/Makefile.am +++ b/ze/Makefile.am @@ -89,6 +89,7 @@ ZE_PROBES_INCL = $(ZE_PROBES:=.h) ZE_PROBES_SRC = $(ZE_PROBES:=.c) ZE_STATIC_PROBES = \ + ze_sampling \ ze_profiling \ ze_properties \ ze_build @@ -161,9 +162,9 @@ nodist_libTracerZE_la_SOURCES = \ $(ZE_STATIC_PROBES_INCL) \ tracer_ze.c -libTracerZE_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(srcdir)/include -I../utils -I./ +libTracerZE_la_CPPFLAGS = -I$(top_srcdir)/utils -I$(top_srcdir)/utils/include -I$(top_srcdir)/sampling -I$(srcdir)/include -I../utils -I./ libTracerZE_la_CFLAGS = -Wall -Wextra $(WERROR) $(LIBFFI_CFLAGS) $(LTTNG_UST_CFLAGS) -libTracerZE_la_LDFLAGS = $(LTTNG_UST_LIBS) -ldl -lpthread $(LIBFFI_LIBS) +libTracerZE_la_LDFLAGS = $(LTTNG_UST_LIBS) -ldl -lpthread $(LIBFFI_LIBS) ../sampling/libThapiSampling.la libTracerZE_la_LIBADD = libzetracepoints.la install-exec-hook: diff --git a/ze/gen_ze.rb b/ze/gen_ze.rb index bbcdf56d..e6b273b8 100644 --- a/ze/gen_ze.rb +++ b/ze/gen_ze.rb @@ -24,6 +24,7 @@ #include "zet_structs_tracepoints.h" #include "zes_structs_tracepoints.h" #include "zel_structs_tracepoints.h" +#include "ze_sampling.h" #include "ze_profiling.h" #include "ze_properties.h" #include "ze_build.h" diff --git a/ze/gen_ze_custom_probes.rb b/ze/gen_ze_custom_probes.rb index b57418dc..28441688 100644 --- a/ze/gen_ze_custom_probes.rb +++ b/ze/gen_ze_custom_probes.rb @@ -6,7 +6,7 @@ h = YAML::load_file(File.join(SRC_DIR,"ze_events.yaml"))[namespace] -raise "Invalid namespace!" unless h +raise "Invalid namespace: #{namespace}!" unless h puts <= 1 ? 
1:0); j++) { + readFrequency(i, j, &frequency); + do_tracepoint(lttng_ust_ze_sampling, gpu_frequency, (ze_device_handle_t)_sampling_hDevices[i], j, ts_us, frequency); + } + for (uint32_t j = 0; j < (_sampling_powerDomainCounts[i] >= 1 ? 1:0); j++) { + readEnergy(i, j, &ts_us, &energy_uj); + do_tracepoint(lttng_ust_ze_sampling, gpu_energy, (ze_device_handle_t)_sampling_hDevices[i], j, (uint64_t)energy_uj, ts_us); + } + } +} + static void _load_tracer(void) { char *s = NULL; void *handle = NULL; int verbose = 0; + struct timespec interval; + thapi_sampling_init(); s = getenv("LTTNG_UST_ZE_LIBZE_LOADER"); if (s) @@ -798,6 +940,15 @@ static void _load_tracer(void) { else if (verbose) fprintf(stderr, "Warning: LTTNG_UST_ZE_PARANOID_DRIFT not activated without LTTNG_UST_ZE_PROFILE\n"); } + + if (getenv("LTTNG_UST_SAMPLING_ENERGY")) { + initializeHandles(); + /* TODO: make it configurable */ + interval.tv_sec = 0; + interval.tv_nsec = 50000000; + thapi_register_sampling(&thapi_sampling_energy, &interval); + } + if (_do_profile) atexit(&_lib_cleanup); } diff --git a/ze/ze_events.yaml b/ze/ze_events.yaml index b8019a4d..4d914186 100644 --- a/ze/ze_events.yaml +++ b/ze/ze_events.yaml @@ -1,4 +1,28 @@ --- +lttng_ust_ze_sampling: + events: + - name: gpu_energy + args: + - [ ze_device_handle_t, hDevice ] + - [ uint32_t, domain] + - [ uint64_t, energy ] + - [ uint64_t, timestamp ] + fields: + - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] + - [ ctf_integer, uint32_t, domain, "domain" ] + - [ ctf_integer, uint64_t, energy, "energy" ] + - [ ctf_integer, uint64_t, timestamp, "timestamp" ] + - name: gpu_frequency + args: + - [ ze_device_handle_t, hDevice ] + - [ uint32_t, domain] + - [ uint64_t, timestamp ] + - [ uint64_t, frequency ] + fields: + - [ ctf_integer_hex, uintptr_t, hDevice, "(uintptr_t)hDevice" ] + - [ ctf_integer, uint32_t, domain, "domain" ] + - [ ctf_integer, uint64_t, timestamp, "timestamp" ] + - [ ctf_integer, uint64_t, frequency, "frequency" ] 
lttng_ust_ze_profiling: events: - name: event_profiling diff --git a/ze/zeinterval_callbacks.cpp.erb b/ze/zeinterval_callbacks.cpp.erb index d32e4ccf..8ca790bd 100644 --- a/ze/zeinterval_callbacks.cpp.erb +++ b/ze/zeinterval_callbacks.cpp.erb @@ -78,6 +78,39 @@ void *init_zeinterval_callbacks_state() { return (void*) s; } +static void create_and_enqueue_power_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id, + const uintptr_t hDevice, const uint32_t domain, const uint64_t energy, const uint64_t ts) { + zeinterval_callbacks_state* state = (zeinterval_callbacks_state*) zeinterval_iter_g->callbacks_state; + auto [it, inserted] = state->device_energy_ref.insert({{hostname, process_id, hDevice}, {energy, ts}}); + // First entry + if (inserted) + return; + + auto &[prev_energy, prev_ts] = it->second; + + bt_message *message = create_power_message(hostname, process_id, + thread_id, hDevice, domain, + static_cast(((energy-prev_energy) / static_cast(ts-prev_ts))*1000.0), + prev_ts, + zeinterval_iter_g->dispatch->power_event_class, + zeinterval_self_message_iterator_g, + zeinterval_iter_g->dispatch->stream, BACKEND_ZE); + state->downstream_message_queue.push(message); + prev_energy = energy; + prev_ts = ts; +} + +static void create_and_enqueue_frequency_message(const char* hostname, const process_id_t process_id, const thread_id_t thread_id, + const uintptr_t hDevice, const uint32_t domain, const uint64_t ts, const uint64_t frequency) { + bt_message *message = create_frequency_message(hostname, process_id, thread_id, hDevice, domain, ts, frequency, + zeinterval_iter_g->dispatch->frequency_event_class, + zeinterval_self_message_iterator_g, + zeinterval_iter_g->dispatch->stream, BACKEND_ZE); + + zeinterval_callbacks_state* state = (zeinterval_callbacks_state*) zeinterval_iter_g->callbacks_state; + state->downstream_message_queue.push(message); +} + static void create_and_enqueue_host_message(const char* hostname, const process_id_t 
process_id, const thread_id_t thread_id, const char* name, const uint64_t ts, const uint64_t duration, const bool err) { @@ -246,10 +279,24 @@ static void zeinterval_<%= dbt_event.name %>_callback( |_ (_) (_ (_| | | |_ \/ _|_ | | | (_) / %> - <% if dbt_event.name_unsanitized.start_with?('lttng_ust_ze:') or - dbt_event.name_unsanitized.start_with?('lttng_ust_zet:') or - dbt_event.name_unsanitized.start_with?('lttng_ust_zes:') or - dbt_event.name_unsanitized.start_with?('lttng_ust_zel:') %> + <% if dbt_event.name_unsanitized == "lttng_ust_ze_sampling:gpu_energy" %> + const hostname_t hostname = borrow_hostname(bt_evt); + const process_id_t process_id = 0; + const thread_id_t thread_id = 0; + int64_t ns_from_origin; + bt_clock_snapshot_get_ns_from_origin(bt_clock, &ns_from_origin); + create_and_enqueue_power_message(hostname.c_str(), process_id, thread_id, (uintptr_t)hDevice, domain, energy, ns_from_origin); + <% elsif dbt_event.name_unsanitized == "lttng_ust_ze_sampling:gpu_frequency" %> + const hostname_t hostname = borrow_hostname(bt_evt); + const process_id_t process_id = 0; + const thread_id_t thread_id = 0; + int64_t ns_from_origin; + bt_clock_snapshot_get_ns_from_origin(bt_clock, &ns_from_origin); + create_and_enqueue_frequency_message(hostname.c_str(), process_id, thread_id, (uintptr_t)hDevice, domain, ns_from_origin, frequency); + <% elsif dbt_event.name_unsanitized.start_with?('lttng_ust_ze:') or + dbt_event.name_unsanitized.start_with?('lttng_ust_zet:') or + dbt_event.name_unsanitized.start_with?('lttng_ust_zes:') or + dbt_event.name_unsanitized.start_with?('lttng_ust_zel:') %> const hostname_t hostname = borrow_hostname(bt_evt); const process_id_t process_id = borrow_process_id(bt_evt); diff --git a/ze/zeinterval_callbacks.hpp b/ze/zeinterval_callbacks.hpp index 47775510..cbc0027e 100644 --- a/ze/zeinterval_callbacks.hpp +++ b/ze/zeinterval_callbacks.hpp @@ -20,6 +20,7 @@ typedef hp_device_t hpd_t; typedef hp_event_t hpe_t; typedef hp_kernel_t hpk_t; 
typedef std::tuple clock_lttng_device_t; +typedef std::tuple energy_timestamp_t; typedef std::tuple t_tfnm_m_d_ts_cld_t; typedef std::tuple l_tfnm_m_d_ts_t; @@ -54,6 +55,8 @@ struct zeinterval_callbacks_state { /* Stack to get begin end */ std::unordered_map> last_command; + /*Energy */ + std::unordered_map device_energy_ref; }; template