From 01bf472599393443e2ad3d667e173ab66999fce5 Mon Sep 17 00:00:00 2001 From: Ian Lumsden Date: Sat, 20 Jul 2024 17:16:42 -0700 Subject: [PATCH 01/11] Reimplements the IntelTopdown service to support both Haswell/Broadwell calculations and Sapphire Rapids/Emerald Rapids calculations --- src/services/topdown/IntelTopdown.cpp | 649 ++++++++++++++++++++------ 1 file changed, 503 insertions(+), 146 deletions(-) diff --git a/src/services/topdown/IntelTopdown.cpp b/src/services/topdown/IntelTopdown.cpp index 80064bb2..1724f561 100644 --- a/src/services/topdown/IntelTopdown.cpp +++ b/src/services/topdown/IntelTopdown.cpp @@ -26,28 +26,24 @@ using namespace cali; namespace { -class IntelTopdown +enum IntelTopdownLevel { All = 1, Top = 2 }; + +class TopdownCalculator { - static const char* s_top_counters; - static const char* s_all_counters; +protected: - std::map counter_attrs; - std::map result_attrs; + IntelTopdownLevel m_level; - std::map counters_not_found; + const char* m_top_counters; + const char* m_all_counters; - unsigned num_top_computed; - unsigned num_top_skipped; - unsigned num_be_computed; - unsigned num_be_skipped; - unsigned num_fe_computed; - unsigned num_fe_skipped; - unsigned num_bsp_computed; - unsigned num_bsp_skipped; + const char* s_res_top[]; + const char* s_res_all[]; - enum Level { All = 1, Top = 2 }; + std::map m_counter_attrs; + std::map m_result_attrs; - Level level; + std::map m_counters_not_found; Variant get_val_from_rec(const std::vector& rec, const char* name) { @@ -64,14 +60,38 @@ class IntelTopdown if (it != rec.end()) ret = it->value(); else - ++counters_not_found[std::string(name)]; + ++m_counters_not_found[std::string(name)]; return ret; } +public: + + TopdownCalculator(IntelTopdownLevel level) : m_level(level) {} + + virtual std::vector compute_toplevel(const std::vector& rec) = 0; + + virtual std::size_t get_num_expected_toplevel() const = 0; + + virtual std::vector compute_retiring(const std::vector& rec) = 0; + + virtual 
std::size_t get_num_expected_retiring() const = 0; + + virtual std::vector compute_backend_bound(const std::vector& rec) = 0; + + virtual std::size_t get_num_expected_backend_bound() const = 0; + + virtual std::vector compute_frontend_bound(const std::vector& rec) = 0; + + virtual std::size_t get_num_expected_frontend_bound() const = 0; + + virtual std::vector compute_bad_speculation(const std::vector& rec) = 0; + + virtual std::size_t get_num_expected_bad_speculation() const = 0; + bool find_counter_attrs(CaliperMetadataAccessInterface& db) { - const char* list = (level == All ? s_all_counters : s_top_counters); + const char* list = (m_level == All ? m_all_counters : m_top_counters); auto counters = StringConverter(list).to_stringlist(); for (const auto& s : counters) { @@ -84,7 +104,7 @@ class IntelTopdown return false; } - counter_attrs[s] = attr; + m_counter_attrs[s] = attr; } return true; @@ -92,24 +112,80 @@ class IntelTopdown void make_result_attrs(CaliperMetadataAccessInterface& db) { - const char* res_top[] = { "retiring", "backend_bound", "frontend_bound", "bad_speculation", nullptr }; - const char* res_all[] = { "retiring", "backend_bound", "frontend_bound", - "bad_speculation", "branch_mispredict", "machine_clears", - "frontend_latency", "frontend_bandwidth", "memory_bound", - "core_bound", "ext_mem_bound", "l1_bound", - "l2_bound", "l3_bound", nullptr }; - - const char** res = (level == Top ? res_top : res_all); + const char** res = (m_level == Top ? 
m_res_top : m_res_all); for (const char** s = res; s && *s; ++s) - result_attrs[std::string(*s)] = db.create_attribute( + m_result_attrs[std::string(*s)] = db.create_attribute( std::string("topdown.") + (*s), CALI_TYPE_DOUBLE, CALI_ATTR_ASVALUE | CALI_ATTR_SKIP_EVENTS ); } - std::vector compute_toplevel(const std::vector& rec) + const std::map& get_counters_not_found() const { return m_counters_not_found; } + + const char* get_counters() const + { + if (m_level == All) { + return m_all_counters; + } else { + return m_top_counters; + } + } + + IntelTopdownLevel get_level() const { return m_level; } +}; + +class HaswellTopdown +{ +public: + + HaswellTopdown(IntelTopdownLevel level) + : TopdownCalculator(level), + m_top_counters( + "CPU_CLK_THREAD_UNHALTED:THREAD_P" + ",IDQ_UOPS_NOT_DELIVERED:CORE" + ",INT_MISC:RECOVERY_CYCLES" + ",UOPS_ISSUED:ANY" + ",UOPS_RETIRED:RETIRE_SLOTS" + ), + m_all_counters( + "BR_MISP_RETIRED:ALL_BRANCHES" + ",CPU_CLK_THREAD_UNHALTED:THREAD_P" + ",CYCLE_ACTIVITY:CYCLES_NO_EXECUTE" + ",CYCLE_ACTIVITY:STALLS_L1D_PENDING" + ",CYCLE_ACTIVITY:STALLS_L2_PENDING" + ",CYCLE_ACTIVITY:STALLS_LDM_PENDING" + ",IDQ_UOPS_NOT_DELIVERED:CORE" + ",IDQ_UOPS_NOT_DELIVERED:CYCLES_0_UOPS_DELIV_CORE" + ",INT_MISC:RECOVERY_CYCLES" + ",MACHINE_CLEARS:COUNT" + ",MEM_LOAD_UOPS_RETIRED:L3_HIT" + ",MEM_LOAD_UOPS_RETIRED:L3_MISS" + ",UOPS_EXECUTED:CORE_CYCLES_GE_1" + ",UOPS_EXECUTED:CORE_CYCLES_GE_2" + ",UOPS_ISSUED:ANY" + ",UOPS_RETIRED:RETIRE_SLOTS" + ), + m_res_top({ "retiring", "backend_bound", "frontend_bound", "bad_speculation", nullptr }), + m_res_all({ "retiring", + "backend_bound", + "frontend_bound", + "bad_speculation", + "branch_mispredict", + "machine_clears", + "frontend_latency", + "frontend_bandwidth", + "memory_bound", + "core_bound", + "ext_mem_bound", + "l1_bound", + "l2_bound", + "l3_bound", + nullptr }) + {} + + virtual std::vector compute_toplevel(const std::vector& rec) override { std::vector ret; @@ -139,15 +215,21 @@ class IntelTopdown double 
backend_bound = 1.0 - (retiring + bad_speculation + frontend_bound); ret.reserve(4); - ret.push_back(Entry(result_attrs["retiring"], Variant(std::max(retiring, 0.0)))); - ret.push_back(Entry(result_attrs["backend_bound"], Variant(std::max(backend_bound, 0.0)))); - ret.push_back(Entry(result_attrs["frontend_bound"], Variant(std::max(frontend_bound, 0.0)))); - ret.push_back(Entry(result_attrs["bad_speculation"], Variant(std::max(bad_speculation, 0.0)))); + ret.push_back(Entry(m_result_attrs["retiring"], Variant(std::max(retiring, 0.0)))); + ret.push_back(Entry(m_result_attrs["backend_bound"], Variant(std::max(backend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["frontend_bound"], Variant(std::max(frontend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["bad_speculation"], Variant(std::max(bad_speculation, 0.0)))); return ret; } - std::vector compute_backend_bound(const std::vector& rec) + virtual std::size_t get_num_expected_toplevel() const override { return 4; } + + virtual std::vector compute_retiring(const std::vector& rec) override { return {}; } + + virtual std::size_t get_num_expected_retiring() const override { return 0; } + + virtual std::vector compute_backend_bound(const std::vector& rec) override { std::vector ret; @@ -203,7 +285,9 @@ class IntelTopdown return ret; } - std::vector compute_frontend_bound(const std::vector& rec) + virtual std::size_t get_num_expected_backend_bound() const override { return 6; } + + virtual std::vector compute_frontend_bound(const std::vector& rec) override { std::vector ret; @@ -227,7 +311,9 @@ class IntelTopdown return ret; } - std::vector compute_bad_speculation(const std::vector& rec) + virtual std::size_t get_num_expected_frontend_bound() const override { return 2; } + + virtual std::vector compute_bad_speculation(const std::vector& rec) override { std::vector ret; @@ -251,11 +337,285 @@ class IntelTopdown return ret; } + virtual std::size_t get_num_expected_bad_speculation() const override { return 2; } 
+}; + +class SapphireRapidsTopdown +{ +public: + + SapphireRapidsTopdown(IntelTopdownLevel level) + : TopdownCalculator(level), + m_top_counters( + "perf::slots" + ",perf::topdown-retiring" + ",perf::topdown-bad-spec" + ",perf::topdown-fe-bound" + ",perf::topdown-be-bound" + ",INT_MISC:UOP_DROPPING" + ), + m_all_counters( + "perf::slots" + ",perf::topdown-retiring" + ",perf::topdown-bad-spec" + ",perf::topdown-fe-bound" + ",perf::topdown-be-bound" + ",INT_MISC:UOP_DROPPING" + ",perf_raw::r8400" // topdown-heavy-ops + ",perf_raw::r8500" // topdown-br-mispredict + ",perf_raw::r8600" // topdown-fetch-lat + ",perf_raw::r8700" + ), // topdown-mem-bound + m_res_top({ "retiring", "backend_bound", "frontend_bound", "bad_speculation", nullptr }), + m_res_all({ "retiring", + "backend_bound", + "frontend_bound", + "bad_speculation", + "branch_mispredict", + "machine_clears", + "frontend_latency", + "frontend_bandwidth", + "memory_bound", + "core_bound", + "light_ops", + "heavy_ops", + nullptr }) + {} + + virtual std::vector compute_toplevel(const std::vector& rec) override + { + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_int_misc_uop_dropping = get_val_from_rec(rec, "INT_MSC:UOP_DROPPING"); + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_fe_bound.empty() || v_be_bound.emtpy() || v_bad_spec.empty() || v_retiring.empty() + || v_int_misc_uop_dropping.empty() || v_slots_or_info_thread_slots.empty(); + // Check if all Variants are greater than 0 when casted to doubles (use + // .to_double()) + bool is_nonzero = v_fe_bound.to_double() > 0.0 && 
v_be_bound.to_double() > 0.0 && v_bad_spec.to_double() > 0.0 + && v_retiring.to_double() > 0.0 && v_int_misc_uop_dropping.to_double() > 0.0 + && v_slots_or_info_thread_slots.to_double() > 0.0; + + // Check if bad values were obtained + if (is_incomplete || !is_nonzero) + return ret; + + // Perform toplevel calcs + double toplevel_sum = + (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); + + double retiring = (v_retiring.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); + double frontend_bound = + (v_fe_bound.to_double() / toplevel_sum) - (v_int_misc_uop_dropping / v_slots_or_info_thread_slots); + double backend_bound = (v_be_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); + double bad_speculation = std::max(1.0 - (frontend_bound + backend_bound + retiring), 0.0); + + // Add toplevel metrics to vector of Entry + ret.reserve(4); + ret.push_back(Entry(m_result_attrs["retiring"], Variant(std::max(retiring, 0.0)))); + ret.push_back(Entry(m_result_attrs["backend_bound"], Variant(std::max(backend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["frontend_bound"], Variant(std::max(frontend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["bad_speculation"], Variant(std::max(bad_speculation, 0.0)))); + + return ret; + } + + virtual std::size_t get_num_expected_toplevel() const override { return 4; } + + virtual std::vector compute_retiring(const std::vector& rec) override + { + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_heavy_ops = get_val_from_rec(rec, "perf_raw::r8400"); + + // 
Check if any Variant is empty (use .empty()) + bool is_incomplete = v_fe_bound.empty() || v_be_bound.emtpy() || v_bad_spec.empty() || v_retiring.empty() + || v_slots_or_info_thread_slots.empty() || v_heavy_ops.empty(); + + // Check if bad values were obtained + if (is_incomplete) + return ret; + + double toplevel_sum = + (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); + // Copied from compute_toplevel + double retiring = (v_retiring.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); + + double heavy_ops = (v_heavy_ops.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); + double light_ops = std::max(0.0, retiring - heavy_ops); + + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["heavy_ops"], Variant(std::max(heavy_ops, 0.0)))); + ret.push_back(Entry(m_result_attrs["light_ops"], Variant(std::max(light_ops, 0.0)))); + + return ret; + } + + virtual std::size_t get_num_expected_retiring() const override { return 2; } + + virtual std::vector compute_backend_bound(const std::vector& rec) override + { + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_memory_bound = get_val_from_rec(rec, "perf_raw::r8700"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_fe_bound.empty() || v_be_bound.emtpy() || v_bad_spec.empty() || v_retiring.empty() + || v_slots_or_info_thread_slots.empty() || v_memory_bound.empty(); + + // Check if bad values were obtained + if (is_incomplete) + return ret; + + double toplevel_sum = + 
(v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); + // Copied from compute_toplevel + double backend_bound = (v_be_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); + + double memory_bound = (v_memory_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); + double core_bound = std::max(0.0, backend_bound - memory_bound); + + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["backend_bound"], Variant(std::max(backend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["memory_bound"], Variant(std::max(memory_bound, 0.0)))); + + return ret; + } + + virtual std::size_t get_num_expected_backend_bound() const override { return 2; } + + virtual std::vector compute_frontend_bound(const std::vector& rec) override + { + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_int_misc_uop_dropping = get_val_from_rec(rec, "INT_MSC:UOP_DROPPING"); + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_fetch_latency = get_val_from_rec(rec, "perf_raw::r8600"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_fe_bound.empty() || v_be_bound.emtpy() || v_bad_spec.empty() || v_retiring.empty() + || v_int_misc_uop_dropping.empty() || v_slots_or_info_thread_slots.empty() + || v_memory_bound.empty(); + + // Check if bad values were obtained + if (is_incomplete) + return ret; + + double toplevel_sum = + (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); + // Copied from compute_toplevel + double frontend_bound = + 
(v_fe_bound.to_double() / toplevel_sum) - (v_int_misc_uop_dropping / v_slots_or_info_thread_slots); + + double fetch_latency = + (v_fetch_latency.to_double() / toplevel_sum) - (v_int_misc_uop_dropping * v_slots_or_info_thread_slots); + double fetch_bandwidth = std::max(0.0, frontend_bound - fetch_latency); + + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["frontend_latency"], Variant(std::max(fetch_latency, 0.0)))); + ret.push_back(Entry(m_result_attrs["frontend_bandwidth"], Variant(std::max(fetch_bandwidth, 0.0)))); + + return ret; + } + + virtual std::size_t get_num_expected_frontend_bound() const override { return 2; } + + virtual std::vector compute_bad_speculation(const std::vector& rec) override + { + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_int_misc_uop_dropping = get_val_from_rec(rec, "INT_MSC:UOP_DROPPING"); + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_branch_mispredict = get_val_from_rec(rec, "perf_raw::r8500"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_fe_bound.empty() || v_be_bound.emtpy() || v_bad_spec.empty() || v_retiring.empty() + || v_int_misc_uop_dropping.empty() || v_slots_or_info_thread_slots.empty() + || v_branch_mispredict.empty(); + + // Check if bad values were obtained + if (is_incomplete) + return ret; + + // Perform toplevel calcs + double toplevel_sum = + (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); + + double retiring = (v_retiring.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); + double frontend_bound = + 
(v_fe_bound.to_double() / toplevel_sum) - (v_int_misc_uop_dropping / v_slots_or_info_thread_slots); + double backend_bound = (v_be_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); + double bad_speculation = std::max(1.0 - (frontend_bound + backend_bound + retiring), 0.0); + + double branch_mispredict = + (v_branch_mispredict.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); + double machine_clears = std::max(0.0, bad_speculation - branch_mispredict); + + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["branch_mispredict"], Variant(std::max(branch_mispredict, 0.0)))); + ret.push_back(Entry(m_result_attrs["machine_clears"], Variant(std::max(machine_clears, 0.0)))); + + return ret; + } + + virtual std::size_t get_num_expected_bad_speculation() const override { return 2; } +}; + +class IntelTopdown +{ + unsigned num_top_computed; + unsigned num_top_skipped; + unsigned num_be_computed; + unsigned num_be_skipped; + unsigned num_fe_computed; + unsigned num_fe_skipped; + unsigned num_bsp_computed; + unsigned num_bsp_skipped; + unsigned num_ret_computed; + unsigned num_ret_skipped; + + IntelTopdownLevel level; + + TopdownCalculator* m_calculator; + void postprocess_snapshot_cb(std::vector& rec) { - std::vector result = compute_toplevel(rec); + std::vector result = m_calculator->compute_toplevel(rec); - if (result.empty()) + if (result.size() != m_calculator->get_num_expected_toplevel()) ++num_top_skipped; else { rec.insert(rec.end(), result.begin(), result.end()); @@ -263,144 +623,141 @@ class IntelTopdown } if (level == All) { - result = compute_backend_bound(rec); + result = m_calculator->compute_backend_bound(rec); - if (result.empty()) + if (result.size() != m_calculator->get_num_expected_backend_bound()) ++num_be_skipped; else { rec.insert(rec.end(), result.begin(), result.end()); ++num_be_computed; } - result = compute_frontend_bound(rec); + result = 
m_calculator->compute_frontend_bound(rec); - if (result.empty()) + if (result.size() != m_calculator->get_num_expected_frontend_bound()) ++num_fe_skipped; else { rec.insert(rec.end(), result.begin(), result.end()); ++num_fe_computed; } - result = compute_bad_speculation(rec); + result = m_calculator->compute_bad_speculation(rec); - if (result.empty()) + if (result.size() != m_calculator->get_num_expected_bad_speculation()) ++num_bsp_skipped; else { rec.insert(rec.end(), result.begin(), result.end()); ++num_bsp_computed; } + + result = m_calculator->compute_retiring(rec); + + if (result.size() != m_calculator->get_num_expected_retiring()) + ++num_ret_skipped; + else { + rec.insert(rec.end(), result.begin(), result.end()); + ++num_ret_computed; + } } - } - void finish_cb(Caliper* c, Channel* channel) - { - Log(1).stream() << channel->name() << ": topdown: Computed topdown metrics for " << num_top_computed - << " records, skipped " << num_top_skipped << std::endl; - - if (Log::verbosity() >= 2) { - Log(2).stream() << channel->name() << ": topdown: Records processed per topdown level: " - << "\n top: " << num_top_computed << " computed, " << num_top_skipped << " skipped," - << "\n bad spec: " << num_bsp_computed << " computed, " << num_bsp_skipped << " skipped," - << "\n frontend: " << num_bsp_computed << " computed, " << num_bsp_skipped << " skipped," - << "\n backend: " << num_bsp_computed << " computed, " << num_bsp_skipped << " skipped." 
- << std::endl; - - if (!counters_not_found.empty()) { - std::ostringstream os; - for (auto& p : counters_not_found) - os << "\n " << p.first << ": " << p.second; - Log(2).stream() << channel->name() << ": topdown: Counters not found:" << os.str() << std::endl; + void finish_cb(Caliper * c, Channel * channel) + { + Log(1).stream() << channel->name() << ": topdown: Computed topdown metrics for " << num_top_computed + << " records, skipped " << num_top_skipped << std::endl; + + if (Log::verbosity() >= 2) { + Log(2).stream() << channel->name() << ": topdown: Records processed per topdown level: " + << "\n top: " << num_top_computed << " computed, " << num_top_skipped + << " skipped," + << "\n bad spec: " << num_bsp_computed << " computed, " << num_bsp_skipped + << " skipped," + << "\n frontend: " << num_bsp_computed << " computed, " << num_bsp_skipped + << " skipped," + << "\n backend: " << num_bsp_computed << " computed, " << num_bsp_skipped + << " skipped." << std::endl; + + const std::map& counters_not_found = m_calculator->get_counters_not_found(); + + if (!counters_not_found.empty()) { + std::ostringstream os; + for (auto& p : counters_not_found) + os << "\n " << p.first << ": " << p.second; + Log(2).stream() << channel->name() << ": topdown: Counters not found:" << os.str() << std::endl; + } } } - } - explicit IntelTopdown(Level lvl) - : num_top_computed(0), - num_top_skipped(0), - num_be_computed(0), - num_be_skipped(0), - num_fe_computed(0), - num_fe_skipped(0), - num_bsp_computed(0), - num_bsp_skipped(0), - level(lvl) - {} + explicit IntelTopdown(TopdownCalculator * calculator) + : num_top_computed(0), + num_top_skipped(0), + num_be_computed(0), + num_be_skipped(0), + num_fe_computed(0), + num_fe_skipped(0), + num_bsp_computed(0), + num_bsp_skipped(0), + level(calculator->get_level()), + m_calculator(calculator) + {} + + ~IntelTopdown() + { + if (m_calculator != nullptr) { + delete m_calculator; + } + } -public: + public: - static const char* s_spec; + static 
const char* s_spec; - static void intel_topdown_register(Caliper* c, Channel* channel) - { - Level level = Top; - const char* counters = s_top_counters; - - auto config = services::init_config_from_spec(channel->config(), s_spec); - std::string lvlcfg = config.get("level").to_string(); - - if (lvlcfg == "all") { - level = All; - counters = s_all_counters; - } else if (lvlcfg != "top") { - Log(0).stream() << channel->name() << ": topdown: Unknown level \"" << lvlcfg << "\", skipping topdown" - << std::endl; - return; - } + static void intel_topdown_register(Caliper * c, Channel * channel) + { + Level level = Top; - channel->config().set("CALI_PAPI_COUNTERS", counters); + auto config = services::init_config_from_spec(channel->config(), s_spec); + std::string lvlcfg = config.get("level").to_string(); - if (!cali::services::register_service(c, channel, "papi")) { - Log(0).stream() << channel->name() << ": topdown: Unable to register papi service, skipping topdown" - << std::endl; - return; - } + if (lvlcfg == "all") { + level = All; + } else if (lvlcfg != "top") { + Log(0).stream() << channel->name() << ": topdown: Unknown level \"" << lvlcfg << "\", skipping topdown" + << std::endl; + return; + } - IntelTopdown* instance = new IntelTopdown(level); - - channel->events().pre_flush_evt.connect([instance](Caliper* c, Channel* channel, SnapshotView) { - if (instance->find_counter_attrs(*c)) - instance->make_result_attrs(*c); - else - Log(0).stream() << channel->name() << ": topdown: Could not find counter attributes!" << std::endl; - }); - channel->events().postprocess_snapshot.connect([instance](Caliper*, Channel*, std::vector& rec) { - instance->postprocess_snapshot_cb(rec); - }); - channel->events().finish_evt.connect([instance](Caliper* c, Channel* channel) { - instance->finish_cb(c, channel); - delete instance; - }); - - Log(1).stream() << channel->name() << ": Registered topdown service. Level: " << lvlcfg << "." 
<< std::endl; - } -}; + // TODO Add logic to select correct TopdownCalculator + TopdownCalculator* calculator = new HaswellTopdown(level); + + channel->config().set("CALI_PAPI_COUNTERS", calculator->get_counters()); + + if (!cali::services::register_service(c, channel, "papi")) { + Log(0).stream() << channel->name() << ": topdown: Unable to register papi service, skipping topdown" + << std::endl; + return; + } -const char* IntelTopdown::s_top_counters = - "CPU_CLK_THREAD_UNHALTED:THREAD_P" - ",IDQ_UOPS_NOT_DELIVERED:CORE" - ",INT_MISC:RECOVERY_CYCLES" - ",UOPS_ISSUED:ANY" - ",UOPS_RETIRED:RETIRE_SLOTS"; - -const char* IntelTopdown::s_all_counters = - "BR_MISP_RETIRED:ALL_BRANCHES" - ",CPU_CLK_THREAD_UNHALTED:THREAD_P" - ",CYCLE_ACTIVITY:CYCLES_NO_EXECUTE" - ",CYCLE_ACTIVITY:STALLS_L1D_PENDING" - ",CYCLE_ACTIVITY:STALLS_L2_PENDING" - ",CYCLE_ACTIVITY:STALLS_LDM_PENDING" - ",IDQ_UOPS_NOT_DELIVERED:CORE" - ",IDQ_UOPS_NOT_DELIVERED:CYCLES_0_UOPS_DELIV_CORE" - ",INT_MISC:RECOVERY_CYCLES" - ",MACHINE_CLEARS:COUNT" - ",MEM_LOAD_UOPS_RETIRED:L3_HIT" - ",MEM_LOAD_UOPS_RETIRED:L3_MISS" - ",UOPS_EXECUTED:CORE_CYCLES_GE_1" - ",UOPS_EXECUTED:CORE_CYCLES_GE_2" - ",UOPS_ISSUED:ANY" - ",UOPS_RETIRED:RETIRE_SLOTS"; - -const char* IntelTopdown::s_spec = R"json( + IntelTopdown* instance = new IntelTopdown(calculator); + + channel->events().pre_flush_evt.connect([instance](Caliper* c, Channel* channel, SnapshotView) { + if (instance->find_counter_attrs(*c)) + instance->make_result_attrs(*c); + else + Log(0).stream() << channel->name() << ": topdown: Could not find counter attributes!" << std::endl; + }); + channel->events().postprocess_snapshot.connect([instance](Caliper*, Channel*, std::vector& rec) { + instance->postprocess_snapshot_cb(rec); + }); + channel->events().finish_evt.connect([instance](Caliper* c, Channel* channel) { + instance->finish_cb(c, channel); + delete instance; + }); + + Log(1).stream() << channel->name() << ": Registered topdown service. 
Level: " << lvlcfg << "." << std::endl; + } + }; + + const char* IntelTopdown::s_spec = R"json( { "name": "topdown", "description": "Record PAPI counters and compute top-down analysis for Intel CPUs", "config": [ @@ -416,8 +773,8 @@ const char* IntelTopdown::s_spec = R"json( } // namespace namespace cali -{ -CaliperService topdown_service { ::IntelTopdown::s_spec, ::IntelTopdown::intel_topdown_register }; +{ + CaliperService topdown_service { ::IntelTopdown::s_spec, ::IntelTopdown::intel_topdown_register }; } From cabc803f2128f9853f68f2dc1157afe500a1e0ca Mon Sep 17 00:00:00 2001 From: Ian Lumsden Date: Sat, 20 Jul 2024 17:49:54 -0700 Subject: [PATCH 02/11] Adds infrastructure to update builtin_option_specs based on features like architecture support --- src/caliper/ConfigManager.cpp | 34 ++++++++++++------------- src/caliper/controllers/controllers.cpp | 1 + 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/caliper/ConfigManager.cpp b/src/caliper/ConfigManager.cpp index 97340782..98944ab2 100644 --- a/src/caliper/ConfigManager.cpp +++ b/src/caliper/ConfigManager.cpp @@ -7,8 +7,8 @@ #include "caliper/common/Log.h" -#include "../src/common/util/parse_util.h" #include "../src/common/util/format_util.h" +#include "../src/common/util/parse_util.h" #include "../services/Services.h" @@ -36,46 +36,43 @@ extern const char* builtin_umpire_option_specs; extern const char* builtin_papi_option_specs; extern const char* builtin_kokkos_option_specs; -extern void add_submodule_controllers_and_services(); +extern const char* get_builtin_option_specs(); +extern void add_submodule_controllers_and_services(); } // namespace cali namespace { -const char* builtin_option_specs_list[] = -{ - builtin_base_option_specs, +const char* builtin_option_specs_list[] = { builtin_base_option_specs, #ifdef CALIPER_HAVE_GOTCHA - builtin_gotcha_option_specs, + builtin_gotcha_option_specs, #endif #ifdef CALIPER_HAVE_MPI - builtin_mpi_option_specs, + builtin_mpi_option_specs, #endif 
#ifdef CALIPER_HAVE_OMPT - builtin_openmp_option_specs, + builtin_openmp_option_specs, #endif #ifdef CALIPER_HAVE_CUPTI - builtin_cuda_option_specs, + builtin_cuda_option_specs, #endif #if defined(CALIPER_HAVE_ROCTRACER) || defined(CALIPER_HAVE_ROCPROFILER) - builtin_rocm_option_specs, + builtin_rocm_option_specs, #endif #ifdef CALIPER_HAVE_LIBDW - builtin_libdw_option_specs, + builtin_libdw_option_specs, #endif #ifdef CALIPER_HAVE_PAPI - builtin_papi_option_specs, + builtin_papi_option_specs, #endif #ifdef CALIPER_HAVE_PCP - builtin_pcp_option_specs, + builtin_pcp_option_specs, #endif #ifdef CALIPER_HAVE_UMPIRE - builtin_umpire_option_specs, + builtin_umpire_option_specs, #endif - builtin_kokkos_option_specs, - nullptr -}; + builtin_kokkos_option_specs, nullptr }; ChannelController* make_basic_channel_controller( const char* name, @@ -139,7 +136,8 @@ ConfigManager::arglist_t merge_new_elements(ConfigManager::arglist_t& to, const return p.first == v.first; }); - if (it == to.end() || p.first == "metadata") // hacky but we want to allow multiple entries for metadata + if (it == to.end() || p.first == "metadata") // hacky but we want to allow multiple entries + // for metadata to.push_back(p); } diff --git a/src/caliper/controllers/controllers.cpp b/src/caliper/controllers/controllers.cpp index 696b519d..929c0402 100644 --- a/src/caliper/controllers/controllers.cpp +++ b/src/caliper/controllers/controllers.cpp @@ -4,6 +4,7 @@ #include "caliper/caliper-config.h" #include "caliper/ConfigManager.h" +#include namespace { From c367be3f1f54aaea58ddeeeadf745c4e60f0473e Mon Sep 17 00:00:00 2001 From: Ian Lumsden Date: Sat, 20 Jul 2024 18:26:06 -0700 Subject: [PATCH 03/11] Adds conditional behavior to topdown service and builtin option specs based on architecture specified at configure time --- CMakeLists.txt | 7 + caliper-config.h.in | 1 + src/caliper/ConfigManager.cpp | 82 +++-- src/caliper/controllers/controllers.cpp | 172 ++++++++- src/services/papi/Papi.cpp | 32 +- 
src/services/topdown/IntelTopdown.cpp | 461 +++++++++++++----------- 6 files changed, 493 insertions(+), 262 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 528eb305..496d4c2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,6 +83,12 @@ add_caliper_option(WITH_UMPIRE "Enable Umpire statistics support" FALSE) add_caliper_option(WITH_CRAYPAT "Enable CrayPAT region forwarding support" FALSE) add_caliper_option(WITH_LDMS "Enable LDMS forwarder" FALSE) +set(WITH_ARCH "" CACHE STRING "Enable features specific to the provided archspec CPU architecture name") +if (NOT WITH_ARCH STREQUAL "") + string(TOLOWER ${WITH_ARCH} LOWER_WITH_ARCH) + set(CALIPER_HAVE_ARCH "${LOWER_WITH_ARCH}") +endif () + add_caliper_option(USE_EXTERNAL_GOTCHA "Use pre-installed gotcha instead of building our own" FALSE) add_caliper_option(ENABLE_HISTOGRAMS "Enable histogram aggregation (experimental)" FALSE) @@ -633,6 +639,7 @@ message(STATUS "Build type : ${CMAKE_BUILD_TYPE}") message(STATUS "Compiler : ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION} (${CMAKE_CXX_COMPILER})") message(STATUS "Python interpreter : ${Python_EXECUTABLE}") message(STATUS "System : ${CMAKE_SYSTEM} (${CMAKE_SYSTEM_PROCESSOR})") +message(STATUS "Architecture : ${CALIPER_HAVE_ARCH}") message(STATUS "Install dir : ${CMAKE_INSTALL_PREFIX}") message(STATUS "Build shared libs : ${BUILD_SHARED_LIBS}") message(STATUS "Build Caliper tools : ${WITH_TOOLS}") diff --git a/caliper-config.h.in b/caliper-config.h.in index 0ead0425..d5602e97 100644 --- a/caliper-config.h.in +++ b/caliper-config.h.in @@ -26,6 +26,7 @@ #cmakedefine CALIPER_HAVE_UMPIRE #cmakedefine CALIPER_HAVE_CRAYPAT #cmakedefine CALIPER_HAVE_LDMS +#cmakedefine CALIPER_HAVE_ARCH "@CALIPER_HAVE_ARCH@" #cmakedefine CALIPER_REDUCED_CONSTEXPR_USAGE diff --git a/src/caliper/ConfigManager.cpp b/src/caliper/ConfigManager.cpp index 98944ab2..3d288ac1 100644 --- a/src/caliper/ConfigManager.cpp +++ b/src/caliper/ConfigManager.cpp @@ -33,47 +33,18 @@ 
extern const char* builtin_cuda_option_specs; extern const char* builtin_rocm_option_specs; extern const char* builtin_pcp_option_specs; extern const char* builtin_umpire_option_specs; -extern const char* builtin_papi_option_specs; extern const char* builtin_kokkos_option_specs; -extern const char* get_builtin_option_specs(); -extern void add_submodule_controllers_and_services(); +extern const char* builtin_papi_hsw_option_specs; +extern const char* builtin_papi_spr_option_specs; + +extern void add_submodule_controllers_and_services(); } // namespace cali namespace { -const char* builtin_option_specs_list[] = { builtin_base_option_specs, -#ifdef CALIPER_HAVE_GOTCHA - builtin_gotcha_option_specs, -#endif -#ifdef CALIPER_HAVE_MPI - builtin_mpi_option_specs, -#endif -#ifdef CALIPER_HAVE_OMPT - builtin_openmp_option_specs, -#endif -#ifdef CALIPER_HAVE_CUPTI - builtin_cuda_option_specs, -#endif -#if defined(CALIPER_HAVE_ROCTRACER) || defined(CALIPER_HAVE_ROCPROFILER) - builtin_rocm_option_specs, -#endif -#ifdef CALIPER_HAVE_LIBDW - builtin_libdw_option_specs, -#endif -#ifdef CALIPER_HAVE_PAPI - builtin_papi_option_specs, -#endif -#ifdef CALIPER_HAVE_PCP - builtin_pcp_option_specs, -#endif -#ifdef CALIPER_HAVE_UMPIRE - builtin_umpire_option_specs, -#endif - builtin_kokkos_option_specs, nullptr }; - ChannelController* make_basic_channel_controller( const char* name, const config_map_t& initial_cfg, @@ -901,6 +872,8 @@ struct ConfigManager::ConfigManagerImpl { bool m_error = false; std::string m_error_msg = ""; + std::vector builtin_option_specs_list; + std::map m_default_parameters_for_spec; arglist_t m_default_parameters; @@ -1404,9 +1377,48 @@ struct ConfigManager::ConfigManagerImpl { } ConfigManagerImpl() + : builtin_option_specs_list({ + builtin_base_option_specs, +#ifdef CALIPER_HAVE_GOTCHA + builtin_gotcha_option_specs, +#endif +#ifdef CALIPER_HAVE_MPI + builtin_mpi_option_specs, +#endif +#ifdef CALIPER_HAVE_OMPT + builtin_openmp_option_specs, +#endif +#ifdef 
CALIPER_HAVE_CUPTI + builtin_cuda_option_specs, +#endif +#if defined(CALIPER_HAVE_ROCTRACER) || defined(CALIPER_HAVE_ROCPROFILER) + builtin_rocm_option_specs, +#endif +#ifdef CALIPER_HAVE_LIBDW + builtin_libdw_option_specs, +#endif +#ifdef CALIPER_HAVE_PCP + builtin_pcp_option_specs, +#endif +#ifdef CALIPER_HAVE_UMPIRE + builtin_umpire_option_specs, +#endif + builtin_kokkos_option_specs + }) { - for (const char** spec_p = builtin_option_specs_list; *spec_p; ++spec_p) - add_global_option_specs(*spec_p); +#ifdef CALIPER_HAVE_PAPI +#ifdef CALIPER_HAVE_ARCH + if (CALIPER_HAVE_ARCH == "sapphirerapids") { + builtin_option_specs_list.push_back(builtin_papi_spr_option_specs); + } else { + builtin_option_specs_list.push_back(builtin_papi_hsw_option_specs); + } +#else + builtin_option_specs_list.push_back(builtin_papi_hsw_option_specs); +#endif +#endif + for (const char* spec_p : builtin_option_specs_list) + add_global_option_specs(spec_p); } }; diff --git a/src/caliper/controllers/controllers.cpp b/src/caliper/controllers/controllers.cpp index 929c0402..5e70da15 100644 --- a/src/caliper/controllers/controllers.cpp +++ b/src/caliper/controllers/controllers.cpp @@ -207,6 +207,25 @@ const ConfigManager::ConfigInfo* builtin_controllers_table[] = { &cuda_activity_ &spot_controller_info, nullptr }; +// Compile-time string comparison +// Based on code from: +// https://gist.github.com/ac1dloop/4f7109e8856e5d28e769134bca7d6d7d +constexpr bool const_strcmp(const char* a, const char* b) +{ + // Iterate until one of the strings hits its NULL terminator + for (; *a || *b;) { + // Check if the current characters in the strings are equal + // If not equal, return false + // If equal, progress to the next character in the strings + if (*a++ != *b++) { + return false; + } + } + // If we reach here, every character from the strings were equal, + // so we return true + return true; +} + const char* builtin_base_option_specs = R"json( [ { @@ -1062,7 +1081,7 @@ const char* 
builtin_pcp_option_specs = R"json( ] )json"; -const char* builtin_papi_option_specs = R"json( +const char* builtin_papi_hsw_option_specs = R"json( [ { "name" : "topdown.toplevel", @@ -1141,6 +1160,157 @@ const char* builtin_papi_option_specs = R"json( ] )json"; +const char* builtin_papi_spr_option_specs = R"json( + { + "name" : "topdown.toplevel", + "description" : "Top-down analysis for Intel CPUs (top level)", + "type" : "bool", + "category" : "metric", + "services" : [ "topdown" ], + "config" : { "CALI_TOPDOWN_LEVEL": "top" }, + "query" : + [ + { "level": "local", "select": + [ + "any(topdown.retiring) as \"Retiring\"", + "any(topdown.backend_bound) as \"Backend bound\"", + "any(topdown.frontend_bound) as \"Frontend bound\"", + "any(topdown.bad_speculation) as \"Bad speculation\"" + ] + }, + { "level": "cross", "select": + [ + "any(any#topdown.retiring) as \"Retiring\"", + "any(any#topdown.backend_bound) as \"Backend bound\"", + "any(any#topdown.frontend_bound) as \"Frontend bound\"", + "any(any#topdown.bad_speculation) as \"Bad speculation\"" + ] + } + ] + }, + { + "name" : "topdown.all", + "description" : "Top-down analysis for Intel CPUs (all levels)", + "type" : "bool", + "category" : "metric", + "services" : [ "topdown" ], + "config" : { "CALI_TOPDOWN_LEVEL": "all" }, + "query" : + [ + { "level": "local", "select": + [ + "any(topdown.retiring) as \"Retiring\"", + "any(topdown.backend_bound) as \"Backend bound\"", + "any(topdown.frontend_bound) as \"Frontend bound\"", + "any(topdown.bad_speculation) as \"Bad speculation\"", + "any(topdown.branch_mispredict) as \"Branch mispredict\"", + "any(topdown.machine_clears) as \"Machine clears\"", + "any(topdown.frontend_latency) as \"Frontend latency\"", + "any(topdown.frontend_bandwidth) as \"Frontend bandwidth\"", + "any(topdown.memory_bound) as \"Memory bound\"", + "any(topdown.core_bound) as \"Core bound\"", + "any(topdown.light_ops) as \"Light operations\"", + "any(topdown.heavy_ops) as \"Heavy operations\"" + ] 
+ }, + { "level": "cross", "select": + [ + "any(any#topdown.retiring) as \"Retiring\"", + "any(any#topdown.backend_bound) as \"Backend bound\"", + "any(any#topdown.frontend_bound) as \"Frontend bound\"", + "any(any#topdown.bad_speculation) as \"Bad speculation\"", + "any(any#topdown.branch_mispredict) as \"Branch mispredict\"", + "any(any#topdown.machine_clears) as \"Machine clears\"", + "any(any#topdown.frontend_latency) as \"Frontend latency\"", + "any(any#topdown.frontend_bandwidth) as \"Frontend bandwidth\"", + "any(any#topdown.memory_bound) as \"Memory bound\"", + "any(any#topdown.core_bound) as \"Core bound\"", + "any(any#topdown.light_ops) as \"Light operations\"", + "any(any#topdown.heavy_ops) as \"Heavy operations\"" + ] + } + ] + }, + { + "name" : "topdown-counters.toplevel", + "description" : "Raw counter values for Intel top-down analysis (top level)", + "type" : "bool", + "category" : "metric", + "services" : [ "papi" ], + "config" : + { + "CALI_PAPI_COUNTERS": + "perf::slots,perf::topdown-retiring,perf::topdown-bad-spec,perf::topdown-fe-bound,perf::topdown-be-bound,INT_MISC:UOP_DROPPING" + }, + "query" : + [ + { "level": "local", "select": + [ + "inclusive_sum(sum#papi.slots) as slots", + "inclusive_sum(sum#papi.perf::topdown-retiring) as topdown_retiring", + "inclusive_sum(sum#papi.perf::topdown-bad-spec) as topdown_bad_spec", + "inclusive_sum(sum#papi.perf::topdown-fe-bound) as topdown_fe_bound", + "inclusive_sum(sum#papi.perf::topdown-be-bound) as topdown_be_bound", + "inclusive_sum(sum#papi.INT_MISC:UOP_DROPPING) as int_mist:uop_dropping" + ] + }, + { "level": "cross", "select": + [ + "sum(inclusive#sum#papi.slots) as slots", + "sum(inclusive#sum#papi.perf::topdown-retiring) as topdown_retiring", + "sum(inclusive#sum#papi.perf::topdown-bad-spec) as topdown_bad_spec", + "sum(inclusive#sum#papi.perf::topdown-fe-bound) as topdown_fe_bound", + "sum(inclusive#sum#papi.perf::topdown-be-bound) as topdown_be_bound", + 
"sum(inclusive#sum#papi.INT_MISC:UOP_DROPPING) as int_mist:uop_dropping" + ] + } + ] + }, + { + "name" : "topdown-counters.all", + "description" : "Raw counter values for Intel top-down analysis (all levels)", + "type" : "bool", + "category" : "metric", + "services" : [ "papi" ], + "config" : + { + "CALI_PAPI_COUNTERS": + "perf::slots,perf::topdown-retiring,perf::topdown-bad-spec,perf::topdown-fe-bound,perf::topdown-be-bound,INT_MISC:UOP_DROPPING,perf_raw::r8400,perf_raw::r8500,perf_raw::r8600,perf_raw::r8700" + }, + "query" : + [ + { "level": "local", "select": + [ + "inclusive_sum(sum#papi.slots) as slots", + "inclusive_sum(sum#papi.perf::topdown-retiring) as topdown_retiring", + "inclusive_sum(sum#papi.perf::topdown-bad-spec) as topdown_bad_spec", + "inclusive_sum(sum#papi.perf::topdown-fe-bound) as topdown_fe_bound", + "inclusive_sum(sum#papi.perf::topdown-be-bound) as topdown_be_bound", + "inclusive_sum(sum#papi.INT_MISC:UOP_DROPPING) as int_mist:uop_dropping", + "inclusive_sum(sum#papi.perf_raw::r8400) as topdown_heavy_ops", + "inclusive_sum(sum#papi.perf_raw::r8500) as topdown_br_mispredict", + "inclusive_sum(sum#papi.perf_raw::r8600) as topdown_fetch_lat", + "inclusive_sum(sum#papi.perf_raw::r8700) as topdown_mem_bound" + ] + }, + { "level": "cross", "select": + [ + "sum(inclusive#sum#papi.slots) as slots", + "sum(inclusive#sum#papi.perf::topdown-retiring) as topdown_retiring", + "sum(inclusive#sum#papi.perf::topdown-bad-spec) as topdown_bad_spec", + "sum(inclusive#sum#papi.perf::topdown-fe-bound) as topdown_fe_bound", + "sum(inclusive#sum#papi.perf::topdown-be-bound) as topdown_be_bound", + "sum(inclusive#sum#papi.INT_MISC:UOP_DROPPING) as int_mist:uop_dropping", + "sum(inclusive#sum#papi.perf_raw::r8400) as topdown_heavy_ops", + "sum(inclusive#sum#papi.perf_raw::r8500) as topdown_br_mispredict", + "sum(inclusive#sum#papi.perf_raw::r8600) as topdown_fetch_lat", + "sum(inclusive#sum#papi.perf_raw::r8700) as topdown_mem_bound" + ] + } + ] + } + )json"; + 
const char* builtin_kokkos_option_specs = R"json( [ { diff --git a/src/services/papi/Papi.cpp b/src/services/papi/Papi.cpp index d2ff3f01..ac3f0d6d 100644 --- a/src/services/papi/Papi.cpp +++ b/src/services/papi/Papi.cpp @@ -11,8 +11,8 @@ #include "caliper/Caliper.h" #include "caliper/SnapshotRecord.h" -#include "caliper/common/RuntimeConfig.h" #include "caliper/common/Log.h" +#include "caliper/common/RuntimeConfig.h" #include "../../common/util/spinlock.hpp" @@ -73,7 +73,8 @@ class PapiService unsigned m_num_failed_start; unsigned m_num_threads; - // PAPI component id -> event group info map for constructing the per-thread PAPI EventSets + // PAPI component id -> event group info map for constructing the per-thread + // PAPI EventSets eventset_map_t m_event_groups; ThreadInfo* m_thread_list; @@ -168,18 +169,21 @@ class PapiService int num = static_cast(p.second->codes.size()); - if (cpi && (num > 4 /* magic number for Intel counter support :-( */ || m_enable_multiplex)) { - if (Log::verbosity() >= 2) - Log(2).stream() << "papi: Initializing multiplex support for component " << p.first << " (" - << cpi->name << ")" << std::endl; - - ret = PAPI_assign_eventset_component(eventset, p.first); - if (ret != PAPI_OK) - print_papi_error("PAPI_assign_eventset_component", ret); - ret = PAPI_set_multiplex(eventset); - if (ret != PAPI_OK) - print_papi_error("PAPI_set_multiplex", ret); - } + // if (cpi && (num > 4 /* magic number for Intel counter support :-( */ || + // m_enable_multiplex)) { + // if (Log::verbosity() >= 2) + // Log(2).stream() << "papi: Initializing multiplex support for + // component " + // << p.first << " (" << cpi->name << ")" + // << std::endl; + + // ret = PAPI_assign_eventset_component(eventset, p.first); + // if (ret != PAPI_OK) + // print_papi_error("PAPI_assign_eventset_component", ret); + // ret = PAPI_set_multiplex(eventset); + // if (ret != PAPI_OK) + // print_papi_error("PAPI_set_multiplex", ret); + // } ret = PAPI_add_events(eventset, 
p.second->codes.data(), num); if (ret < 0) { diff --git a/src/services/topdown/IntelTopdown.cpp b/src/services/topdown/IntelTopdown.cpp index 1724f561..1087c10e 100644 --- a/src/services/topdown/IntelTopdown.cpp +++ b/src/services/topdown/IntelTopdown.cpp @@ -20,6 +20,7 @@ #include #include #include +#include using namespace cali; @@ -37,8 +38,8 @@ class TopdownCalculator const char* m_top_counters; const char* m_all_counters; - const char* s_res_top[]; - const char* s_res_all[]; + std::vector m_res_top; + std::vector m_res_all; std::map m_counter_attrs; std::map m_result_attrs; @@ -49,8 +50,8 @@ class TopdownCalculator { Variant ret; - auto c_it = counter_attrs.find(name); - if (c_it == counter_attrs.end()) + auto c_it = m_counter_attrs.find(name); + if (c_it == m_counter_attrs.end()) return ret; cali_id_t attr_id = c_it->second.id(); @@ -65,10 +66,26 @@ class TopdownCalculator return ret; } + TopdownCalculator( + IntelTopdownLevel level, + const char* top_counters, + const char* all_counters, + std::vector&& res_top, + std::vector&& res_all + ) + : m_level(level), + m_top_counters(top_counters), + m_all_counters(all_counters), + m_res_top(res_top), + m_res_all(res_all) + {} + public: TopdownCalculator(IntelTopdownLevel level) : m_level(level) {} + virtual ~TopdownCalculator() = default; + virtual std::vector compute_toplevel(const std::vector& rec) = 0; virtual std::size_t get_num_expected_toplevel() const = 0; @@ -112,14 +129,15 @@ class TopdownCalculator void make_result_attrs(CaliperMetadataAccessInterface& db) { - const char** res = (m_level == Top ? m_res_top : m_res_all); + std::vector& res = (m_level == Top ? 
m_res_top : m_res_all); - for (const char** s = res; s && *s; ++s) - m_result_attrs[std::string(*s)] = db.create_attribute( - std::string("topdown.") + (*s), + for (const char* s : res) { + m_result_attrs[std::string(s)] = db.create_attribute( + std::string("topdown.") + s, CALI_TYPE_DOUBLE, CALI_ATTR_ASVALUE | CALI_ATTR_SKIP_EVENTS ); + } } const std::map& get_counters_not_found() const { return m_counters_not_found; } @@ -136,55 +154,58 @@ class TopdownCalculator IntelTopdownLevel get_level() const { return m_level; } }; -class HaswellTopdown +class HaswellTopdown : public TopdownCalculator { public: HaswellTopdown(IntelTopdownLevel level) - : TopdownCalculator(level), - m_top_counters( - "CPU_CLK_THREAD_UNHALTED:THREAD_P" - ",IDQ_UOPS_NOT_DELIVERED:CORE" - ",INT_MISC:RECOVERY_CYCLES" - ",UOPS_ISSUED:ANY" - ",UOPS_RETIRED:RETIRE_SLOTS" - ), - m_all_counters( - "BR_MISP_RETIRED:ALL_BRANCHES" - ",CPU_CLK_THREAD_UNHALTED:THREAD_P" - ",CYCLE_ACTIVITY:CYCLES_NO_EXECUTE" - ",CYCLE_ACTIVITY:STALLS_L1D_PENDING" - ",CYCLE_ACTIVITY:STALLS_L2_PENDING" - ",CYCLE_ACTIVITY:STALLS_LDM_PENDING" - ",IDQ_UOPS_NOT_DELIVERED:CORE" - ",IDQ_UOPS_NOT_DELIVERED:CYCLES_0_UOPS_DELIV_CORE" - ",INT_MISC:RECOVERY_CYCLES" - ",MACHINE_CLEARS:COUNT" - ",MEM_LOAD_UOPS_RETIRED:L3_HIT" - ",MEM_LOAD_UOPS_RETIRED:L3_MISS" - ",UOPS_EXECUTED:CORE_CYCLES_GE_1" - ",UOPS_EXECUTED:CORE_CYCLES_GE_2" - ",UOPS_ISSUED:ANY" - ",UOPS_RETIRED:RETIRE_SLOTS" - ), - m_res_top({ "retiring", "backend_bound", "frontend_bound", "bad_speculation", nullptr }), - m_res_all({ "retiring", - "backend_bound", - "frontend_bound", - "bad_speculation", - "branch_mispredict", - "machine_clears", - "frontend_latency", - "frontend_bandwidth", - "memory_bound", - "core_bound", - "ext_mem_bound", - "l1_bound", - "l2_bound", - "l3_bound", - nullptr }) + : TopdownCalculator( + level, + // top_counters + "CPU_CLK_THREAD_UNHALTED:THREAD_P" + ",IDQ_UOPS_NOT_DELIVERED:CORE" + ",INT_MISC:RECOVERY_CYCLES" + ",UOPS_ISSUED:ANY" + 
",UOPS_RETIRED:RETIRE_SLOTS", + // all_counters + "BR_MISP_RETIRED:ALL_BRANCHES" + ",CPU_CLK_THREAD_UNHALTED:THREAD_P" + ",CYCLE_ACTIVITY:CYCLES_NO_EXECUTE" + ",CYCLE_ACTIVITY:STALLS_L1D_PENDING" + ",CYCLE_ACTIVITY:STALLS_L2_PENDING" + ",CYCLE_ACTIVITY:STALLS_LDM_PENDING" + ",IDQ_UOPS_NOT_DELIVERED:CORE" + ",IDQ_UOPS_NOT_DELIVERED:CYCLES_0_UOPS_DELIV_CORE" + ",INT_MISC:RECOVERY_CYCLES" + ",MACHINE_CLEARS:COUNT" + ",MEM_LOAD_UOPS_RETIRED:L3_HIT" + ",MEM_LOAD_UOPS_RETIRED:L3_MISS" + ",UOPS_EXECUTED:CORE_CYCLES_GE_1" + ",UOPS_EXECUTED:CORE_CYCLES_GE_2" + ",UOPS_ISSUED:ANY" + ",UOPS_RETIRED:RETIRE_SLOTS", + // res_top + { "retiring", "backend_bound", "frontend_bound", "bad_speculation" }, + // res_all + { "retiring", + "backend_bound", + "frontend_bound", + "bad_speculation", + "branch_mispredict", + "machine_clears", + "frontend_latency", + "frontend_bandwidth", + "memory_bound", + "core_bound", + "ext_mem_bound", + "l1_bound", + "l2_bound", + "l3_bound" } + ) {} + virtual ~HaswellTopdown() = default; + virtual std::vector compute_toplevel(const std::vector& rec) override { std::vector ret; @@ -340,59 +361,62 @@ class HaswellTopdown virtual std::size_t get_num_expected_bad_speculation() const override { return 2; } }; -class SapphireRapidsTopdown +class SapphireRapidsTopdown : public TopdownCalculator { public: SapphireRapidsTopdown(IntelTopdownLevel level) - : TopdownCalculator(level), - m_top_counters( - "perf::slots" - ",perf::topdown-retiring" - ",perf::topdown-bad-spec" - ",perf::topdown-fe-bound" - ",perf::topdown-be-bound" - ",INT_MISC:UOP_DROPPING" - ), - m_all_counters( - "perf::slots" - ",perf::topdown-retiring" - ",perf::topdown-bad-spec" - ",perf::topdown-fe-bound" - ",perf::topdown-be-bound" - ",INT_MISC:UOP_DROPPING" - ",perf_raw::r8400" // topdown-heavy-ops - ",perf_raw::r8500" // topdown-br-mispredict - ",perf_raw::r8600" // topdown-fetch-lat - ",perf_raw::r8700" - ), // topdown-mem-bound - m_res_top({ "retiring", "backend_bound", "frontend_bound", 
"bad_speculation", nullptr }), - m_res_all({ "retiring", - "backend_bound", - "frontend_bound", - "bad_speculation", - "branch_mispredict", - "machine_clears", - "frontend_latency", - "frontend_bandwidth", - "memory_bound", - "core_bound", - "light_ops", - "heavy_ops", - nullptr }) + : TopdownCalculator( + level, + // top_counters + "perf::slots" + ",perf::topdown-retiring" + ",perf::topdown-bad-spec" + ",perf::topdown-fe-bound" + ",perf::topdown-be-bound" + ",INT_MISC:UOP_DROPPING", + // all_counters + "perf::slots" + ",perf::topdown-retiring" + ",perf::topdown-bad-spec" + ",perf::topdown-fe-bound" + ",perf::topdown-be-bound" + ",INT_MISC:UOP_DROPPING" + ",perf_raw::r8400" // topdown-heavy-ops + ",perf_raw::r8500" // topdown-br-mispredict + ",perf_raw::r8600" // topdown-fetch-lat + ",perf_raw::r8700", // topdown-mem-bound + // res_top + { "retiring", "backend_bound", "frontend_bound", "bad_speculation" }, + // res_all + { "retiring", + "backend_bound", + "frontend_bound", + "bad_speculation", + "branch_mispredict", + "machine_clears", + "frontend_latency", + "frontend_bandwidth", + "memory_bound", + "core_bound", + "light_ops", + "heavy_ops" } + ) {} + virtual ~SapphireRapidsTopdown() = default; + virtual std::vector compute_toplevel(const std::vector& rec) override { std::vector ret; // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_int_misc_uop_dropping = get_val_from_rec(rec, "INT_MSC:UOP_DROPPING"); - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_int_misc_uop_dropping = get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); // Check if any Variant is empty (use 
.empty()) bool is_incomplete = v_fe_bound.empty() || v_be_bound.emtpy() || v_bad_spec.empty() || v_retiring.empty() @@ -411,10 +435,10 @@ class SapphireRapidsTopdown double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); - double retiring = (v_retiring.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); - double frontend_bound = - (v_fe_bound.to_double() / toplevel_sum) - (v_int_misc_uop_dropping / v_slots_or_info_thread_slots); - double backend_bound = (v_be_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); + double retiring = (v_retiring.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); + double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) + - (v_int_misc_uop_dropping.to_double() / v_slots_or_info_thread_slots.to_double()); + double backend_bound = (v_be_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); double bad_speculation = std::max(1.0 - (frontend_bound + backend_bound + retiring), 0.0); // Add toplevel metrics to vector of Entry @@ -434,11 +458,11 @@ class SapphireRapidsTopdown std::vector ret; // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); Variant v_heavy_ops = get_val_from_rec(rec, "perf_raw::r8400"); // Check if any Variant is empty (use .empty()) @@ -452,9 +476,9 @@ class SapphireRapidsTopdown double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); // Copied from compute_toplevel - 
double retiring = (v_retiring.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); + double retiring = (v_retiring.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); - double heavy_ops = (v_heavy_ops.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); + double heavy_ops = (v_heavy_ops.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); double light_ops = std::max(0.0, retiring - heavy_ops); // Add toplevel metrics to vector of Entry @@ -472,11 +496,11 @@ class SapphireRapidsTopdown std::vector ret; // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); Variant v_memory_bound = get_val_from_rec(rec, "perf_raw::r8700"); // Check if any Variant is empty (use .empty()) @@ -490,15 +514,16 @@ class SapphireRapidsTopdown double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); // Copied from compute_toplevel - double backend_bound = (v_be_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); + double backend_bound = (v_be_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); - double memory_bound = (v_memory_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); - double core_bound = std::max(0.0, backend_bound - memory_bound); + double memory_bound = + (v_memory_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); + double core_bound = std::max(0.0, backend_bound - memory_bound); // Add toplevel metrics to vector 
of Entry ret.reserve(2); - ret.push_back(Entry(m_result_attrs["backend_bound"], Variant(std::max(backend_bound, 0.0)))); ret.push_back(Entry(m_result_attrs["memory_bound"], Variant(std::max(memory_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["core_bound"], Variant(std::max(core_bound, 0.0)))); return ret; } @@ -510,18 +535,18 @@ class SapphireRapidsTopdown std::vector ret; // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_int_misc_uop_dropping = get_val_from_rec(rec, "INT_MSC:UOP_DROPPING"); - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_int_misc_uop_dropping = get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); Variant v_fetch_latency = get_val_from_rec(rec, "perf_raw::r8600"); // Check if any Variant is empty (use .empty()) - bool is_incomplete = v_fe_bound.empty() || v_be_bound.emtpy() || v_bad_spec.empty() || v_retiring.empty() + bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || v_bad_spec.empty() || v_retiring.empty() || v_int_misc_uop_dropping.empty() || v_slots_or_info_thread_slots.empty() - || v_memory_bound.empty(); + || v_fetch_latency.empty(); // Check if bad values were obtained if (is_incomplete) @@ -530,11 +555,12 @@ class SapphireRapidsTopdown double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); // Copied from compute_toplevel - double frontend_bound = - (v_fe_bound.to_double() / toplevel_sum) - (v_int_misc_uop_dropping / v_slots_or_info_thread_slots); + double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) + - (v_int_misc_uop_dropping.to_double() / 
v_slots_or_info_thread_slots.to_double()); + + double fetch_latency = (v_fetch_latency.to_double() / toplevel_sum) + - (v_int_misc_uop_dropping.to_double() / v_slots_or_info_thread_slots.to_double()); - double fetch_latency = - (v_fetch_latency.to_double() / toplevel_sum) - (v_int_misc_uop_dropping * v_slots_or_info_thread_slots); double fetch_bandwidth = std::max(0.0, frontend_bound - fetch_latency); // Add toplevel metrics to vector of Entry @@ -552,12 +578,12 @@ class SapphireRapidsTopdown std::vector ret; // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_int_misc_uop_dropping = get_val_from_rec(rec, "INT_MSC:UOP_DROPPING"); - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_int_misc_uop_dropping = get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); Variant v_branch_mispredict = get_val_from_rec(rec, "perf_raw::r8500"); // Check if any Variant is empty (use .empty()) @@ -573,14 +599,14 @@ class SapphireRapidsTopdown double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); - double retiring = (v_retiring.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); - double frontend_bound = - (v_fe_bound.to_double() / toplevel_sum) - (v_int_misc_uop_dropping / v_slots_or_info_thread_slots); - double backend_bound = (v_be_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); + double retiring = (v_retiring.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); + double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) + - 
(v_int_misc_uop_dropping.to_double() / v_slots_or_info_thread_slots.to_double()); + double backend_bound = (v_be_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); double bad_speculation = std::max(1.0 - (frontend_bound + backend_bound + retiring), 0.0); double branch_mispredict = - (v_branch_mispredict.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots); + (v_branch_mispredict.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); double machine_clears = std::max(0.0, bad_speculation - branch_mispredict); // Add toplevel metrics to vector of Entry @@ -607,157 +633,168 @@ class IntelTopdown unsigned num_ret_computed; unsigned num_ret_skipped; - IntelTopdownLevel level; + IntelTopdownLevel m_level; TopdownCalculator* m_calculator; + bool find_counter_attrs(CaliperMetadataAccessInterface& db) { return m_calculator->find_counter_attrs(db); } + + void make_result_attrs(CaliperMetadataAccessInterface& db) { m_calculator->make_result_attrs(db); } + void postprocess_snapshot_cb(std::vector& rec) { std::vector result = m_calculator->compute_toplevel(rec); - if (result.size() != m_calculator->get_num_expected_toplevel()) + if (result.size() != m_calculator->get_num_expected_toplevel()) { ++num_top_skipped; - else { + } else { rec.insert(rec.end(), result.begin(), result.end()); ++num_top_computed; } - if (level == All) { + if (m_level == All) { result = m_calculator->compute_backend_bound(rec); - if (result.size() != m_calculator->get_num_expected_backend_bound()) + if (result.size() != m_calculator->get_num_expected_backend_bound()) { ++num_be_skipped; - else { + } else { rec.insert(rec.end(), result.begin(), result.end()); ++num_be_computed; } result = m_calculator->compute_frontend_bound(rec); - if (result.size() != m_calculator->get_num_expected_frontend_bound()) + if (result.size() != m_calculator->get_num_expected_frontend_bound()) { ++num_fe_skipped; - else { + } else { rec.insert(rec.end(), 
result.begin(), result.end()); ++num_fe_computed; } result = m_calculator->compute_bad_speculation(rec); - if (result.size() != m_calculator->get_num_expected_bad_speculation()) + if (result.size() != m_calculator->get_num_expected_bad_speculation()) { ++num_bsp_skipped; - else { + } else { rec.insert(rec.end(), result.begin(), result.end()); ++num_bsp_computed; } result = m_calculator->compute_retiring(rec); - if (result.size() != m_calculator->get_num_expected_retiring()) + if (result.size() != m_calculator->get_num_expected_retiring()) { ++num_ret_skipped; - else { + } else { rec.insert(rec.end(), result.begin(), result.end()); ++num_ret_computed; } } + } - void finish_cb(Caliper * c, Channel * channel) - { - Log(1).stream() << channel->name() << ": topdown: Computed topdown metrics for " << num_top_computed - << " records, skipped " << num_top_skipped << std::endl; - - if (Log::verbosity() >= 2) { - Log(2).stream() << channel->name() << ": topdown: Records processed per topdown level: " - << "\n top: " << num_top_computed << " computed, " << num_top_skipped - << " skipped," - << "\n bad spec: " << num_bsp_computed << " computed, " << num_bsp_skipped - << " skipped," - << "\n frontend: " << num_bsp_computed << " computed, " << num_bsp_skipped - << " skipped," - << "\n backend: " << num_bsp_computed << " computed, " << num_bsp_skipped - << " skipped." 
<< std::endl; - - const std::map& counters_not_found = m_calculator->get_counters_not_found(); - - if (!counters_not_found.empty()) { - std::ostringstream os; - for (auto& p : counters_not_found) - os << "\n " << p.first << ": " << p.second; - Log(2).stream() << channel->name() << ": topdown: Counters not found:" << os.str() << std::endl; - } + void finish_cb(Caliper* c, Channel* channel) + { + Log(1).stream() << channel->name() << ": topdown: Computed topdown metrics for " << num_top_computed + << " records, skipped " << num_top_skipped << std::endl; + + if (Log::verbosity() >= 2) { + Log(2).stream() << channel->name() << ": topdown: Records processed per topdown level: " + << "\n top: " << num_top_computed << " computed, " << num_top_skipped << " skipped," + << "\n bad spec: " << num_bsp_computed << " computed, " << num_bsp_skipped << " skipped," + << "\n frontend: " << num_bsp_computed << " computed, " << num_bsp_skipped << " skipped," + << "\n backend: " << num_bsp_computed << " computed, " << num_bsp_skipped << " skipped." 
+ << std::endl; + + const std::map& counters_not_found = m_calculator->get_counters_not_found(); + + if (!counters_not_found.empty()) { + std::ostringstream os; + for (auto& p : counters_not_found) + os << "\n " << p.first << ": " << p.second; + Log(2).stream() << channel->name() << ": topdown: Counters not found:" << os.str() << std::endl; } } + } - explicit IntelTopdown(TopdownCalculator * calculator) - : num_top_computed(0), - num_top_skipped(0), - num_be_computed(0), - num_be_skipped(0), - num_fe_computed(0), - num_fe_skipped(0), - num_bsp_computed(0), - num_bsp_skipped(0), - level(calculator->get_level()), - m_calculator(calculator) - {} - - ~IntelTopdown() - { - if (m_calculator != nullptr) { - delete m_calculator; - } + explicit IntelTopdown(TopdownCalculator* calculator) + : num_top_computed(0), + num_top_skipped(0), + num_be_computed(0), + num_be_skipped(0), + num_fe_computed(0), + num_fe_skipped(0), + num_bsp_computed(0), + num_bsp_skipped(0), + m_level(calculator->get_level()), + m_calculator(calculator) + {} + + ~IntelTopdown() + { + if (m_calculator != nullptr) { + delete m_calculator; } + } - public: +public: - static const char* s_spec; + static const char* s_spec; - static void intel_topdown_register(Caliper * c, Channel * channel) - { - Level level = Top; + static void intel_topdown_register(Caliper* c, Channel* channel) + { + IntelTopdownLevel level = Top; - auto config = services::init_config_from_spec(channel->config(), s_spec); - std::string lvlcfg = config.get("level").to_string(); + auto config = services::init_config_from_spec(channel->config(), s_spec); + std::string lvlcfg = config.get("level").to_string(); - if (lvlcfg == "all") { - level = All; - } else if (lvlcfg != "top") { - Log(0).stream() << channel->name() << ": topdown: Unknown level \"" << lvlcfg << "\", skipping topdown" - << std::endl; - return; - } + if (lvlcfg == "all") { + level = All; + } else if (lvlcfg != "top") { + Log(0).stream() << channel->name() << ": topdown: 
Unknown level \"" << lvlcfg << "\", skipping topdown" + << std::endl; + return; + } - // TODO Add logic to select correct TopdownCalculator - TopdownCalculator* calculator = new HaswellTopdown(level); + TopdownCalculator* calculator; - channel->config().set("CALI_PAPI_COUNTERS", calculator->get_counters()); +#if defined(CALIPER_HAVE_ARCH) + if (std::string(CALIPER_HAVE_ARCH) == "sapphirerapids") { + calculator = new SapphireRapidsTopdown(level); + } else { +#endif + calculator = new HaswellTopdown(level); // Default type of calculation +#if defined(CALIPER_HAVE_ARCH) + } +#endif - if (!cali::services::register_service(c, channel, "papi")) { - Log(0).stream() << channel->name() << ": topdown: Unable to register papi service, skipping topdown" - << std::endl; - return; - } + channel->config().set("CALI_PAPI_COUNTERS", calculator->get_counters()); - IntelTopdown* instance = new IntelTopdown(calculator); - - channel->events().pre_flush_evt.connect([instance](Caliper* c, Channel* channel, SnapshotView) { - if (instance->find_counter_attrs(*c)) - instance->make_result_attrs(*c); - else - Log(0).stream() << channel->name() << ": topdown: Could not find counter attributes!" << std::endl; - }); - channel->events().postprocess_snapshot.connect([instance](Caliper*, Channel*, std::vector& rec) { - instance->postprocess_snapshot_cb(rec); - }); - channel->events().finish_evt.connect([instance](Caliper* c, Channel* channel) { - instance->finish_cb(c, channel); - delete instance; - }); - - Log(1).stream() << channel->name() << ": Registered topdown service. Level: " << lvlcfg << "." 
<< std::endl; + if (!cali::services::register_service(c, channel, "papi")) { + Log(0).stream() << channel->name() << ": topdown: Unable to register papi service, skipping topdown" + << std::endl; + return; } - }; - const char* IntelTopdown::s_spec = R"json( + IntelTopdown* instance = new IntelTopdown(calculator); + + channel->events().pre_flush_evt.connect([instance](Caliper* c, Channel* channel, SnapshotView) { + if (instance->find_counter_attrs(*c)) + instance->make_result_attrs(*c); + else + Log(0).stream() << channel->name() << ": topdown: Could not find counter attributes!" << std::endl; + }); + channel->events().postprocess_snapshot.connect([instance](Caliper*, Channel*, std::vector& rec) { + instance->postprocess_snapshot_cb(rec); + }); + channel->events().finish_evt.connect([instance](Caliper* c, Channel* channel) { + instance->finish_cb(c, channel); + delete instance; + }); + + Log(1).stream() << channel->name() << ": Registered topdown service. Level: " << lvlcfg << "." << std::endl; + } +}; + +const char* IntelTopdown::s_spec = R"json( { "name": "topdown", "description": "Record PAPI counters and compute top-down analysis for Intel CPUs", "config": [ @@ -776,5 +813,5 @@ namespace cali { - CaliperService topdown_service { ::IntelTopdown::s_spec, ::IntelTopdown::intel_topdown_register }; +CaliperService topdown_service { ::IntelTopdown::s_spec, ::IntelTopdown::intel_topdown_register }; } From c09c473474f3b3ac70a98ed5edb700a5a440358c Mon Sep 17 00:00:00 2001 From: Ian Lumsden Date: Fri, 4 Oct 2024 16:37:04 -0400 Subject: [PATCH 04/11] Splits TopdownCalculator and subclasses into own files to simplify implementation --- src/services/topdown/CMakeLists.txt | 9 +- src/services/topdown/HaswellTopdown.cpp | 249 +++++++ src/services/topdown/HaswellTopdown.h | 44 ++ src/services/topdown/IntelTopdown.cpp | 615 +----------------- .../topdown/SapphireRapidsTopdown.cpp | 299 +++++++++ src/services/topdown/SapphireRapidsTopdown.h | 44 ++ 
src/services/topdown/TopdownCalculator.cpp | 90 +++ src/services/topdown/TopdownCalculator.h | 80 +++ 8 files changed, 823 insertions(+), 607 deletions(-) create mode 100644 src/services/topdown/HaswellTopdown.cpp create mode 100644 src/services/topdown/HaswellTopdown.h create mode 100644 src/services/topdown/SapphireRapidsTopdown.cpp create mode 100644 src/services/topdown/SapphireRapidsTopdown.h create mode 100644 src/services/topdown/TopdownCalculator.cpp create mode 100644 src/services/topdown/TopdownCalculator.h diff --git a/src/services/topdown/CMakeLists.txt b/src/services/topdown/CMakeLists.txt index 7adbe68b..78a8bef5 100644 --- a/src/services/topdown/CMakeLists.txt +++ b/src/services/topdown/CMakeLists.txt @@ -1,5 +1,10 @@ set(CALIPER_TOPDOWN_SOURCES - IntelTopdown.cpp) + IntelTopdown.cpp + TopdownCalulator.cpp + HaswellTopdown.cpp + SapphireRapidsTopdown.cpp) -add_service_sources(${CALIPER_TOPDOWN_SOURCES}) +add_library(caliper-topdown OBJECT ${CALIPER_TOPDOWN_SOURCES}) + +add_service_objlib("caliper-topdown") add_caliper_service("topdown CALIPER_HAVE_PAPI") diff --git a/src/services/topdown/HaswellTopdown.cpp b/src/services/topdown/HaswellTopdown.cpp new file mode 100644 index 00000000..a04551c8 --- /dev/null +++ b/src/services/topdown/HaswellTopdown.cpp @@ -0,0 +1,249 @@ +#include "HaswellTopdown.h" + +#include + +namespace cali { +namespace topdown { + +HaswellTopdown::HaswellTopdown(IntelTopdownLevel level) + : cali::topdown::TopdownCalculator( + level, + // top_counters + "CPU_CLK_THREAD_UNHALTED:THREAD_P" + ",IDQ_UOPS_NOT_DELIVERED:CORE" + ",INT_MISC:RECOVERY_CYCLES" + ",UOPS_ISSUED:ANY" + ",UOPS_RETIRED:RETIRE_SLOTS", + // all_counters + "BR_MISP_RETIRED:ALL_BRANCHES" + ",CPU_CLK_THREAD_UNHALTED:THREAD_P" + ",CYCLE_ACTIVITY:CYCLES_NO_EXECUTE" + ",CYCLE_ACTIVITY:STALLS_L1D_PENDING" + ",CYCLE_ACTIVITY:STALLS_L2_PENDING" + ",CYCLE_ACTIVITY:STALLS_LDM_PENDING" + ",IDQ_UOPS_NOT_DELIVERED:CORE" + ",IDQ_UOPS_NOT_DELIVERED:CYCLES_0_UOPS_DELIV_CORE" + 
",INT_MISC:RECOVERY_CYCLES" + ",MACHINE_CLEARS:COUNT" + ",MEM_LOAD_UOPS_RETIRED:L3_HIT" + ",MEM_LOAD_UOPS_RETIRED:L3_MISS" + ",UOPS_EXECUTED:CORE_CYCLES_GE_1" + ",UOPS_EXECUTED:CORE_CYCLES_GE_2" + ",UOPS_ISSUED:ANY" + ",UOPS_RETIRED:RETIRE_SLOTS", + // res_top + {"retiring", "backend_bound", "frontend_bound", "bad_speculation"}, + // res_all + {"retiring", "backend_bound", "frontend_bound", "bad_speculation", + "branch_mispredict", "machine_clears", "frontend_latency", + "frontend_bandwidth", "memory_bound", "core_bound", "ext_mem_bound", + "l1_bound", "l2_bound", "l3_bound"}) {} + +std::vector +HaswellTopdown::compute_toplevel(const std::vector &rec) { + std::vector ret; + + Variant v_cpu_clk_unhalted_thread_p = + get_val_from_rec(rec, "CPU_CLK_THREAD_UNHALTED:THREAD_P"); + Variant v_uops_retired_retire_slots = + get_val_from_rec(rec, "UOPS_RETIRED:RETIRE_SLOTS"); + Variant v_uops_issued_any = get_val_from_rec(rec, "UOPS_ISSUED:ANY"); + Variant v_int_misc_recovery_cycles = + get_val_from_rec(rec, "INT_MISC:RECOVERY_CYCLES"); + Variant v_idq_uops_not_delivered_core = + get_val_from_rec(rec, "IDQ_UOPS_NOT_DELIVERED:CORE"); + + bool is_incomplete = v_cpu_clk_unhalted_thread_p.empty() || + v_uops_retired_retire_slots.empty() || + v_uops_issued_any.empty() || + v_int_misc_recovery_cycles.empty() || + v_idq_uops_not_delivered_core.empty(); + bool is_nonzero = v_cpu_clk_unhalted_thread_p.to_double() > 0.0 && + v_uops_retired_retire_slots.to_double() > 0.0 && + v_uops_issued_any.to_double() > 0.0 && + v_int_misc_recovery_cycles.to_double() > 0.0 && + v_idq_uops_not_delivered_core.to_double() > 0.0; + + double slots = 4.0 * v_cpu_clk_unhalted_thread_p.to_double(); + + if (is_incomplete || !is_nonzero || slots < 1.0) + return ret; + + double retiring = v_uops_retired_retire_slots.to_double() / slots; + double bad_speculation = + (v_uops_issued_any.to_double() - v_uops_retired_retire_slots.to_double() + + 4.0 * v_int_misc_recovery_cycles.to_double()) / + slots; + double 
frontend_bound = v_idq_uops_not_delivered_core.to_double() / slots; + double backend_bound = 1.0 - (retiring + bad_speculation + frontend_bound); + + ret.reserve(4); + ret.push_back( + Entry(m_result_attrs["retiring"], Variant(std::max(retiring, 0.0)))); + ret.push_back(Entry(m_result_attrs["backend_bound"], + Variant(std::max(backend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["frontend_bound"], + Variant(std::max(frontend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["bad_speculation"], + Variant(std::max(bad_speculation, 0.0)))); + + return ret; +} + +std::size_t HaswellTopdown::get_num_expected_toplevel() const { return 4; } + +std::vector +HaswellTopdown::compute_retiring(const std::vector &rec) { + return {}; +} + +std::size_t HaswellTopdown::get_num_expected_retiring() const { return 0; } + +std::vector +HaswellTopdown::compute_backend_bound(const std::vector &rec) { + std::vector ret; + + Variant v_cpu_clk_unhalted_thread_p = + get_val_from_rec(rec, "CPU_CLK_THREAD_UNHALTED:THREAD_P"); + Variant v_cycle_activity_stalls_ldm_pending = + get_val_from_rec(rec, "CYCLE_ACTIVITY:STALLS_LDM_PENDING"); + Variant v_cycle_activity_cycles_no_execute = + get_val_from_rec(rec, "CYCLE_ACTIVITY:CYCLES_NO_EXECUTE"); + Variant v_uops_executed_core_cycles_ge_1 = + get_val_from_rec(rec, "UOPS_EXECUTED:CORE_CYCLES_GE_1"); + Variant v_uops_executed_core_cycles_ge_2 = + get_val_from_rec(rec, "UOPS_EXECUTED:CORE_CYCLES_GE_2"); + Variant v_mem_load_uops_retired_l3_miss = + get_val_from_rec(rec, "MEM_LOAD_UOPS_RETIRED:L3_MISS"); + Variant v_mem_load_uops_retired_l3_hit = + get_val_from_rec(rec, "MEM_LOAD_UOPS_RETIRED:L3_HIT"); + Variant v_cycle_activity_stalls_l2_pending = + get_val_from_rec(rec, "CYCLE_ACTIVITY:STALLS_L2_PENDING"); + Variant v_cycle_activity_stalls_l1d_pending = + get_val_from_rec(rec, "CYCLE_ACTIVITY:STALLS_L1D_PENDING"); + + bool is_incomplete = v_cpu_clk_unhalted_thread_p.empty() || + v_cycle_activity_stalls_ldm_pending.empty() || + 
v_cycle_activity_cycles_no_execute.empty() || + v_uops_executed_core_cycles_ge_1.empty() || + v_uops_executed_core_cycles_ge_2.empty() || + v_mem_load_uops_retired_l3_miss.empty() || + v_mem_load_uops_retired_l3_hit.empty() || + v_cycle_activity_stalls_l2_pending.empty() || + v_cycle_activity_stalls_l1d_pending.empty(); + + double clocks = v_cpu_clk_unhalted_thread_p.to_double(); + + if (is_incomplete || !(clocks > 1.0)) + return ret; + + double memory_bound = + v_cycle_activity_stalls_ldm_pending.to_double() / clocks; + double be_bound_at_exe = (v_cycle_activity_cycles_no_execute.to_double() + + v_uops_executed_core_cycles_ge_1.to_double() - + v_uops_executed_core_cycles_ge_2.to_double()) / + clocks; + double l3_tot = v_mem_load_uops_retired_l3_hit.to_double() + + 7.0 * v_mem_load_uops_retired_l3_miss.to_double(); + double l3_hit_fraction = 0.0; + double l3_miss_fraction = 0.0; + if (l3_tot > 0.0) { + l3_hit_fraction = v_mem_load_uops_retired_l3_hit.to_double() / l3_tot; + l3_miss_fraction = v_mem_load_uops_retired_l3_miss.to_double() / l3_tot; + } + double ext_mem_bound = v_cycle_activity_stalls_l2_pending.to_double() * + l3_miss_fraction / clocks; + double l1_bound = (v_cycle_activity_stalls_ldm_pending.to_double() - + v_cycle_activity_stalls_l1d_pending.to_double()) / + clocks; + double l2_bound = (v_cycle_activity_stalls_l1d_pending.to_double() - + v_cycle_activity_stalls_l2_pending.to_double()) / + clocks; + double l3_bound = + v_cycle_activity_stalls_l2_pending.to_double() * l3_hit_fraction / clocks; + + ret.reserve(6); + ret.push_back(Entry(m_result_attrs["memory_bound"], Variant(memory_bound))); + ret.push_back(Entry(m_result_attrs["core_bound"], + Variant(be_bound_at_exe - memory_bound))); + ret.push_back(Entry(m_result_attrs["ext_mem_bound"], Variant(ext_mem_bound))); + ret.push_back(Entry(m_result_attrs["l1_bound"], Variant(l1_bound))); + ret.push_back(Entry(m_result_attrs["l2_bound"], Variant(l2_bound))); + 
ret.push_back(Entry(m_result_attrs["l3_bound"], Variant(l3_bound))); + + return ret; +} + +std::size_t HaswellTopdown::get_num_expected_backend_bound() const { return 6; } + +std::vector +HaswellTopdown::compute_frontend_bound(const std::vector &rec) { + std::vector ret; + + Variant v_cpu_clk_unhalted_thread_p = + get_val_from_rec(rec, "CPU_CLK_THREAD_UNHALTED:THREAD_P"); + Variant v_idq_uops_not_delivered = + get_val_from_rec(rec, "IDQ_UOPS_NOT_DELIVERED:CYCLES_0_UOPS_DELIV_CORE"); + + bool is_incomplete = + v_cpu_clk_unhalted_thread_p.empty() || v_idq_uops_not_delivered.empty(); + + double clocks = v_cpu_clk_unhalted_thread_p.to_double(); + double uops = v_idq_uops_not_delivered.to_double(); + + if (is_incomplete || clocks < 1.0 || uops > clocks) + return ret; + + double fe_latency = uops / clocks; + + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["frontend_latency"], Variant(fe_latency))); + ret.push_back( + Entry(m_result_attrs["frontend_bandwidth"], Variant(1.0 - fe_latency))); + + return ret; +} + +std::size_t HaswellTopdown::get_num_expected_frontend_bound() const { + return 2; +} + +std::vector +HaswellTopdown::compute_bad_speculation(const std::vector &rec) { + std::vector ret; + + Variant v_br_misp_retired_all_branches = + get_val_from_rec(rec, "BR_MISP_RETIRED:ALL_BRANCHES"); + Variant v_machine_clears_count = + get_val_from_rec(rec, "MACHINE_CLEARS:COUNT"); + + bool is_incomplete = + v_br_misp_retired_all_branches.empty() || v_machine_clears_count.empty(); + + double br_misp_retired_all_branches = + v_br_misp_retired_all_branches.to_double(); + double machine_clears_count = v_machine_clears_count.to_double(); + + if (is_incomplete || + !(br_misp_retired_all_branches + machine_clears_count > 1.0)) + return ret; + + double branch_mispredict = + br_misp_retired_all_branches / + (br_misp_retired_all_branches + machine_clears_count); + + ret.reserve(2); + ret.push_back( + Entry(m_result_attrs["branch_mispredict"], Variant(branch_mispredict))); + 
ret.push_back(Entry(m_result_attrs["machine_clears"], + Variant(1.0 - branch_mispredict))); + + return ret; +} + +std::size_t HaswellTopdown::get_num_expected_bad_speculation() const { + return 2; +} + +} // namespace topdown +} // namespace cali \ No newline at end of file diff --git a/src/services/topdown/HaswellTopdown.h b/src/services/topdown/HaswellTopdown.h new file mode 100644 index 00000000..39622d89 --- /dev/null +++ b/src/services/topdown/HaswellTopdown.h @@ -0,0 +1,44 @@ +#ifndef CALI_TOPDOWN_HASWELL_TOPDOWN_H +#define CALI_TOPDOWN_HASWELL_TOPDOWN_H + +#include "TopdownCalculator.h" + +namespace cali { +namespace topdown { + +class HaswellTopdown : public TopdownCalculator { +public: + HaswellTopdown(IntelTopdownLevel level); + + virtual ~HaswellTopdown() = default; + + virtual std::vector + compute_toplevel(const std::vector &rec) override; + + virtual std::size_t get_num_expected_toplevel() const override; + + virtual std::vector + compute_retiring(const std::vector &rec) override; + + virtual std::size_t get_num_expected_retiring() const override; + + virtual std::vector + compute_backend_bound(const std::vector &rec) override; + + virtual std::size_t get_num_expected_backend_bound() const override; + + virtual std::vector + compute_frontend_bound(const std::vector &rec) override; + + virtual std::size_t get_num_expected_frontend_bound() const override; + + virtual std::vector + compute_bad_speculation(const std::vector &rec) override; + + virtual std::size_t get_num_expected_bad_speculation() const override; +}; + +} // namespace topdown +} // namespace cali + +#endif /* CALI_TOPDOWN_HASWELL_TOPDOWN_H */ \ No newline at end of file diff --git a/src/services/topdown/IntelTopdown.cpp b/src/services/topdown/IntelTopdown.cpp index 1087c10e..507a7648 100644 --- a/src/services/topdown/IntelTopdown.cpp +++ b/src/services/topdown/IntelTopdown.cpp @@ -8,7 +8,9 @@ #include "../Services.h" -#include "caliper/Caliper.h" +#include "HaswellTopdown.h" +#include 
"SapphireRapidsTopdown.h" + #include "caliper/SnapshotRecord.h" #include "caliper/common/Log.h" @@ -18,608 +20,13 @@ #include "../Services.h" #include -#include #include -#include using namespace cali; namespace { -enum IntelTopdownLevel { All = 1, Top = 2 }; - -class TopdownCalculator -{ -protected: - - IntelTopdownLevel m_level; - - const char* m_top_counters; - const char* m_all_counters; - - std::vector m_res_top; - std::vector m_res_all; - - std::map m_counter_attrs; - std::map m_result_attrs; - - std::map m_counters_not_found; - - Variant get_val_from_rec(const std::vector& rec, const char* name) - { - Variant ret; - - auto c_it = m_counter_attrs.find(name); - if (c_it == m_counter_attrs.end()) - return ret; - - cali_id_t attr_id = c_it->second.id(); - - auto it = std::find_if(rec.begin(), rec.end(), [attr_id](const Entry& e) { return e.attribute() == attr_id; }); - - if (it != rec.end()) - ret = it->value(); - else - ++m_counters_not_found[std::string(name)]; - - return ret; - } - - TopdownCalculator( - IntelTopdownLevel level, - const char* top_counters, - const char* all_counters, - std::vector&& res_top, - std::vector&& res_all - ) - : m_level(level), - m_top_counters(top_counters), - m_all_counters(all_counters), - m_res_top(res_top), - m_res_all(res_all) - {} - -public: - - TopdownCalculator(IntelTopdownLevel level) : m_level(level) {} - - virtual ~TopdownCalculator() = default; - - virtual std::vector compute_toplevel(const std::vector& rec) = 0; - - virtual std::size_t get_num_expected_toplevel() const = 0; - - virtual std::vector compute_retiring(const std::vector& rec) = 0; - - virtual std::size_t get_num_expected_retiring() const = 0; - - virtual std::vector compute_backend_bound(const std::vector& rec) = 0; - - virtual std::size_t get_num_expected_backend_bound() const = 0; - - virtual std::vector compute_frontend_bound(const std::vector& rec) = 0; - - virtual std::size_t get_num_expected_frontend_bound() const = 0; - - virtual std::vector 
compute_bad_speculation(const std::vector& rec) = 0; - - virtual std::size_t get_num_expected_bad_speculation() const = 0; - - bool find_counter_attrs(CaliperMetadataAccessInterface& db) - { - const char* list = (m_level == All ? m_all_counters : m_top_counters); - auto counters = StringConverter(list).to_stringlist(); - - for (const auto& s : counters) { - Attribute attr = db.get_attribute(std::string("sum#papi.") + s); - - if (!attr) - attr = db.get_attribute(std::string("papi.") + s); - if (!attr) { - Log(0).stream() << "topdown: " << s << " counter attribute not found!" << std::endl; - return false; - } - - m_counter_attrs[s] = attr; - } - - return true; - } - - void make_result_attrs(CaliperMetadataAccessInterface& db) - { - std::vector& res = (m_level == Top ? m_res_top : m_res_all); - - for (const char* s : res) { - m_result_attrs[std::string(s)] = db.create_attribute( - std::string("topdown.") + s, - CALI_TYPE_DOUBLE, - CALI_ATTR_ASVALUE | CALI_ATTR_SKIP_EVENTS - ); - } - } - - const std::map& get_counters_not_found() const { return m_counters_not_found; } - - const char* get_counters() const - { - if (m_level == All) { - return m_all_counters; - } else { - return m_top_counters; - } - } - - IntelTopdownLevel get_level() const { return m_level; } -}; - -class HaswellTopdown : public TopdownCalculator -{ -public: - - HaswellTopdown(IntelTopdownLevel level) - : TopdownCalculator( - level, - // top_counters - "CPU_CLK_THREAD_UNHALTED:THREAD_P" - ",IDQ_UOPS_NOT_DELIVERED:CORE" - ",INT_MISC:RECOVERY_CYCLES" - ",UOPS_ISSUED:ANY" - ",UOPS_RETIRED:RETIRE_SLOTS", - // all_counters - "BR_MISP_RETIRED:ALL_BRANCHES" - ",CPU_CLK_THREAD_UNHALTED:THREAD_P" - ",CYCLE_ACTIVITY:CYCLES_NO_EXECUTE" - ",CYCLE_ACTIVITY:STALLS_L1D_PENDING" - ",CYCLE_ACTIVITY:STALLS_L2_PENDING" - ",CYCLE_ACTIVITY:STALLS_LDM_PENDING" - ",IDQ_UOPS_NOT_DELIVERED:CORE" - ",IDQ_UOPS_NOT_DELIVERED:CYCLES_0_UOPS_DELIV_CORE" - ",INT_MISC:RECOVERY_CYCLES" - ",MACHINE_CLEARS:COUNT" - 
",MEM_LOAD_UOPS_RETIRED:L3_HIT" - ",MEM_LOAD_UOPS_RETIRED:L3_MISS" - ",UOPS_EXECUTED:CORE_CYCLES_GE_1" - ",UOPS_EXECUTED:CORE_CYCLES_GE_2" - ",UOPS_ISSUED:ANY" - ",UOPS_RETIRED:RETIRE_SLOTS", - // res_top - { "retiring", "backend_bound", "frontend_bound", "bad_speculation" }, - // res_all - { "retiring", - "backend_bound", - "frontend_bound", - "bad_speculation", - "branch_mispredict", - "machine_clears", - "frontend_latency", - "frontend_bandwidth", - "memory_bound", - "core_bound", - "ext_mem_bound", - "l1_bound", - "l2_bound", - "l3_bound" } - ) - {} - - virtual ~HaswellTopdown() = default; - - virtual std::vector compute_toplevel(const std::vector& rec) override - { - std::vector ret; - - Variant v_cpu_clk_unhalted_thread_p = get_val_from_rec(rec, "CPU_CLK_THREAD_UNHALTED:THREAD_P"); - Variant v_uops_retired_retire_slots = get_val_from_rec(rec, "UOPS_RETIRED:RETIRE_SLOTS"); - Variant v_uops_issued_any = get_val_from_rec(rec, "UOPS_ISSUED:ANY"); - Variant v_int_misc_recovery_cycles = get_val_from_rec(rec, "INT_MISC:RECOVERY_CYCLES"); - Variant v_idq_uops_not_delivered_core = get_val_from_rec(rec, "IDQ_UOPS_NOT_DELIVERED:CORE"); - - bool is_incomplete = v_cpu_clk_unhalted_thread_p.empty() || v_uops_retired_retire_slots.empty() - || v_uops_issued_any.empty() || v_int_misc_recovery_cycles.empty() - || v_idq_uops_not_delivered_core.empty(); - bool is_nonzero = v_cpu_clk_unhalted_thread_p.to_double() > 0.0 && v_uops_retired_retire_slots.to_double() > 0.0 - && v_uops_issued_any.to_double() > 0.0 && v_int_misc_recovery_cycles.to_double() > 0.0 - && v_idq_uops_not_delivered_core.to_double() > 0.0; - - double slots = 4.0 * v_cpu_clk_unhalted_thread_p.to_double(); - - if (is_incomplete || !is_nonzero || slots < 1.0) - return ret; - - double retiring = v_uops_retired_retire_slots.to_double() / slots; - double bad_speculation = (v_uops_issued_any.to_double() - v_uops_retired_retire_slots.to_double() - + 4.0 * v_int_misc_recovery_cycles.to_double()) - / slots; - double 
frontend_bound = v_idq_uops_not_delivered_core.to_double() / slots; - double backend_bound = 1.0 - (retiring + bad_speculation + frontend_bound); - - ret.reserve(4); - ret.push_back(Entry(m_result_attrs["retiring"], Variant(std::max(retiring, 0.0)))); - ret.push_back(Entry(m_result_attrs["backend_bound"], Variant(std::max(backend_bound, 0.0)))); - ret.push_back(Entry(m_result_attrs["frontend_bound"], Variant(std::max(frontend_bound, 0.0)))); - ret.push_back(Entry(m_result_attrs["bad_speculation"], Variant(std::max(bad_speculation, 0.0)))); - - return ret; - } - - virtual std::size_t get_num_expected_toplevel() const override { return 4; } - - virtual std::vector compute_retiring(const std::vector& rec) override { return {}; } - - virtual std::size_t get_num_expected_retiring() const override { return 0; } - - virtual std::vector compute_backend_bound(const std::vector& rec) override - { - std::vector ret; - - Variant v_cpu_clk_unhalted_thread_p = get_val_from_rec(rec, "CPU_CLK_THREAD_UNHALTED:THREAD_P"); - Variant v_cycle_activity_stalls_ldm_pending = get_val_from_rec(rec, "CYCLE_ACTIVITY:STALLS_LDM_PENDING"); - Variant v_cycle_activity_cycles_no_execute = get_val_from_rec(rec, "CYCLE_ACTIVITY:CYCLES_NO_EXECUTE"); - Variant v_uops_executed_core_cycles_ge_1 = get_val_from_rec(rec, "UOPS_EXECUTED:CORE_CYCLES_GE_1"); - Variant v_uops_executed_core_cycles_ge_2 = get_val_from_rec(rec, "UOPS_EXECUTED:CORE_CYCLES_GE_2"); - Variant v_mem_load_uops_retired_l3_miss = get_val_from_rec(rec, "MEM_LOAD_UOPS_RETIRED:L3_MISS"); - Variant v_mem_load_uops_retired_l3_hit = get_val_from_rec(rec, "MEM_LOAD_UOPS_RETIRED:L3_HIT"); - Variant v_cycle_activity_stalls_l2_pending = get_val_from_rec(rec, "CYCLE_ACTIVITY:STALLS_L2_PENDING"); - Variant v_cycle_activity_stalls_l1d_pending = get_val_from_rec(rec, "CYCLE_ACTIVITY:STALLS_L1D_PENDING"); - - bool is_incomplete = v_cpu_clk_unhalted_thread_p.empty() || v_cycle_activity_stalls_ldm_pending.empty() - || 
v_cycle_activity_cycles_no_execute.empty() || v_uops_executed_core_cycles_ge_1.empty() - || v_uops_executed_core_cycles_ge_2.empty() || v_mem_load_uops_retired_l3_miss.empty() - || v_mem_load_uops_retired_l3_hit.empty() || v_cycle_activity_stalls_l2_pending.empty() - || v_cycle_activity_stalls_l1d_pending.empty(); - - double clocks = v_cpu_clk_unhalted_thread_p.to_double(); - - if (is_incomplete || !(clocks > 1.0)) - return ret; - - double memory_bound = v_cycle_activity_stalls_ldm_pending.to_double() / clocks; - double be_bound_at_exe = - (v_cycle_activity_cycles_no_execute.to_double() + v_uops_executed_core_cycles_ge_1.to_double() - - v_uops_executed_core_cycles_ge_2.to_double()) - / clocks; - double l3_tot = v_mem_load_uops_retired_l3_hit.to_double() + 7.0 * v_mem_load_uops_retired_l3_miss.to_double(); - double l3_hit_fraction = 0.0; - double l3_miss_fraction = 0.0; - if (l3_tot > 0.0) { - l3_hit_fraction = v_mem_load_uops_retired_l3_hit.to_double() / l3_tot; - l3_miss_fraction = v_mem_load_uops_retired_l3_miss.to_double() / l3_tot; - } - double ext_mem_bound = v_cycle_activity_stalls_l2_pending.to_double() * l3_miss_fraction / clocks; - double l1_bound = - (v_cycle_activity_stalls_ldm_pending.to_double() - v_cycle_activity_stalls_l1d_pending.to_double()) - / clocks; - double l2_bound = - (v_cycle_activity_stalls_l1d_pending.to_double() - v_cycle_activity_stalls_l2_pending.to_double()) / clocks; - double l3_bound = v_cycle_activity_stalls_l2_pending.to_double() * l3_hit_fraction / clocks; - - ret.reserve(6); - ret.push_back(Entry(result_attrs["memory_bound"], Variant(memory_bound))); - ret.push_back(Entry(result_attrs["core_bound"], Variant(be_bound_at_exe - memory_bound))); - ret.push_back(Entry(result_attrs["ext_mem_bound"], Variant(ext_mem_bound))); - ret.push_back(Entry(result_attrs["l1_bound"], Variant(l1_bound))); - ret.push_back(Entry(result_attrs["l2_bound"], Variant(l2_bound))); - ret.push_back(Entry(result_attrs["l3_bound"], Variant(l3_bound))); - - 
return ret; - } - - virtual std::size_t get_num_expected_backend_bound() const override { return 6; } - - virtual std::vector compute_frontend_bound(const std::vector& rec) override - { - std::vector ret; - - Variant v_cpu_clk_unhalted_thread_p = get_val_from_rec(rec, "CPU_CLK_THREAD_UNHALTED:THREAD_P"); - Variant v_idq_uops_not_delivered = get_val_from_rec(rec, "IDQ_UOPS_NOT_DELIVERED:CYCLES_0_UOPS_DELIV_CORE"); - - bool is_incomplete = v_cpu_clk_unhalted_thread_p.empty() || v_idq_uops_not_delivered.empty(); - - double clocks = v_cpu_clk_unhalted_thread_p.to_double(); - double uops = v_idq_uops_not_delivered.to_double(); - - if (is_incomplete || clocks < 1.0 || uops > clocks) - return ret; - - double fe_latency = uops / clocks; - - ret.reserve(2); - ret.push_back(Entry(result_attrs["frontend_latency"], Variant(fe_latency))); - ret.push_back(Entry(result_attrs["frontend_bandwidth"], Variant(1.0 - fe_latency))); - - return ret; - } - - virtual std::size_t get_num_expected_frontend_bound() const override { return 2; } - - virtual std::vector compute_bad_speculation(const std::vector& rec) override - { - std::vector ret; - - Variant v_br_misp_retired_all_branches = get_val_from_rec(rec, "BR_MISP_RETIRED:ALL_BRANCHES"); - Variant v_machine_clears_count = get_val_from_rec(rec, "MACHINE_CLEARS:COUNT"); - - bool is_incomplete = v_br_misp_retired_all_branches.empty() || v_machine_clears_count.empty(); - - double br_misp_retired_all_branches = v_br_misp_retired_all_branches.to_double(); - double machine_clears_count = v_machine_clears_count.to_double(); - - if (is_incomplete || !(br_misp_retired_all_branches + machine_clears_count > 1.0)) - return ret; - - double branch_mispredict = br_misp_retired_all_branches / (br_misp_retired_all_branches + machine_clears_count); - - ret.reserve(2); - ret.push_back(Entry(result_attrs["branch_mispredict"], Variant(branch_mispredict))); - ret.push_back(Entry(result_attrs["machine_clears"], Variant(1.0 - branch_mispredict))); - - return 
ret; - } - - virtual std::size_t get_num_expected_bad_speculation() const override { return 2; } -}; - -class SapphireRapidsTopdown : public TopdownCalculator -{ -public: - - SapphireRapidsTopdown(IntelTopdownLevel level) - : TopdownCalculator( - level, - // top_counters - "perf::slots" - ",perf::topdown-retiring" - ",perf::topdown-bad-spec" - ",perf::topdown-fe-bound" - ",perf::topdown-be-bound" - ",INT_MISC:UOP_DROPPING", - // all_counters - "perf::slots" - ",perf::topdown-retiring" - ",perf::topdown-bad-spec" - ",perf::topdown-fe-bound" - ",perf::topdown-be-bound" - ",INT_MISC:UOP_DROPPING" - ",perf_raw::r8400" // topdown-heavy-ops - ",perf_raw::r8500" // topdown-br-mispredict - ",perf_raw::r8600" // topdown-fetch-lat - ",perf_raw::r8700", // topdown-mem-bound - // res_top - { "retiring", "backend_bound", "frontend_bound", "bad_speculation" }, - // res_all - { "retiring", - "backend_bound", - "frontend_bound", - "bad_speculation", - "branch_mispredict", - "machine_clears", - "frontend_latency", - "frontend_bandwidth", - "memory_bound", - "core_bound", - "light_ops", - "heavy_ops" } - ) - {} - - virtual ~SapphireRapidsTopdown() = default; - - virtual std::vector compute_toplevel(const std::vector& rec) override - { - std::vector ret; - - // Get PAPI metrics for toplevel calculations - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); - Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); - Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); - Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_int_misc_uop_dropping = get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); - - // Check if any Variant is empty (use .empty()) - bool is_incomplete = v_fe_bound.empty() || v_be_bound.emtpy() || v_bad_spec.empty() || v_retiring.empty() - || v_int_misc_uop_dropping.empty() || v_slots_or_info_thread_slots.empty(); - // 
Check if all Variants are greater than 0 when casted to doubles (use - // .to_double()) - bool is_nonzero = v_fe_bound.to_double() > 0.0 && v_be_bound.to_double() > 0.0 && v_bad_spec.to_double() > 0.0 - && v_retiring.to_double() > 0.0 && v_int_misc_uop_dropping.to_double() > 0.0 - && v_slots_or_info_thread_slots.to_double() > 0.0; - - // Check if bad values were obtained - if (is_incomplete || !is_nonzero) - return ret; - - // Perform toplevel calcs - double toplevel_sum = - (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); - - double retiring = (v_retiring.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); - double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) - - (v_int_misc_uop_dropping.to_double() / v_slots_or_info_thread_slots.to_double()); - double backend_bound = (v_be_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); - double bad_speculation = std::max(1.0 - (frontend_bound + backend_bound + retiring), 0.0); - - // Add toplevel metrics to vector of Entry - ret.reserve(4); - ret.push_back(Entry(m_result_attrs["retiring"], Variant(std::max(retiring, 0.0)))); - ret.push_back(Entry(m_result_attrs["backend_bound"], Variant(std::max(backend_bound, 0.0)))); - ret.push_back(Entry(m_result_attrs["frontend_bound"], Variant(std::max(frontend_bound, 0.0)))); - ret.push_back(Entry(m_result_attrs["bad_speculation"], Variant(std::max(bad_speculation, 0.0)))); - - return ret; - } - - virtual std::size_t get_num_expected_toplevel() const override { return 4; } - - virtual std::vector compute_retiring(const std::vector& rec) override - { - std::vector ret; - - // Get PAPI metrics for toplevel calculations - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); - Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); - Variant v_fe_bound = 
get_val_from_rec(rec, "perf::topdown-fe-bound"); - Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_heavy_ops = get_val_from_rec(rec, "perf_raw::r8400"); - - // Check if any Variant is empty (use .empty()) - bool is_incomplete = v_fe_bound.empty() || v_be_bound.emtpy() || v_bad_spec.empty() || v_retiring.empty() - || v_slots_or_info_thread_slots.empty() || v_heavy_ops.empty(); - - // Check if bad values were obtained - if (is_incomplete) - return ret; - - double toplevel_sum = - (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); - // Copied from compute_toplevel - double retiring = (v_retiring.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); - - double heavy_ops = (v_heavy_ops.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); - double light_ops = std::max(0.0, retiring - heavy_ops); - - // Add toplevel metrics to vector of Entry - ret.reserve(2); - ret.push_back(Entry(m_result_attrs["heavy_ops"], Variant(std::max(heavy_ops, 0.0)))); - ret.push_back(Entry(m_result_attrs["light_ops"], Variant(std::max(light_ops, 0.0)))); - - return ret; - } - - virtual std::size_t get_num_expected_retiring() const override { return 2; } - - virtual std::vector compute_backend_bound(const std::vector& rec) override - { - std::vector ret; - - // Get PAPI metrics for toplevel calculations - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); - Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); - Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); - Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_memory_bound = get_val_from_rec(rec, "perf_raw::r8700"); - - // Check if any Variant is empty (use .empty()) - bool is_incomplete = v_fe_bound.empty() || v_be_bound.emtpy() || 
v_bad_spec.empty() || v_retiring.empty() - || v_slots_or_info_thread_slots.empty() || v_memory_bound.empty(); - - // Check if bad values were obtained - if (is_incomplete) - return ret; - - double toplevel_sum = - (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); - // Copied from compute_toplevel - double backend_bound = (v_be_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); - - double memory_bound = - (v_memory_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); - double core_bound = std::max(0.0, backend_bound - memory_bound); - - // Add toplevel metrics to vector of Entry - ret.reserve(2); - ret.push_back(Entry(m_result_attrs["memory_bound"], Variant(std::max(memory_bound, 0.0)))); - ret.push_back(Entry(m_result_attrs["core_bound"], Variant(std::max(core_bound, 0.0)))); - - return ret; - } - - virtual std::size_t get_num_expected_backend_bound() const override { return 2; } - - virtual std::vector compute_frontend_bound(const std::vector& rec) override - { - std::vector ret; - - // Get PAPI metrics for toplevel calculations - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); - Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); - Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); - Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_int_misc_uop_dropping = get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); - Variant v_fetch_latency = get_val_from_rec(rec, "perf_raw::r8600"); - - // Check if any Variant is empty (use .empty()) - bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || v_bad_spec.empty() || v_retiring.empty() - || v_int_misc_uop_dropping.empty() || v_slots_or_info_thread_slots.empty() - || v_fetch_latency.empty(); - - // Check if bad values were obtained - if 
(is_incomplete) - return ret; - - double toplevel_sum = - (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); - // Copied from compute_toplevel - double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) - - (v_int_misc_uop_dropping.to_double() / v_slots_or_info_thread_slots.to_double()); - - double fetch_latency = (v_fetch_latency.to_double() / toplevel_sum) - - (v_int_misc_uop_dropping.to_double() / v_slots_or_info_thread_slots.to_double()); - - double fetch_bandwidth = std::max(0.0, frontend_bound - fetch_latency); - - // Add toplevel metrics to vector of Entry - ret.reserve(2); - ret.push_back(Entry(m_result_attrs["frontend_latency"], Variant(std::max(fetch_latency, 0.0)))); - ret.push_back(Entry(m_result_attrs["frontend_bandwidth"], Variant(std::max(fetch_bandwidth, 0.0)))); - - return ret; - } - - virtual std::size_t get_num_expected_frontend_bound() const override { return 2; } - - virtual std::vector compute_bad_speculation(const std::vector& rec) override - { - std::vector ret; - - // Get PAPI metrics for toplevel calculations - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); - Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); - Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); - Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_int_misc_uop_dropping = get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); - Variant v_branch_mispredict = get_val_from_rec(rec, "perf_raw::r8500"); - - // Check if any Variant is empty (use .empty()) - bool is_incomplete = v_fe_bound.empty() || v_be_bound.emtpy() || v_bad_spec.empty() || v_retiring.empty() - || v_int_misc_uop_dropping.empty() || v_slots_or_info_thread_slots.empty() - || v_branch_mispredict.empty(); - - // Check if bad values were obtained - if (is_incomplete) - return ret; - - // Perform 
toplevel calcs - double toplevel_sum = - (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); - - double retiring = (v_retiring.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); - double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) - - (v_int_misc_uop_dropping.to_double() / v_slots_or_info_thread_slots.to_double()); - double backend_bound = (v_be_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); - double bad_speculation = std::max(1.0 - (frontend_bound + backend_bound + retiring), 0.0); - - double branch_mispredict = - (v_branch_mispredict.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); - double machine_clears = std::max(0.0, bad_speculation - branch_mispredict); - - // Add toplevel metrics to vector of Entry - ret.reserve(2); - ret.push_back(Entry(m_result_attrs["branch_mispredict"], Variant(std::max(branch_mispredict, 0.0)))); - ret.push_back(Entry(m_result_attrs["machine_clears"], Variant(std::max(machine_clears, 0.0)))); - - return ret; - } - - virtual std::size_t get_num_expected_bad_speculation() const override { return 2; } -}; - class IntelTopdown { unsigned num_top_computed; @@ -633,9 +40,9 @@ class IntelTopdown unsigned num_ret_computed; unsigned num_ret_skipped; - IntelTopdownLevel m_level; + cali::topdown::IntelTopdownLevel m_level; - TopdownCalculator* m_calculator; + cali::topdown::TopdownCalculator* m_calculator; bool find_counter_attrs(CaliperMetadataAccessInterface& db) { return m_calculator->find_counter_attrs(db); } @@ -715,7 +122,7 @@ class IntelTopdown } } - explicit IntelTopdown(TopdownCalculator* calculator) + explicit IntelTopdown(cali::topdown::TopdownCalculator* calculator) : num_top_computed(0), num_top_skipped(0), num_be_computed(0), @@ -737,11 +144,9 @@ class IntelTopdown public: - static const char* s_spec; - static void intel_topdown_register(Caliper* c, Channel* channel) { - 
IntelTopdownLevel level = Top; + cali::topdown::IntelTopdownLevel level = Top; auto config = services::init_config_from_spec(channel->config(), s_spec); std::string lvlcfg = config.get("level").to_string(); @@ -754,14 +159,14 @@ class IntelTopdown return; } - TopdownCalculator* calculator; + cali::topdown::TopdownCalculator* calculator; #if defined(CALIPER_HAVE_ARCH) if (std::string(CALIPER_HAVE_ARCH) == "sapphirerapids") { - calculator = new SapphireRapidsTopdown(level); + calculator = new cali::topdown::SapphireRapidsTopdown(level); } else { #endif - calculator = new HaswellTopdown(level); // Default type of calculation + calculator = new cali::topdown::HaswellTopdown(level); // Default type of calculation #if defined(CALIPER_HAVE_ARCH) } #endif diff --git a/src/services/topdown/SapphireRapidsTopdown.cpp b/src/services/topdown/SapphireRapidsTopdown.cpp new file mode 100644 index 00000000..457489b3 --- /dev/null +++ b/src/services/topdown/SapphireRapidsTopdown.cpp @@ -0,0 +1,299 @@ +#include "SapphireRapidsTopdown.h" + +#include + +namespace cali { +namespace topdown { + +SapphireRapidsTopdown::SapphireRapidsTopdown(IntelTopdownLevel level) + : cali::topdown::TopdownCalculator( + level, + // top_counters + "perf::slots" + ",perf::topdown-retiring" + ",perf::topdown-bad-spec" + ",perf::topdown-fe-bound" + ",perf::topdown-be-bound" + ",INT_MISC:UOP_DROPPING", + // all_counters + "perf::slots" + ",perf::topdown-retiring" + ",perf::topdown-bad-spec" + ",perf::topdown-fe-bound" + ",perf::topdown-be-bound" + ",INT_MISC:UOP_DROPPING" + ",perf_raw::r8400" // topdown-heavy-ops + ",perf_raw::r8500" // topdown-br-mispredict + ",perf_raw::r8600" // topdown-fetch-lat + ",perf_raw::r8700", // topdown-mem-bound + // res_top + {"retiring", "backend_bound", "frontend_bound", "bad_speculation"}, + // res_all + {"retiring", "backend_bound", "frontend_bound", "bad_speculation", + "branch_mispredict", "machine_clears", "frontend_latency", + "frontend_bandwidth", "memory_bound", 
"core_bound", "light_ops", + "heavy_ops"}) {} + +std::vector +SapphireRapidsTopdown::compute_toplevel(const std::vector &rec) { + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_int_misc_uop_dropping = + get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || + v_bad_spec.empty() || v_retiring.empty() || + v_int_misc_uop_dropping.empty() || + v_slots_or_info_thread_slots.empty(); + // Check if all Variants are greater than 0 when casted to doubles (use + // .to_double()) + bool is_nonzero = + v_fe_bound.to_double() > 0.0 && v_be_bound.to_double() > 0.0 && + v_bad_spec.to_double() > 0.0 && v_retiring.to_double() > 0.0 && + v_int_misc_uop_dropping.to_double() > 0.0 && + v_slots_or_info_thread_slots.to_double() > 0.0; + + // Check if bad values were obtained + if (is_incomplete || !is_nonzero) + return ret; + + // Perform toplevel calcs + double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + + v_fe_bound.to_double() + v_be_bound.to_double()); + + double retiring = (v_retiring.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) - + (v_int_misc_uop_dropping.to_double() / + v_slots_or_info_thread_slots.to_double()); + double backend_bound = (v_be_bound.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + double bad_speculation = + std::max(1.0 - (frontend_bound + backend_bound + retiring), 0.0); + + // Add toplevel metrics to vector 
of Entry + ret.reserve(4); + ret.push_back( + Entry(m_result_attrs["retiring"], Variant(std::max(retiring, 0.0)))); + ret.push_back(Entry(m_result_attrs["backend_bound"], + Variant(std::max(backend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["frontend_bound"], + Variant(std::max(frontend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["bad_speculation"], + Variant(std::max(bad_speculation, 0.0)))); + + return ret; +} + +std::size_t SapphireRapidsTopdown::get_num_expected_toplevel() const { + return 4; +} + +std::vector +SapphireRapidsTopdown::compute_retiring(const std::vector &rec) { + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_heavy_ops = get_val_from_rec(rec, "perf_raw::r8400"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || + v_bad_spec.empty() || v_retiring.empty() || + v_slots_or_info_thread_slots.empty() || + v_heavy_ops.empty(); + + // Check if bad values were obtained + if (is_incomplete) + return ret; + + double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + + v_fe_bound.to_double() + v_be_bound.to_double()); + // Copied from compute_toplevel + double retiring = (v_retiring.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + + double heavy_ops = (v_heavy_ops.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + double light_ops = std::max(0.0, retiring - heavy_ops); + + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back( + Entry(m_result_attrs["heavy_ops"], 
Variant(std::max(heavy_ops, 0.0)))); + ret.push_back( + Entry(m_result_attrs["light_ops"], Variant(std::max(light_ops, 0.0)))); + + return ret; +} + +std::size_t SapphireRapidsTopdown::get_num_expected_retiring() const { + return 2; +} + +std::vector +SapphireRapidsTopdown::compute_backend_bound(const std::vector &rec) { + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_memory_bound = get_val_from_rec(rec, "perf_raw::r8700"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || + v_bad_spec.empty() || v_retiring.empty() || + v_slots_or_info_thread_slots.empty() || + v_memory_bound.empty(); + + // Check if bad values were obtained + if (is_incomplete) + return ret; + + double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + + v_fe_bound.to_double() + v_be_bound.to_double()); + // Copied from compute_toplevel + double backend_bound = (v_be_bound.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + + double memory_bound = (v_memory_bound.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + double core_bound = std::max(0.0, backend_bound - memory_bound); + + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["memory_bound"], + Variant(std::max(memory_bound, 0.0)))); + ret.push_back( + Entry(m_result_attrs["core_bound"], Variant(std::max(core_bound, 0.0)))); + + return ret; +} + +std::size_t SapphireRapidsTopdown::get_num_expected_backend_bound() const { + return 2; +} + +std::vector 
+SapphireRapidsTopdown::compute_frontend_bound(const std::vector &rec) { + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_int_misc_uop_dropping = + get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); + Variant v_fetch_latency = get_val_from_rec(rec, "perf_raw::r8600"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = + v_fe_bound.empty() || v_be_bound.empty() || v_bad_spec.empty() || + v_retiring.empty() || v_int_misc_uop_dropping.empty() || + v_slots_or_info_thread_slots.empty() || v_fetch_latency.empty(); + + // Check if bad values were obtained + if (is_incomplete) + return ret; + + double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + + v_fe_bound.to_double() + v_be_bound.to_double()); + // Copied from compute_toplevel + double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) - + (v_int_misc_uop_dropping.to_double() / + v_slots_or_info_thread_slots.to_double()); + + double fetch_latency = (v_fetch_latency.to_double() / toplevel_sum) - + (v_int_misc_uop_dropping.to_double() / + v_slots_or_info_thread_slots.to_double()); + + double fetch_bandwidth = std::max(0.0, frontend_bound - fetch_latency); + + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["frontend_latency"], + Variant(std::max(fetch_latency, 0.0)))); + ret.push_back(Entry(m_result_attrs["frontend_bandwidth"], + Variant(std::max(fetch_bandwidth, 0.0)))); + + return ret; +} + +std::size_t SapphireRapidsTopdown::get_num_expected_frontend_bound() const { + return 2; +} + +std::vector 
+SapphireRapidsTopdown::compute_bad_speculation(const std::vector &rec) { + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_int_misc_uop_dropping = + get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); + Variant v_branch_mispredict = get_val_from_rec(rec, "perf_raw::r8500"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = + v_fe_bound.empty() || v_be_bound.empty() || v_bad_spec.empty() || + v_retiring.empty() || v_int_misc_uop_dropping.empty() || + v_slots_or_info_thread_slots.empty() || v_branch_mispredict.empty(); + + // Check if bad values were obtained + if (is_incomplete) + return ret; + + // Perform toplevel calcs + double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + + v_fe_bound.to_double() + v_be_bound.to_double()); + + double retiring = (v_retiring.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) - + (v_int_misc_uop_dropping.to_double() / + v_slots_or_info_thread_slots.to_double()); + double backend_bound = (v_be_bound.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + double bad_speculation = + std::max(1.0 - (frontend_bound + backend_bound + retiring), 0.0); + + double branch_mispredict = (v_branch_mispredict.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + double machine_clears = std::max(0.0, bad_speculation - branch_mispredict); + + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["branch_mispredict"], + 
Variant(std::max(branch_mispredict, 0.0)))); + ret.push_back(Entry(m_result_attrs["machine_clears"], + Variant(std::max(machine_clears, 0.0)))); + + return ret; +} + +std::size_t SapphireRapidsTopdown::get_num_expected_bad_speculation() const { + return 2; +} + +} // namespace topdown +} // namespace cali \ No newline at end of file diff --git a/src/services/topdown/SapphireRapidsTopdown.h b/src/services/topdown/SapphireRapidsTopdown.h new file mode 100644 index 00000000..5038305b --- /dev/null +++ b/src/services/topdown/SapphireRapidsTopdown.h @@ -0,0 +1,44 @@ +#ifndef CALI_TOPDOWN_SAPPHIRE_RAPIDS_TOPDOWN_H +#define CALI_TOPDOWN_SAPPHIRE_RAPIDS_TOPDOWN_H + +#include "TopdownCalculator.h" + +namespace cali { +namespace topdown { + +class SapphireRapidsTopdown : public TopdownCalculator { +public: + SapphireRapidsTopdown(IntelTopdownLevel level); + + virtual ~SapphireRapidsTopdown() = default; + + virtual std::vector + compute_toplevel(const std::vector &rec) override; + + virtual std::size_t get_num_expected_toplevel() const override; + + virtual std::vector + compute_retiring(const std::vector &rec) override; + + virtual std::size_t get_num_expected_retiring() const override; + + virtual std::vector + compute_backend_bound(const std::vector &rec) override; + + virtual std::size_t get_num_expected_backend_bound() const override; + + virtual std::vector + compute_frontend_bound(const std::vector &rec) override; + + virtual std::size_t get_num_expected_frontend_bound() const override; + + virtual std::vector + compute_bad_speculation(const std::vector &rec) override; + + virtual std::size_t get_num_expected_bad_speculation() const override; +}; + +} // namespace topdown +} // namespace cali + +#endif /* CALI_TOPDOWN_SAPPHIRE_RAPIDS_TOPDOWN_H */ \ No newline at end of file diff --git a/src/services/topdown/TopdownCalculator.cpp b/src/services/topdown/TopdownCalculator.cpp new file mode 100644 index 00000000..d1c8909b --- /dev/null +++ 
b/src/services/topdown/TopdownCalculator.cpp @@ -0,0 +1,90 @@ +#include "TopdownCalculator.h" + +#include "caliper/common/Log.h" + +#include + +namespace cali { +namespace topdown { + +Variant TopdownCalculator::get_val_from_rec(const std::vector &rec, + const char *name) { + Variant ret; + + auto c_it = m_counter_attrs.find(name); + if (c_it == m_counter_attrs.end()) + return ret; + + cali_id_t attr_id = c_it->second.id(); + + auto it = std::find_if(rec.begin(), rec.end(), [attr_id](const Entry &e) { + return e.attribute() == attr_id; + }); + + if (it != rec.end()) + ret = it->value(); + else + ++m_counters_not_found[std::string(name)]; + + return ret; +} + +TopdownCalculator::TopdownCalculator(IntelTopdownLevel level, + const char *top_counters, + const char *all_counters, + std::vector &&res_top, + std::vector &&res_all) + : m_level(level), m_top_counters(top_counters), + m_all_counters(all_counters), m_res_top(res_top), m_res_all(res_all) {} + +TopdownCalculator::TopdownCalculator(IntelTopdownLevel level) + : m_level(level), m_top_counters(""), m_all_counters("") {} // empty counter lists instead of indeterminate pointers read by find_counter_attrs()/get_counters() + +bool TopdownCalculator::find_counter_attrs(CaliperMetadataAccessInterface &db) { + const char *list = (m_level == All ? m_all_counters : m_top_counters); + auto counters = StringConverter(list).to_stringlist(); + + for (const auto &s : counters) { + Attribute attr = db.get_attribute(std::string("sum#papi.") + s); + + if (attr == Attribute::invalid) + attr = db.get_attribute(std::string("papi.") + s); + if (attr == Attribute::invalid) { + Log(0).stream() << "topdown: " << s << " counter attribute not found!" + << std::endl; + return false; + } + + m_counter_attrs[s] = attr; + } + + return true; +} + +void TopdownCalculator::make_result_attrs(CaliperMetadataAccessInterface &db) { + std::vector &res = (m_level == Top ?
m_res_top : m_res_all); + + for (const char *s : res) { + m_result_attrs[std::string(s)] = + db.create_attribute(std::string("topdown.") + s, CALI_TYPE_DOUBLE, + CALI_ATTR_ASVALUE | CALI_ATTR_SKIP_EVENTS); + } +} + +const std::map & +TopdownCalculator::get_counters_not_found() const { + return m_counters_not_found; +} + +const char *TopdownCalculator::get_counters() const { + if (m_level == All) { + return m_all_counters; + } else { + return m_top_counters; + } +} + +IntelTopdownLevel TopdownCalculator::get_level() const { return m_level; } + +} // namespace topdown +} // namespace cali \ No newline at end of file diff --git a/src/services/topdown/TopdownCalculator.h b/src/services/topdown/TopdownCalculator.h new file mode 100644 index 00000000..eb40043a --- /dev/null +++ b/src/services/topdown/TopdownCalculator.h @@ -0,0 +1,80 @@ +#ifndef CALI_TOPDOWN_TOPDOWN_CALCULATOR_H +#define CALI_TOPDOWN_TOPDOWN_CALCULATOR_H + +#include "caliper/Caliper.h" + +#include +#include + +namespace cali { +namespace topdown { + +enum IntelTopdownLevel { All = 1, Top = 2 }; + +class TopdownCalculator { +protected: + IntelTopdownLevel m_level; + + const char *m_top_counters; + const char *m_all_counters; + + std::vector m_res_top; + std::vector m_res_all; + + std::map m_counter_attrs; + std::map m_result_attrs; + + std::map m_counters_not_found; + + Variant get_val_from_rec(const std::vector &rec, const char *name); + + TopdownCalculator(IntelTopdownLevel level, const char *top_counters, + const char *all_counters, + std::vector &&res_top, + std::vector &&res_all); + +public: + TopdownCalculator(IntelTopdownLevel level); + + virtual ~TopdownCalculator() = default; + + virtual std::vector + compute_toplevel(const std::vector &rec) = 0; + + virtual std::size_t get_num_expected_toplevel() const = 0; + + virtual std::vector + compute_retiring(const std::vector &rec) = 0; + + virtual std::size_t get_num_expected_retiring() const = 0; + + virtual std::vector + compute_backend_bound(const 
std::vector &rec) = 0; + + virtual std::size_t get_num_expected_backend_bound() const = 0; + + virtual std::vector + compute_frontend_bound(const std::vector &rec) = 0; + + virtual std::size_t get_num_expected_frontend_bound() const = 0; + + virtual std::vector + compute_bad_speculation(const std::vector &rec) = 0; + + virtual std::size_t get_num_expected_bad_speculation() const = 0; + + bool find_counter_attrs(CaliperMetadataAccessInterface &db); + + void make_result_attrs(CaliperMetadataAccessInterface &db); + + const std::map &get_counters_not_found() const; + + const char *get_counters() const; + + IntelTopdownLevel get_level() const; +}; + +} // namespace topdown +} // namespace cali + +#endif /* CALI_TOPDOWN_TOPDOWN_CALCULATOR_H */ \ No newline at end of file From ea6d73eea6abc561dacec43ed13c1c7c188e0105 Mon Sep 17 00:00:00 2001 From: Ian Lumsden Date: Fri, 4 Oct 2024 16:54:24 -0400 Subject: [PATCH 05/11] Adds a 'disable_multiplexing' configuration to the Papi service and use that configuration in the topdown service --- src/services/papi/Papi.cpp | 46 +++++++++++-------- src/services/topdown/HaswellTopdown.cpp | 2 + src/services/topdown/HaswellTopdown.h | 2 + src/services/topdown/IntelTopdown.cpp | 4 ++ .../topdown/SapphireRapidsTopdown.cpp | 4 ++ src/services/topdown/SapphireRapidsTopdown.h | 2 + src/services/topdown/TopdownCalculator.h | 2 + 7 files changed, 44 insertions(+), 18 deletions(-) diff --git a/src/services/papi/Papi.cpp b/src/services/papi/Papi.cpp index ac3f0d6d..eb111af3 100644 --- a/src/services/papi/Papi.cpp +++ b/src/services/papi/Papi.cpp @@ -64,6 +64,7 @@ class PapiService Attribute m_thread_attr; bool m_enable_multiplex; + bool m_disable_multiplex; unsigned m_num_eventsets; unsigned m_num_event_mismatch; @@ -169,21 +170,20 @@ class PapiService int num = static_cast(p.second->codes.size()); - // if (cpi && (num > 4 /* magic number for Intel counter support :-( */ || - // m_enable_multiplex)) { - // if (Log::verbosity() >= 2) - // 
Log(2).stream() << "papi: Initializing multiplex support for - // component " - // << p.first << " (" << cpi->name << ")" - // << std::endl; - - // ret = PAPI_assign_eventset_component(eventset, p.first); - // if (ret != PAPI_OK) - // print_papi_error("PAPI_assign_eventset_component", ret); - // ret = PAPI_set_multiplex(eventset); - // if (ret != PAPI_OK) - // print_papi_error("PAPI_set_multiplex", ret); - // } + if (!m_disable_multiplex && cpi + && (num > 4 /* magic number for Intel counter support :-( */ || m_enable_multiplex)) { + if (Log::verbosity() >= 2) + Log(2).stream() << "papi: Initializing multiplex support for " + "component " + << p.first << " (" << cpi->name << ")" << std::endl; + + ret = PAPI_assign_eventset_component(eventset, p.first); + if (ret != PAPI_OK) + print_papi_error("PAPI_assign_eventset_component", ret); + ret = PAPI_set_multiplex(eventset); + if (ret != PAPI_OK) + print_papi_error("PAPI_set_multiplex", ret); + } ret = PAPI_add_events(eventset, p.second->codes.data(), num); if (ret < 0) { @@ -370,6 +370,7 @@ class PapiService PapiService(Caliper* c, Channel* channel) : m_enable_multiplex(false), + m_disable_multiplex(false), m_num_eventsets(0), m_num_event_mismatch(0), m_num_failed_acquire(0), @@ -450,7 +451,8 @@ class PapiService ++s_num_instances; PapiService* instance = new PapiService(c, channel); - instance->m_enable_multiplex = cfg.get("enable_multiplexing").to_bool(); + instance->m_enable_multiplex = cfg.get("enable_multiplexing").to_bool(); + instance->m_disable_multiplex = cfg.get("disable_multiplexing").to_bool(); if (!(instance->setup_event_info(c, eventlist) && instance->setup_thread_eventsets(c))) { Log(0).stream() << channel->name() << ": papi: Failed to initialize event sets, dropping papi service" @@ -494,13 +496,21 @@ const char* PapiService::s_spec = R"json( "name": "counters", "description": "List of PAPI events to record", "type": "string" - },{ + }, + { "name": "enable_multiplexing", "description": "Always enable
multiplexing", "type": "bool", "value": "False" + }, + { + "name": "disable_multiplexing", + "description": "Always disable multiplexing", + "type": "bool", + "value": "False" } -]} +] +} )json"; } // namespace diff --git a/src/services/topdown/HaswellTopdown.cpp b/src/services/topdown/HaswellTopdown.cpp index a04551c8..f149a6c5 100644 --- a/src/services/topdown/HaswellTopdown.cpp +++ b/src/services/topdown/HaswellTopdown.cpp @@ -39,6 +39,8 @@ HaswellTopdown::HaswellTopdown(IntelTopdownLevel level) "frontend_bandwidth", "memory_bound", "core_bound", "ext_mem_bound", "l1_bound", "l2_bound", "l3_bound"}) {} +bool HaswellTopdown::check_for_disabled_multiplex() const { return false; } + std::vector HaswellTopdown::compute_toplevel(const std::vector &rec) { std::vector ret; diff --git a/src/services/topdown/HaswellTopdown.h b/src/services/topdown/HaswellTopdown.h index 39622d89..5ca0a9be 100644 --- a/src/services/topdown/HaswellTopdown.h +++ b/src/services/topdown/HaswellTopdown.h @@ -12,6 +12,8 @@ class HaswellTopdown : public TopdownCalculator { virtual ~HaswellTopdown() = default; + virtual bool check_for_disabled_multiplex() const override; + virtual std::vector compute_toplevel(const std::vector &rec) override; diff --git a/src/services/topdown/IntelTopdown.cpp b/src/services/topdown/IntelTopdown.cpp index 507a7648..04eabf1b 100644 --- a/src/services/topdown/IntelTopdown.cpp +++ b/src/services/topdown/IntelTopdown.cpp @@ -172,6 +172,10 @@ class IntelTopdown #endif channel->config().set("CALI_PAPI_COUNTERS", calculator->get_counters()); + // Some PAPI counters for topdown (particularly on SPR) don't play nice + // with PAPI multiplexing. Ask the TopdownCalculator class whether we need + // to disable multiplexing for the corresponding architecture. 
+ channel->config().set("CALI_PAPI_DISABLE_MULTIPLEXING", calculator->check_for_disabled_multiplex()); if (!cali::services::register_service(c, channel, "papi")) { Log(0).stream() << channel->name() << ": topdown: Unable to register papi service, skipping topdown" diff --git a/src/services/topdown/SapphireRapidsTopdown.cpp b/src/services/topdown/SapphireRapidsTopdown.cpp index 457489b3..1739e144 100644 --- a/src/services/topdown/SapphireRapidsTopdown.cpp +++ b/src/services/topdown/SapphireRapidsTopdown.cpp @@ -34,6 +34,10 @@ SapphireRapidsTopdown::SapphireRapidsTopdown(IntelTopdownLevel level) "frontend_bandwidth", "memory_bound", "core_bound", "light_ops", "heavy_ops"}) {} +bool SapphireRapidsTopdown::check_for_disabled_multiplex() const { + return true; +} + std::vector SapphireRapidsTopdown::compute_toplevel(const std::vector &rec) { std::vector ret; diff --git a/src/services/topdown/SapphireRapidsTopdown.h b/src/services/topdown/SapphireRapidsTopdown.h index 5038305b..8fc75282 100644 --- a/src/services/topdown/SapphireRapidsTopdown.h +++ b/src/services/topdown/SapphireRapidsTopdown.h @@ -12,6 +12,8 @@ class SapphireRapidsTopdown : public TopdownCalculator { virtual ~SapphireRapidsTopdown() = default; + virtual bool check_for_disabled_multiplex() const override; + virtual std::vector compute_toplevel(const std::vector &rec) override; diff --git a/src/services/topdown/TopdownCalculator.h b/src/services/topdown/TopdownCalculator.h index eb40043a..e478d723 100644 --- a/src/services/topdown/TopdownCalculator.h +++ b/src/services/topdown/TopdownCalculator.h @@ -38,6 +38,8 @@ class TopdownCalculator { virtual ~TopdownCalculator() = default; + virtual bool check_for_disabled_multiplex() const = 0; + virtual std::vector compute_toplevel(const std::vector &rec) = 0; From 26f46a39b52a47e7ba3265873fa83c7410d93207 Mon Sep 17 00:00:00 2001 From: Ian Lumsden Date: Mon, 7 Oct 2024 09:19:46 -0700 Subject: [PATCH 06/11] Checks whether PAPI uses rdpmc on SPR in the topdown 
service --- src/services/topdown/CMakeLists.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/services/topdown/CMakeLists.txt b/src/services/topdown/CMakeLists.txt index 78a8bef5..bf5ea84e 100644 --- a/src/services/topdown/CMakeLists.txt +++ b/src/services/topdown/CMakeLists.txt @@ -4,6 +4,21 @@ set(CALIPER_TOPDOWN_SOURCES HaswellTopdown.cpp SapphireRapidsTopdown.cpp) +if (CALIPER_HAVE_ARCH STREQUAL "sapphirerapids") + if (NOT EXISTS ${PAPI_PREFIX}/bin/papi_coponent_avail) + message(WARNING "Cannot check if PAPI uses rdpmc. Note that the topdown service will not work correctly on Sapphire Rapids if rdpmc is enabled. This will be fixed by a future version of PAPI.") + else () + execute_process( + COMMAND ${PAPI_PREFIX}/bin/papi_coponent_avail + OUTPUT_VARIABLE CALIPER_TOPDOWN_PAPI_COMPONENTS + ) + string(FIND ${CALIPER_TOPDOWN_PAPI_COMPONENTS} "Fast counter read (rdpmc): yes" CALIPER_TOPDOWN_PAPI_USES_RDPMC) + if (CALIPER_TOPDOWN_PAPI_USES_RDPMC EQUAL "-1") + message(WARNING "Detected that PAPI uses rdpmc to read counters. The topdown service will not work correctly on Sapphire Rapids if rdpmc is enabled. 
This will be fixed by a future version of PAPI.") + endif () + endif() +endif () + add_library(caliper-topdown OBJECT ${CALIPER_TOPDOWN_SOURCES}) add_service_objlib("caliper-topdown") From 0d98d08ad3fabe73c67fac49cd17add263c91e90 Mon Sep 17 00:00:00 2001 From: Ian Lumsden Date: Tue, 8 Oct 2024 06:53:42 -0700 Subject: [PATCH 07/11] Reworks SPR topdown implementation to use rdpmc-style values instead of raw counter values --- .../topdown/SapphireRapidsTopdown.cpp | 178 ++++++------------ 1 file changed, 60 insertions(+), 118 deletions(-) diff --git a/src/services/topdown/SapphireRapidsTopdown.cpp b/src/services/topdown/SapphireRapidsTopdown.cpp index 1739e144..a7e55bcf 100644 --- a/src/services/topdown/SapphireRapidsTopdown.cpp +++ b/src/services/topdown/SapphireRapidsTopdown.cpp @@ -2,6 +2,21 @@ #include +#define RETIRING_OFFSET 0 +#define BAD_SPEC_OFFSET 1 +#define FE_BOUND_OFFSET 2 +#define BE_BOUND_OFFSET 3 + +#define HEAVY_OPS_OFFSET 4 +#define BR_MISPRED_OFFSET 5 +#define FETCH_LAT_OFFSET 6 +#define MEM_BOUND_OFFSET 7 + +static double get_tma_percent_from_rdpmc_value(uint64_t rdpmc_value, + uint64_t offset) { + return (double)((rdpmc_value >> (offset * 8)) & 0xff) / 0xff; +} + namespace cali { namespace topdown { @@ -10,22 +25,10 @@ SapphireRapidsTopdown::SapphireRapidsTopdown(IntelTopdownLevel level) level, // top_counters "perf::slots" - ",perf::topdown-retiring" - ",perf::topdown-bad-spec" - ",perf::topdown-fe-bound" - ",perf::topdown-be-bound" - ",INT_MISC:UOP_DROPPING", + ",perf::topdown-retiring", // all_counters "perf::slots" - ",perf::topdown-retiring" - ",perf::topdown-bad-spec" - ",perf::topdown-fe-bound" - ",perf::topdown-be-bound" - ",INT_MISC:UOP_DROPPING" - ",perf_raw::r8400" // topdown-heavy-ops - ",perf_raw::r8500" // topdown-br-mispredict - ",perf_raw::r8600" // topdown-fetch-lat - ",perf_raw::r8700", // topdown-mem-bound + ",perf::topdown-retiring", // res_top {"retiring", "backend_bound", "frontend_bound", "bad_speculation"}, // res_all @@ 
-44,43 +47,29 @@ SapphireRapidsTopdown::compute_toplevel(const std::vector &rec) { // Get PAPI metrics for toplevel calculations Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); - Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); - Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); - Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_int_misc_uop_dropping = - get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); + Variant v_tma_metrics = get_val_from_rec(rec, "perf::topdown-retiring"); // Check if any Variant is empty (use .empty()) - bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || - v_bad_spec.empty() || v_retiring.empty() || - v_int_misc_uop_dropping.empty() || - v_slots_or_info_thread_slots.empty(); + bool is_incomplete = + v_tma_metrics.empty() || v_slots_or_info_thread_slots.empty(); // Check if all Variants are greater than 0 when casted to doubles (use // .to_double()) - bool is_nonzero = - v_fe_bound.to_double() > 0.0 && v_be_bound.to_double() > 0.0 && - v_bad_spec.to_double() > 0.0 && v_retiring.to_double() > 0.0 && - v_int_misc_uop_dropping.to_double() > 0.0 && - v_slots_or_info_thread_slots.to_double() > 0.0; + bool is_nonzero = v_tma_metrics.to_uint() > 0; // Check if bad values were obtained if (is_incomplete || !is_nonzero) return ret; - // Perform toplevel calcs - double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + - v_fe_bound.to_double() + v_be_bound.to_double()); - - double retiring = (v_retiring.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); - double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) - - (v_int_misc_uop_dropping.to_double() / - v_slots_or_info_thread_slots.to_double()); - double backend_bound = (v_be_bound.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); + 
uint64_t tma_metric_papi_rdpmc = v_tma_metrics.to_uint(); + + double retiring = + get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, RETIRING_OFFSET); + double frontend_bound = + get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, FE_BOUND_OFFSET); + double backend_bound = + get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, BE_BOUND_OFFSET); double bad_speculation = - std::max(1.0 - (frontend_bound + backend_bound + retiring), 0.0); + get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, BAD_SPEC_OFFSET); // Add toplevel metrics to vector of Entry ret.reserve(4); @@ -106,30 +95,22 @@ SapphireRapidsTopdown::compute_retiring(const std::vector &rec) { // Get PAPI metrics for toplevel calculations Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); - Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); - Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); - Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_heavy_ops = get_val_from_rec(rec, "perf_raw::r8400"); + Variant v_tma_metrics = get_val_from_rec(rec, "perf::topdown-retiring"); // Check if any Variant is empty (use .empty()) - bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || - v_bad_spec.empty() || v_retiring.empty() || - v_slots_or_info_thread_slots.empty() || - v_heavy_ops.empty(); + bool is_incomplete = + v_tma_metrics.empty() || v_slots_or_info_thread_slots.empty(); // Check if bad values were obtained if (is_incomplete) return ret; - double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + - v_fe_bound.to_double() + v_be_bound.to_double()); - // Copied from compute_toplevel - double retiring = (v_retiring.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); + uint64_t tma_metric_papi_rdpmc = v_tma_metrics.to_uint(); - double heavy_ops = (v_heavy_ops.to_double() / 
toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); + double retiring = + get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, RETIRING_OFFSET); + double heavy_ops = + get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, HEAVY_OPS_OFFSET); double light_ops = std::max(0.0, retiring - heavy_ops); // Add toplevel metrics to vector of Entry @@ -152,30 +133,22 @@ SapphireRapidsTopdown::compute_backend_bound(const std::vector &rec) { // Get PAPI metrics for toplevel calculations Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); - Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); - Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); - Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_memory_bound = get_val_from_rec(rec, "perf_raw::r8700"); + Variant v_tma_metrics = get_val_from_rec(rec, "perf::topdown-retiring"); // Check if any Variant is empty (use .empty()) - bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || - v_bad_spec.empty() || v_retiring.empty() || - v_slots_or_info_thread_slots.empty() || - v_memory_bound.empty(); + bool is_incomplete = + v_tma_metrics.empty() || v_slots_or_info_thread_slots.empty(); // Check if bad values were obtained if (is_incomplete) return ret; - double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + - v_fe_bound.to_double() + v_be_bound.to_double()); - // Copied from compute_toplevel - double backend_bound = (v_be_bound.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); + uint64_t tma_metric_papi_rdpmc = v_tma_metrics.to_uint(); - double memory_bound = (v_memory_bound.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); + double backend_bound = + get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, BE_BOUND_OFFSET); + double memory_bound = + 
get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, MEM_BOUND_OFFSET); double core_bound = std::max(0.0, backend_bound - memory_bound); // Add toplevel metrics to vector of Entry @@ -198,35 +171,22 @@ SapphireRapidsTopdown::compute_frontend_bound(const std::vector &rec) { // Get PAPI metrics for toplevel calculations Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); - Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); - Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); - Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_int_misc_uop_dropping = - get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); - Variant v_fetch_latency = get_val_from_rec(rec, "perf_raw::r8600"); + Variant v_tma_metrics = get_val_from_rec(rec, "perf::topdown-retiring"); // Check if any Variant is empty (use .empty()) bool is_incomplete = - v_fe_bound.empty() || v_be_bound.empty() || v_bad_spec.empty() || - v_retiring.empty() || v_int_misc_uop_dropping.empty() || - v_slots_or_info_thread_slots.empty() || v_fetch_latency.empty(); + v_tma_metrics.empty() || v_slots_or_info_thread_slots.empty(); // Check if bad values were obtained if (is_incomplete) return ret; - double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + - v_fe_bound.to_double() + v_be_bound.to_double()); - // Copied from compute_toplevel - double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) - - (v_int_misc_uop_dropping.to_double() / - v_slots_or_info_thread_slots.to_double()); - - double fetch_latency = (v_fetch_latency.to_double() / toplevel_sum) - - (v_int_misc_uop_dropping.to_double() / - v_slots_or_info_thread_slots.to_double()); + uint64_t tma_metric_papi_rdpmc = v_tma_metrics.to_uint(); + double frontend_bound = + get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, FE_BOUND_OFFSET); + double fetch_latency = + 
get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, FETCH_LAT_OFFSET); double fetch_bandwidth = std::max(0.0, frontend_bound - fetch_latency); // Add toplevel metrics to vector of Entry @@ -249,40 +209,22 @@ SapphireRapidsTopdown::compute_bad_speculation(const std::vector &rec) { // Get PAPI metrics for toplevel calculations Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); - Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); - Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); - Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_int_misc_uop_dropping = - get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); - Variant v_branch_mispredict = get_val_from_rec(rec, "perf_raw::r8500"); + Variant v_tma_metrics = get_val_from_rec(rec, "perf::topdown-retiring"); // Check if any Variant is empty (use .empty()) bool is_incomplete = - v_fe_bound.empty() || v_be_bound.empty() || v_bad_spec.empty() || - v_retiring.empty() || v_int_misc_uop_dropping.empty() || - v_slots_or_info_thread_slots.empty() || v_branch_mispredict.empty(); + v_tma_metrics.empty() || v_slots_or_info_thread_slots.empty(); // Check if bad values were obtained if (is_incomplete) return ret; - // Perform toplevel calcs - double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + - v_fe_bound.to_double() + v_be_bound.to_double()); - - double retiring = (v_retiring.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); - double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) - - (v_int_misc_uop_dropping.to_double() / - v_slots_or_info_thread_slots.to_double()); - double backend_bound = (v_be_bound.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); - double bad_speculation = - std::max(1.0 - (frontend_bound + backend_bound + retiring), 0.0); + uint64_t tma_metric_papi_rdpmc 
= v_tma_metrics.to_uint(); - double branch_mispredict = (v_branch_mispredict.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); + double bad_speculation = + get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, BAD_SPEC_OFFSET); + double branch_mispredict = get_tma_percent_from_rdpmc_value( + tma_metric_papi_rdpmc, BR_MISPRED_OFFSET); double machine_clears = std::max(0.0, bad_speculation - branch_mispredict); // Add toplevel metrics to vector of Entry From 28ff918b8bff8e90b439676da1896180e75e7d48 Mon Sep 17 00:00:00 2001 From: Ian Lumsden Date: Tue, 8 Oct 2024 08:26:48 -0700 Subject: [PATCH 08/11] Updates option spec for SPR topdown and adds instruction comments for making new topdown calculators --- src/caliper/controllers/controllers.cpp | 36 ++++------------------ src/services/papi/Papi.cpp | 5 ++- src/services/topdown/CMakeLists.txt | 6 ++-- src/services/topdown/IntelTopdown.cpp | 11 ++++--- src/services/topdown/TopdownCalculator.cpp | 5 +-- src/services/topdown/TopdownCalculator.h | 13 ++++++++ 6 files changed, 34 insertions(+), 42 deletions(-) diff --git a/src/caliper/controllers/controllers.cpp b/src/caliper/controllers/controllers.cpp index 5e70da15..a0c33e73 100644 --- a/src/caliper/controllers/controllers.cpp +++ b/src/caliper/controllers/controllers.cpp @@ -1240,28 +1240,20 @@ const char* builtin_papi_spr_option_specs = R"json( "config" : { "CALI_PAPI_COUNTERS": - "perf::slots,perf::topdown-retiring,perf::topdown-bad-spec,perf::topdown-fe-bound,perf::topdown-be-bound,INT_MISC:UOP_DROPPING" + "perf::slots,perf::topdown-retiring" }, "query" : [ { "level": "local", "select": [ "inclusive_sum(sum#papi.slots) as slots", - "inclusive_sum(sum#papi.perf::topdown-retiring) as topdown_retiring", - "inclusive_sum(sum#papi.perf::topdown-bad-spec) as topdown_bad_spec", - "inclusive_sum(sum#papi.perf::topdown-fe-bound) as topdown_fe_bound", - "inclusive_sum(sum#papi.perf::topdown-be-bound) as topdown_be_bound", - 
"inclusive_sum(sum#papi.INT_MISC:UOP_DROPPING) as int_mist:uop_dropping" + "inclusive_sum(sum#papi.perf::topdown-retiring) as topdown_retiring" ] }, { "level": "cross", "select": [ "sum(inclusive#sum#papi.slots) as slots", - "sum(inclusive#sum#papi.perf::topdown-retiring) as topdown_retiring", - "sum(inclusive#sum#papi.perf::topdown-bad-spec) as topdown_bad_spec", - "sum(inclusive#sum#papi.perf::topdown-fe-bound) as topdown_fe_bound", - "sum(inclusive#sum#papi.perf::topdown-be-bound) as topdown_be_bound", - "sum(inclusive#sum#papi.INT_MISC:UOP_DROPPING) as int_mist:uop_dropping" + "sum(inclusive#sum#papi.perf::topdown-retiring) as topdown_retiring" ] } ] @@ -1275,36 +1267,20 @@ const char* builtin_papi_spr_option_specs = R"json( "config" : { "CALI_PAPI_COUNTERS": - "perf::slots,perf::topdown-retiring,perf::topdown-bad-spec,perf::topdown-fe-bound,perf::topdown-be-bound,INT_MISC:UOP_DROPPING,perf_raw::r8400,perf_raw::r8500,perf_raw::r8600,perf_raw::r8700" + "perf::slots,perf::topdown-retiring" }, "query" : [ { "level": "local", "select": [ "inclusive_sum(sum#papi.slots) as slots", - "inclusive_sum(sum#papi.perf::topdown-retiring) as topdown_retiring", - "inclusive_sum(sum#papi.perf::topdown-bad-spec) as topdown_bad_spec", - "inclusive_sum(sum#papi.perf::topdown-fe-bound) as topdown_fe_bound", - "inclusive_sum(sum#papi.perf::topdown-be-bound) as topdown_be_bound", - "inclusive_sum(sum#papi.INT_MISC:UOP_DROPPING) as int_mist:uop_dropping", - "inclusive_sum(sum#papi.perf_raw::r8400) as topdown_heavy_ops", - "inclusive_sum(sum#papi.perf_raw::r8500) as topdown_br_mispredict", - "inclusive_sum(sum#papi.perf_raw::r8600) as topdown_fetch_lat", - "inclusive_sum(sum#papi.perf_raw::r8700) as topdown_mem_bound" + "inclusive_sum(sum#papi.perf::topdown-retiring) as topdown_retiring" ] }, { "level": "cross", "select": [ "sum(inclusive#sum#papi.slots) as slots", - "sum(inclusive#sum#papi.perf::topdown-retiring) as topdown_retiring", - "sum(inclusive#sum#papi.perf::topdown-bad-spec) 
as topdown_bad_spec", - "sum(inclusive#sum#papi.perf::topdown-fe-bound) as topdown_fe_bound", - "sum(inclusive#sum#papi.perf::topdown-be-bound) as topdown_be_bound", - "sum(inclusive#sum#papi.INT_MISC:UOP_DROPPING) as int_mist:uop_dropping", - "sum(inclusive#sum#papi.perf_raw::r8400) as topdown_heavy_ops", - "sum(inclusive#sum#papi.perf_raw::r8500) as topdown_br_mispredict", - "sum(inclusive#sum#papi.perf_raw::r8600) as topdown_fetch_lat", - "sum(inclusive#sum#papi.perf_raw::r8700) as topdown_mem_bound" + "sum(inclusive#sum#papi.perf::topdown-retiring) as topdown_retiring" ] } ] diff --git a/src/services/papi/Papi.cpp b/src/services/papi/Papi.cpp index eb111af3..ed9ea6ff 100644 --- a/src/services/papi/Papi.cpp +++ b/src/services/papi/Papi.cpp @@ -173,9 +173,8 @@ class PapiService if (!m_disable_multiplex && cpi && (num > 4 /* magic number for Intel counter support :-( */ || m_enable_multiplex)) { if (Log::verbosity() >= 2) - Log(2).stream() << "papi: Initializing multiplex support for - component " - << p.first << " (" << cpi->name << ")" << std::endl; + Log(2).stream() << "papi: Initializing multiplex support for component " << p.first << " (" + << cpi->name << ")" << std::endl; ret = PAPI_assign_eventset_component(eventset, p.first); if (ret != PAPI_OK) diff --git a/src/services/topdown/CMakeLists.txt b/src/services/topdown/CMakeLists.txt index bf5ea84e..d5dd230c 100644 --- a/src/services/topdown/CMakeLists.txt +++ b/src/services/topdown/CMakeLists.txt @@ -1,12 +1,12 @@ set(CALIPER_TOPDOWN_SOURCES IntelTopdown.cpp - TopdownCalulator.cpp + TopdownCalculator.cpp HaswellTopdown.cpp SapphireRapidsTopdown.cpp) if (CALIPER_HAVE_ARCH STREQUAL "sapphirerapids") if (NOT EXISTS ${PAPI_PREFIX}/bin/papi_coponent_avail) - message(WARNING "Cannot check if PAPI uses rdpmc. Note that the topdown service will not work correctly on Sapphire Rapids if rdpmc is enabled. This will be fixed by a future version of PAPI.") + message(WARNING "Cannot check if PAPI uses rdpmc. 
Note that the topdown service will not work correctly on Sapphire Rapids if rdpmc is NOT enabled. This will be fixed by a future version of PAPI.") else () execute_process( COMMAND ${PAPI_PREFIX}/bin/papi_coponent_avail @@ -14,7 +14,7 @@ if (CALIPER_HAVE_ARCH STREQUAL "sapphirerapids") ) string(FIND ${CALIPER_TOPDOWN_PAPI_COMPONENTS} "Fast counter read (rdpmc): yes" CALIPER_TOPDOWN_PAPI_USES_RDPMC) if (CALIPER_TOPDOWN_PAPI_USES_RDPMC EQUAL "-1") - message(WARNING "Detected that PAPI uses rdpmc to read counters. The topdown service will not work correctly on Sapphire Rapids if rdpmc is enabled. This will be fixed by a future version of PAPI.") + message(WARNING "Detected that PAPI does not use rdpmc to read counters. The topdown service will not work correctly on Sapphire Rapids if rdpmc is NOT enabled. This will be fixed by a future version of PAPI.") endif () endif() endif () diff --git a/src/services/topdown/IntelTopdown.cpp b/src/services/topdown/IntelTopdown.cpp index 04eabf1b..840dcef1 100644 --- a/src/services/topdown/IntelTopdown.cpp +++ b/src/services/topdown/IntelTopdown.cpp @@ -59,7 +59,7 @@ class IntelTopdown ++num_top_computed; } - if (m_level == All) { + if (m_level == cali::topdown::All) { result = m_calculator->compute_backend_bound(rec); if (result.size() != m_calculator->get_num_expected_backend_bound()) { @@ -146,13 +146,13 @@ class IntelTopdown static void intel_topdown_register(Caliper* c, Channel* channel) { - cali::topdown::IntelTopdownLevel level = Top; + cali::topdown::IntelTopdownLevel level = cali::topdown::Top; auto config = services::init_config_from_spec(channel->config(), s_spec); std::string lvlcfg = config.get("level").to_string(); if (lvlcfg == "all") { - level = All; + level = cali::topdown::All; } else if (lvlcfg != "top") { Log(0).stream() << channel->name() << ": topdown: Unknown level \"" << lvlcfg << "\", skipping topdown" << std::endl; @@ -175,7 +175,10 @@ class IntelTopdown // Some PAPI counters for topdown (particularly on 
SPR) don't play nice // with PAPI multiplexing. Ask the TopdownCalculator class whether we need // to disable multiplexing for the corresponding architecture. - channel->config().set("CALI_PAPI_DISABLE_MULTIPLEXING", calculator->check_for_disabled_multiplex()); + channel->config().set( + "CALI_PAPI_DISABLE_MULTIPLEXING", + calculator->check_for_disabled_multiplex() ? "true" : "false" + ); if (!cali::services::register_service(c, channel, "papi")) { Log(0).stream() << channel->name() << ": topdown: Unable to register papi service, skipping topdown" diff --git a/src/services/topdown/TopdownCalculator.cpp b/src/services/topdown/TopdownCalculator.cpp index d1c8909b..bbfa386f 100644 --- a/src/services/topdown/TopdownCalculator.cpp +++ b/src/services/topdown/TopdownCalculator.cpp @@ -1,6 +1,7 @@ #include "TopdownCalculator.h" #include "caliper/common/Log.h" +#include "caliper/common/StringConverter.h" #include @@ -47,9 +48,9 @@ bool TopdownCalculator::find_counter_attrs(CaliperMetadataAccessInterface &db) { for (const auto &s : counters) { Attribute attr = db.get_attribute(std::string("sum#papi.") + s); - if (attr == Attribute::invalid) + if (!attr) attr = db.get_attribute(std::string("papi.") + s); - if (attr == Attribute::invalid) { + if (!attr) { Log(0).stream() << "topdown: " << s << " counter attribute not found!" 
<< std::endl; return false; diff --git a/src/services/topdown/TopdownCalculator.h b/src/services/topdown/TopdownCalculator.h index e478d723..9841580e 100644 --- a/src/services/topdown/TopdownCalculator.h +++ b/src/services/topdown/TopdownCalculator.h @@ -6,6 +6,19 @@ #include #include +// clang-format off +/* How to create a new topdown calculation plugin: + * + * Step 1: Create a subclass of this class implementing the calculations for the new + * architecture (see Haswell and SPR as examples) + * Step 2: Edit IntelTopdown::intel_topdown_register in IntelTopdown.cpp with logic for + * creating an instance of your subclass (edits should be made around line 165) + * Step 3: Edit CMakeLists.txt to include the source file for your new subclass + * Step 4: Edit the 'get_builtin_option_specs' function in src/caliper/controllers/controllers.cpp + * to add the appropriate option spec for your architecture in the topdown service + */ +// clang-format on + namespace cali { namespace topdown { From eec5ea906e4d98039f5cbfef44282ff24c5013e6 Mon Sep 17 00:00:00 2001 From: Ian Lumsden Date: Tue, 8 Oct 2024 16:27:47 -0400 Subject: [PATCH 09/11] Adds a CMake flag to let users tell us if PAPI is built to use rdpmc or not --- CMakeLists.txt | 2 + caliper-config.h.in | 3 + src/caliper/controllers/controllers.cpp | 383 ++++++++++++------ src/services/topdown/CMakeLists.txt | 24 +- ...wn.cpp => SapphireRapidsTopdown_rdpmc.cpp} | 0 .../topdown/SapphireRapidsTopdown_read.cpp | 303 ++++++++++++++ 6 files changed, 588 insertions(+), 127 deletions(-) rename src/services/topdown/{SapphireRapidsTopdown.cpp => SapphireRapidsTopdown_rdpmc.cpp} (100%) create mode 100644 src/services/topdown/SapphireRapidsTopdown_read.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 496d4c2d..70f21a9e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,6 +82,8 @@ add_caliper_option(WITH_VARIORUM "Enable Variorum support" FALSE) add_caliper_option(WITH_UMPIRE "Enable Umpire statistics support" FALSE) 
add_caliper_option(WITH_CRAYPAT "Enable CrayPAT region forwarding support" FALSE) add_caliper_option(WITH_LDMS "Enable LDMS forwarder" FALSE) +add_caliper_option(WITH_PAPI_RDPMC "Declare that PAPI is built to use rdpmc for reading counters. Does nothing if PAPI support is not enabled." TRUE) + set(WITH_ARCH "" CACHE STRING "Enable features specific to the provided archspec CPU architecture name") if (NOT WITH_ARCH STREQUAL "") diff --git a/caliper-config.h.in b/caliper-config.h.in index d5602e97..366c407e 100644 --- a/caliper-config.h.in +++ b/caliper-config.h.in @@ -27,6 +27,9 @@ #cmakedefine CALIPER_HAVE_CRAYPAT #cmakedefine CALIPER_HAVE_LDMS #cmakedefine CALIPER_HAVE_ARCH "@CALIPER_HAVE_ARCH@" +#ifdef CALIPER_HAVE_PAPI +#cmakedefine CALIPER_WITH_PAPI_RDPMC +#endif #cmakedefine CALIPER_REDUCED_CONSTEXPR_USAGE diff --git a/src/caliper/controllers/controllers.cpp b/src/caliper/controllers/controllers.cpp index a0c33e73..cbb08bcc 100644 --- a/src/caliper/controllers/controllers.cpp +++ b/src/caliper/controllers/controllers.cpp @@ -1160,132 +1160,289 @@ const char* builtin_papi_hsw_option_specs = R"json( ] )json"; +#ifdef CALIPER_WITH_PAPI_RDPMC const char* builtin_papi_spr_option_specs = R"json( - { - "name" : "topdown.toplevel", - "description" : "Top-down analysis for Intel CPUs (top level)", - "type" : "bool", - "category" : "metric", - "services" : [ "topdown" ], - "config" : { "CALI_TOPDOWN_LEVEL": "top" }, - "query" : +[ + { + "name" : "topdown.toplevel", + "description" : "Top-down analysis for Intel CPUs (top level)", + "type" : "bool", + "category" : "metric", + "services" : [ "topdown" ], + "config" : { "CALI_TOPDOWN_LEVEL": "top" }, + "query" : + [ + { "level": "local", "select": [ - { "level": "local", "select": - [ - "any(topdown.retiring) as \"Retiring\"", - "any(topdown.backend_bound) as \"Backend bound\"", - "any(topdown.frontend_bound) as \"Frontend bound\"", - "any(topdown.bad_speculation) as \"Bad speculation\"" - ] - }, - { "level": "cross", 
"select": - [ - "any(any#topdown.retiring) as \"Retiring\"", - "any(any#topdown.backend_bound) as \"Backend bound\"", - "any(any#topdown.frontend_bound) as \"Frontend bound\"", - "any(any#topdown.bad_speculation) as \"Bad speculation\"" - ] - } + "any(topdown.retiring) as \"Retiring\"", + "any(topdown.backend_bound) as \"Backend bound\"", + "any(topdown.frontend_bound) as \"Frontend bound\"", + "any(topdown.bad_speculation) as \"Bad speculation\"" ] }, - { - "name" : "topdown.all", - "description" : "Top-down analysis for Intel CPUs (all levels)", - "type" : "bool", - "category" : "metric", - "services" : [ "topdown" ], - "config" : { "CALI_TOPDOWN_LEVEL": "all" }, - "query" : + { "level": "cross", "select": + [ + "any(any#topdown.retiring) as \"Retiring\"", + "any(any#topdown.backend_bound) as \"Backend bound\"", + "any(any#topdown.frontend_bound) as \"Frontend bound\"", + "any(any#topdown.bad_speculation) as \"Bad speculation\"" + ] + } + ] + }, + { + "name" : "topdown.all", + "description" : "Top-down analysis for Intel CPUs (all levels)", + "type" : "bool", + "category" : "metric", + "services" : [ "topdown" ], + "config" : { "CALI_TOPDOWN_LEVEL": "all" }, + "query" : + [ + { "level": "local", "select": + [ + "any(topdown.retiring) as \"Retiring\"", + "any(topdown.backend_bound) as \"Backend bound\"", + "any(topdown.frontend_bound) as \"Frontend bound\"", + "any(topdown.bad_speculation) as \"Bad speculation\"", + "any(topdown.branch_mispredict) as \"Branch mispredict\"", + "any(topdown.machine_clears) as \"Machine clears\"", + "any(topdown.frontend_latency) as \"Frontend latency\"", + "any(topdown.frontend_bandwidth) as \"Frontend bandwidth\"", + "any(topdown.memory_bound) as \"Memory bound\"", + "any(topdown.core_bound) as \"Core bound\"", + "any(topdown.light_ops) as \"Light operations\"", + "any(topdown.heavy_ops) as \"Heavy operations\"" + ] + }, + { "level": "cross", "select": + [ + "any(any#topdown.retiring) as \"Retiring\"", + 
"any(any#topdown.backend_bound) as \"Backend bound\"", + "any(any#topdown.frontend_bound) as \"Frontend bound\"", + "any(any#topdown.bad_speculation) as \"Bad speculation\"", + "any(any#topdown.branch_mispredict) as \"Branch mispredict\"", + "any(any#topdown.machine_clears) as \"Machine clears\"", + "any(any#topdown.frontend_latency) as \"Frontend latency\"", + "any(any#topdown.frontend_bandwidth) as \"Frontend bandwidth\"", + "any(any#topdown.memory_bound) as \"Memory bound\"", + "any(any#topdown.core_bound) as \"Core bound\"", + "any(any#topdown.light_ops) as \"Light operations\"", + "any(any#topdown.heavy_ops) as \"Heavy operations\"" + ] + } + ] + }, + { + "name" : "topdown-counters.toplevel", + "description" : "Raw counter values for Intel top-down analysis (top level)", + "type" : "bool", + "category" : "metric", + "services" : [ "papi" ], + "config" : + { + "CALI_PAPI_COUNTERS": + "perf::slots,perf::topdown-retiring" + }, + "query" : + [ + { "level": "local", "select": [ - { "level": "local", "select": - [ - "any(topdown.retiring) as \"Retiring\"", - "any(topdown.backend_bound) as \"Backend bound\"", - "any(topdown.frontend_bound) as \"Frontend bound\"", - "any(topdown.bad_speculation) as \"Bad speculation\"", - "any(topdown.branch_mispredict) as \"Branch mispredict\"", - "any(topdown.machine_clears) as \"Machine clears\"", - "any(topdown.frontend_latency) as \"Frontend latency\"", - "any(topdown.frontend_bandwidth) as \"Frontend bandwidth\"", - "any(topdown.memory_bound) as \"Memory bound\"", - "any(topdown.core_bound) as \"Core bound\"", - "any(topdown.light_ops) as \"Light operations\"", - "any(topdown.heavy_ops) as \"Heavy operations\"" - ] - }, - { "level": "cross", "select": - [ - "any(any#topdown.retiring) as \"Retiring\"", - "any(any#topdown.backend_bound) as \"Backend bound\"", - "any(any#topdown.frontend_bound) as \"Frontend bound\"", - "any(any#topdown.bad_speculation) as \"Bad speculation\"", - "any(any#topdown.branch_mispredict) as \"Branch 
mispredict\"", - "any(any#topdown.machine_clears) as \"Machine clears\"", - "any(any#topdown.frontend_latency) as \"Frontend latency\"", - "any(any#topdown.frontend_bandwidth) as \"Frontend bandwidth\"", - "any(any#topdown.memory_bound) as \"Memory bound\"", - "any(any#topdown.core_bound) as \"Core bound\"", - "any(any#topdown.light_ops) as \"Light operations\"", - "any(any#topdown.heavy_ops) as \"Heavy operations\"" - ] - } + "inclusive_sum(sum#papi.slots) as slots", + "inclusive_sum(sum#papi.perf::topdown-retiring) as topdown_retiring" ] }, - { - "name" : "topdown-counters.toplevel", - "description" : "Raw counter values for Intel top-down analysis (top level)", - "type" : "bool", - "category" : "metric", - "services" : [ "papi" ], - "config" : - { - "CALI_PAPI_COUNTERS": - "perf::slots,perf::topdown-retiring" - }, - "query" : + { "level": "cross", "select": + [ + "sum(inclusive#sum#papi.slots) as slots", + "sum(inclusive#sum#papi.perf::topdown-retiring) as topdown_retiring" + ] + } + ] + }, + { + "name" : "topdown-counters.all", + "description" : "Raw counter values for Intel top-down analysis (all levels)", + "type" : "bool", + "category" : "metric", + "services" : [ "papi" ], + "config" : + { + "CALI_PAPI_COUNTERS": + "perf::slots,perf::topdown-retiring" + }, + "query" : + [ + { "level": "local", "select": [ - { "level": "local", "select": - [ - "inclusive_sum(sum#papi.slots) as slots", - "inclusive_sum(sum#papi.perf::topdown-retiring) as topdown_retiring" - ] - }, - { "level": "cross", "select": - [ - "sum(inclusive#sum#papi.slots) as slots", - "sum(inclusive#sum#papi.perf::topdown-retiring) as topdown_retiring" - ] - } + "inclusive_sum(sum#papi.slots) as slots", + "inclusive_sum(sum#papi.perf::topdown-retiring) as topdown_retiring" ] }, - { - "name" : "topdown-counters.all", - "description" : "Raw counter values for Intel top-down analysis (all levels)", - "type" : "bool", - "category" : "metric", - "services" : [ "papi" ], - "config" : - { - 
"CALI_PAPI_COUNTERS": - "perf::slots,perf::topdown-retiring" - }, - "query" : + { "level": "cross", "select": [ - { "level": "local", "select": - [ - "inclusive_sum(sum#papi.slots) as slots", - "inclusive_sum(sum#papi.perf::topdown-retiring) as topdown_retiring" - ] - }, - { "level": "cross", "select": - [ - "sum(inclusive#sum#papi.slots) as slots", - "sum(inclusive#sum#papi.perf::topdown-retiring) as topdown_retiring" - ] - } + "sum(inclusive#sum#papi.slots) as slots", + "sum(inclusive#sum#papi.perf::topdown-retiring) as topdown_retiring" ] } - )json"; + ] + } +] +)json"; +#else +const char* builtin_papi_spr_option_specs = R"json( +[ + { + "name" : "topdown.toplevel", + "description" : "Top-down analysis for Intel CPUs (top level)", + "type" : "bool", + "category" : "metric", + "services" : [ "topdown" ], + "config" : { "CALI_TOPDOWN_LEVEL": "top" }, + "query" : + [ + { "level": "local", "select": + [ + "any(topdown.retiring) as \"Retiring\"", + "any(topdown.backend_bound) as \"Backend bound\"", + "any(topdown.frontend_bound) as \"Frontend bound\"", + "any(topdown.bad_speculation) as \"Bad speculation\"" + ] + }, + { "level": "cross", "select": + [ + "any(any#topdown.retiring) as \"Retiring\"", + "any(any#topdown.backend_bound) as \"Backend bound\"", + "any(any#topdown.frontend_bound) as \"Frontend bound\"", + "any(any#topdown.bad_speculation) as \"Bad speculation\"" + ] + } + ] + }, + { + "name" : "topdown.all", + "description" : "Top-down analysis for Intel CPUs (all levels)", + "type" : "bool", + "category" : "metric", + "services" : [ "topdown" ], + "config" : { "CALI_TOPDOWN_LEVEL": "all" }, + "query" : + [ + { "level": "local", "select": + [ + "any(topdown.retiring) as \"Retiring\"", + "any(topdown.backend_bound) as \"Backend bound\"", + "any(topdown.frontend_bound) as \"Frontend bound\"", + "any(topdown.bad_speculation) as \"Bad speculation\"", + "any(topdown.branch_mispredict) as \"Branch mispredict\"", + "any(topdown.machine_clears) as \"Machine 
clears\"", + "any(topdown.frontend_latency) as \"Frontend latency\"", + "any(topdown.frontend_bandwidth) as \"Frontend bandwidth\"", + "any(topdown.memory_bound) as \"Memory bound\"", + "any(topdown.core_bound) as \"Core bound\"", + "any(topdown.light_ops) as \"Light operations\"", + "any(topdown.heavy_ops) as \"Heavy operations\"" + ] + }, + { "level": "cross", "select": + [ + "any(any#topdown.retiring) as \"Retiring\"", + "any(any#topdown.backend_bound) as \"Backend bound\"", + "any(any#topdown.frontend_bound) as \"Frontend bound\"", + "any(any#topdown.bad_speculation) as \"Bad speculation\"", + "any(any#topdown.branch_mispredict) as \"Branch mispredict\"", + "any(any#topdown.machine_clears) as \"Machine clears\"", + "any(any#topdown.frontend_latency) as \"Frontend latency\"", + "any(any#topdown.frontend_bandwidth) as \"Frontend bandwidth\"", + "any(any#topdown.memory_bound) as \"Memory bound\"", + "any(any#topdown.core_bound) as \"Core bound\"", + "any(any#topdown.light_ops) as \"Light operations\"", + "any(any#topdown.heavy_ops) as \"Heavy operations\"" + ] + } + ] + }, + { + "name" : "topdown-counters.toplevel", + "description" : "Raw counter values for Intel top-down analysis (top level)", + "type" : "bool", + "category" : "metric", + "services" : [ "papi" ], + "config" : + { + "CALI_PAPI_COUNTERS": + "perf::slots,perf::topdown-retiring,perf::topdown-bad-spec,perf::topdown-fe-bound,perf::topdown-be-bound,INT_MISC:UOP_DROPPING" + }, + "query" : + [ + { "level": "local", "select": + [ + "inclusive_sum(sum#papi.perf::slots) as slots", + "inclusive_sum(sum#papi.perf::topdown-retiring) as topdown_retiring", + "inclusive_sum(sum#papi.perf::topdown-bad-spec) as topdown_bad_spec", + "inclusive_sum(sum#papi.perf::topdown-fe-bound) as topdown_fe_bound", + "inclusive_sum(sum#papi.perf::topdown-be-bound) as topdown_be_bound", + "inclusive_sum(sum#papi.INT_MISC:UOP_DROPPING) as int_mist:uop_dropping" + ] + }, + { "level": "cross", "select": + [ + 
"sum(inclusive#sum#papi.perf::slots) as slots", + "sum(inclusive#sum#papi.perf::topdown-retiring) as topdown_retiring", + "sum(inclusive#sum#papi.perf::topdown-bad-spec) as topdown_bad_spec", + "sum(inclusive#sum#papi.perf::topdown-fe-bound) as topdown_fe_bound", + "sum(inclusive#sum#papi.perf::topdown-be-bound) as topdown_be_bound", + "sum(inclusive#sum#papi.INT_MISC:UOP_DROPPING) as int_mist:uop_dropping" + ] + } + ] + }, + { + "name" : "topdown-counters.all", + "description" : "Raw counter values for Intel top-down analysis (all levels)", + "type" : "bool", + "category" : "metric", + "services" : [ "papi" ], + "config" : + { + "CALI_PAPI_COUNTERS": + "perf::slots,perf::topdown-retiring,perf::topdown-bad-spec,perf::topdown-fe-bound,perf::topdown-be-bound,INT_MISC:UOP_DROPPING,perf_raw::r8400,perf_raw::r8500,perf_raw::r8600,perf_raw::r8700" + }, + "query" : + [ + { "level": "local", "select": + [ + "inclusive_sum(sum#papi.perf::slots) as slots", + "inclusive_sum(sum#papi.perf::topdown-retiring) as topdown_retiring", + "inclusive_sum(sum#papi.perf::topdown-bad-spec) as topdown_bad_spec", + "inclusive_sum(sum#papi.perf::topdown-fe-bound) as topdown_fe_bound", + "inclusive_sum(sum#papi.perf::topdown-be-bound) as topdown_be_bound", + "inclusive_sum(sum#papi.INT_MISC:UOP_DROPPING) as int_mist:uop_dropping", + "inclusive_sum(sum#papi.perf_raw::r8400) as topdown_heavy_ops", + "inclusive_sum(sum#papi.perf_raw::r8500) as topdown_br_mispredict", + "inclusive_sum(sum#papi.perf_raw::r8600) as topdown_fetch_lat", + "inclusive_sum(sum#papi.perf_raw::r8700) as topdown_mem_bound" + ] + }, + { "level": "cross", "select": + [ + "sum(inclusive#sum#papi.perf::slots) as slots", + "sum(inclusive#sum#papi.perf::topdown-retiring) as topdown_retiring", + "sum(inclusive#sum#papi.perf::topdown-bad-spec) as topdown_bad_spec", + "sum(inclusive#sum#papi.perf::topdown-fe-bound) as topdown_fe_bound", + "sum(inclusive#sum#papi.perf::topdown-be-bound) as topdown_be_bound", + 
"sum(inclusive#sum#papi.INT_MISC:UOP_DROPPING) as int_mist:uop_dropping", + "sum(inclusive#sum#papi.perf_raw::r8400) as topdown_heavy_ops", + "sum(inclusive#sum#papi.perf_raw::r8500) as topdown_br_mispredict", + "sum(inclusive#sum#papi.perf_raw::r8600) as topdown_fetch_lat", + "sum(inclusive#sum#papi.perf_raw::r8700) as topdown_mem_bound" + ] + } + ] + } +] +)json"; +#endif const char* builtin_kokkos_option_specs = R"json( [ diff --git a/src/services/topdown/CMakeLists.txt b/src/services/topdown/CMakeLists.txt index d5dd230c..5fc9c537 100644 --- a/src/services/topdown/CMakeLists.txt +++ b/src/services/topdown/CMakeLists.txt @@ -1,22 +1,18 @@ set(CALIPER_TOPDOWN_SOURCES IntelTopdown.cpp TopdownCalculator.cpp - HaswellTopdown.cpp - SapphireRapidsTopdown.cpp) + HaswellTopdown.cpp) -if (CALIPER_HAVE_ARCH STREQUAL "sapphirerapids") - if (NOT EXISTS ${PAPI_PREFIX}/bin/papi_coponent_avail) - message(WARNING "Cannot check if PAPI uses rdpmc. Note that the topdown service will not work correctly on Sapphire Rapids if rdpmc is NOT enabled. This will be fixed by a future version of PAPI.") - else () - execute_process( - COMMAND ${PAPI_PREFIX}/bin/papi_coponent_avail - OUTPUT_VARIABLE CALIPER_TOPDOWN_PAPI_COMPONENTS - ) - string(FIND ${CALIPER_TOPDOWN_PAPI_COMPONENTS} "Fast counter read (rdpmc): yes" CALIPER_TOPDOWN_PAPI_USES_RDPMC) - if (CALIPER_TOPDOWN_PAPI_USES_RDPMC EQUAL "-1") - message(WARNING "Detected that PAPI does not use rdpmc to read counters. The topdown service will not work correctly on Sapphire Rapids if rdpmc is NOT enabled. 
This will be fixed by a future version of PAPI.") - endif () + +if (CALIPER_WITH_PAPI_RDPMC) + message(STATUS "PAPI uses rdpmc") + if (CALIPER_HAVE_ARCH STREQUAL "sapphirerapids") + message(WARNING "Trying to use rdpmc for topdown on Sapphire Rapids will likely result in invalid values!") endif() + list(APPEND CALIPER_TOPDOWN_SOURCES SapphireRapidsTopdown_rdpmc.cpp) +else() + message(STATUS "PAPI does not use rdpmc") + list(APPEND CALIPER_TOPDOWN_SOURCES SapphireRapidsTopdown_read.cpp) endif () add_library(caliper-topdown OBJECT ${CALIPER_TOPDOWN_SOURCES}) diff --git a/src/services/topdown/SapphireRapidsTopdown.cpp b/src/services/topdown/SapphireRapidsTopdown_rdpmc.cpp similarity index 100% rename from src/services/topdown/SapphireRapidsTopdown.cpp rename to src/services/topdown/SapphireRapidsTopdown_rdpmc.cpp diff --git a/src/services/topdown/SapphireRapidsTopdown_read.cpp b/src/services/topdown/SapphireRapidsTopdown_read.cpp new file mode 100644 index 00000000..1739e144 --- /dev/null +++ b/src/services/topdown/SapphireRapidsTopdown_read.cpp @@ -0,0 +1,303 @@ +#include "SapphireRapidsTopdown.h" + +#include + +namespace cali { +namespace topdown { + +SapphireRapidsTopdown::SapphireRapidsTopdown(IntelTopdownLevel level) + : cali::topdown::TopdownCalculator( + level, + // top_counters + "perf::slots" + ",perf::topdown-retiring" + ",perf::topdown-bad-spec" + ",perf::topdown-fe-bound" + ",perf::topdown-be-bound" + ",INT_MISC:UOP_DROPPING", + // all_counters + "perf::slots" + ",perf::topdown-retiring" + ",perf::topdown-bad-spec" + ",perf::topdown-fe-bound" + ",perf::topdown-be-bound" + ",INT_MISC:UOP_DROPPING" + ",perf_raw::r8400" // topdown-heavy-ops + ",perf_raw::r8500" // topdown-br-mispredict + ",perf_raw::r8600" // topdown-fetch-lat + ",perf_raw::r8700", // topdown-mem-bound + // res_top + {"retiring", "backend_bound", "frontend_bound", "bad_speculation"}, + // res_all + {"retiring", "backend_bound", "frontend_bound", "bad_speculation", + "branch_mispredict", 
"machine_clears", "frontend_latency", + "frontend_bandwidth", "memory_bound", "core_bound", "light_ops", + "heavy_ops"}) {} + +bool SapphireRapidsTopdown::check_for_disabled_multiplex() const { + return true; +} + +std::vector +SapphireRapidsTopdown::compute_toplevel(const std::vector &rec) { + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_int_misc_uop_dropping = + get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || + v_bad_spec.empty() || v_retiring.empty() || + v_int_misc_uop_dropping.empty() || + v_slots_or_info_thread_slots.empty(); + // Check if all Variants are greater than 0 when casted to doubles (use + // .to_double()) + bool is_nonzero = + v_fe_bound.to_double() > 0.0 && v_be_bound.to_double() > 0.0 && + v_bad_spec.to_double() > 0.0 && v_retiring.to_double() > 0.0 && + v_int_misc_uop_dropping.to_double() > 0.0 && + v_slots_or_info_thread_slots.to_double() > 0.0; + + // Check if bad values were obtained + if (is_incomplete || !is_nonzero) + return ret; + + // Perform toplevel calcs + double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + + v_fe_bound.to_double() + v_be_bound.to_double()); + + double retiring = (v_retiring.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) - + (v_int_misc_uop_dropping.to_double() / + v_slots_or_info_thread_slots.to_double()); + double backend_bound = (v_be_bound.to_double() / toplevel_sum) + + (0 * 
v_slots_or_info_thread_slots.to_double()); + double bad_speculation = + std::max(1.0 - (frontend_bound + backend_bound + retiring), 0.0); + + // Add toplevel metrics to vector of Entry + ret.reserve(4); + ret.push_back( + Entry(m_result_attrs["retiring"], Variant(std::max(retiring, 0.0)))); + ret.push_back(Entry(m_result_attrs["backend_bound"], + Variant(std::max(backend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["frontend_bound"], + Variant(std::max(frontend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["bad_speculation"], + Variant(std::max(bad_speculation, 0.0)))); + + return ret; +} + +std::size_t SapphireRapidsTopdown::get_num_expected_toplevel() const { + return 4; +} + +std::vector +SapphireRapidsTopdown::compute_retiring(const std::vector &rec) { + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_heavy_ops = get_val_from_rec(rec, "perf_raw::r8400"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || + v_bad_spec.empty() || v_retiring.empty() || + v_slots_or_info_thread_slots.empty() || + v_heavy_ops.empty(); + + // Check if bad values were obtained + if (is_incomplete) + return ret; + + double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + + v_fe_bound.to_double() + v_be_bound.to_double()); + // Copied from compute_toplevel + double retiring = (v_retiring.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + + double heavy_ops = (v_heavy_ops.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + double light_ops 
= std::max(0.0, retiring - heavy_ops); + + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back( + Entry(m_result_attrs["heavy_ops"], Variant(std::max(heavy_ops, 0.0)))); + ret.push_back( + Entry(m_result_attrs["light_ops"], Variant(std::max(light_ops, 0.0)))); + + return ret; +} + +std::size_t SapphireRapidsTopdown::get_num_expected_retiring() const { + return 2; +} + +std::vector +SapphireRapidsTopdown::compute_backend_bound(const std::vector &rec) { + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_memory_bound = get_val_from_rec(rec, "perf_raw::r8700"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || + v_bad_spec.empty() || v_retiring.empty() || + v_slots_or_info_thread_slots.empty() || + v_memory_bound.empty(); + + // Check if bad values were obtained + if (is_incomplete) + return ret; + + double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + + v_fe_bound.to_double() + v_be_bound.to_double()); + // Copied from compute_toplevel + double backend_bound = (v_be_bound.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + + double memory_bound = (v_memory_bound.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + double core_bound = std::max(0.0, backend_bound - memory_bound); + + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["memory_bound"], + Variant(std::max(memory_bound, 0.0)))); + ret.push_back( + Entry(m_result_attrs["core_bound"], 
Variant(std::max(core_bound, 0.0)))); + + return ret; +} + +std::size_t SapphireRapidsTopdown::get_num_expected_backend_bound() const { + return 2; +} + +std::vector +SapphireRapidsTopdown::compute_frontend_bound(const std::vector &rec) { + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_int_misc_uop_dropping = + get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); + Variant v_fetch_latency = get_val_from_rec(rec, "perf_raw::r8600"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = + v_fe_bound.empty() || v_be_bound.empty() || v_bad_spec.empty() || + v_retiring.empty() || v_int_misc_uop_dropping.empty() || + v_slots_or_info_thread_slots.empty() || v_fetch_latency.empty(); + + // Check if bad values were obtained + if (is_incomplete) + return ret; + + double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + + v_fe_bound.to_double() + v_be_bound.to_double()); + // Copied from compute_toplevel + double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) - + (v_int_misc_uop_dropping.to_double() / + v_slots_or_info_thread_slots.to_double()); + + double fetch_latency = (v_fetch_latency.to_double() / toplevel_sum) - + (v_int_misc_uop_dropping.to_double() / + v_slots_or_info_thread_slots.to_double()); + + double fetch_bandwidth = std::max(0.0, frontend_bound - fetch_latency); + + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["frontend_latency"], + Variant(std::max(fetch_latency, 0.0)))); + ret.push_back(Entry(m_result_attrs["frontend_bandwidth"], + Variant(std::max(fetch_bandwidth, 0.0)))); 
+ + return ret; +} + +std::size_t SapphireRapidsTopdown::get_num_expected_frontend_bound() const { + return 2; +} + +std::vector +SapphireRapidsTopdown::compute_bad_speculation(const std::vector &rec) { + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_int_misc_uop_dropping = + get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); + Variant v_branch_mispredict = get_val_from_rec(rec, "perf_raw::r8500"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = + v_fe_bound.empty() || v_be_bound.empty() || v_bad_spec.empty() || + v_retiring.empty() || v_int_misc_uop_dropping.empty() || + v_slots_or_info_thread_slots.empty() || v_branch_mispredict.empty(); + + // Check if bad values were obtained + if (is_incomplete) + return ret; + + // Perform toplevel calcs + double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + + v_fe_bound.to_double() + v_be_bound.to_double()); + + double retiring = (v_retiring.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) - + (v_int_misc_uop_dropping.to_double() / + v_slots_or_info_thread_slots.to_double()); + double backend_bound = (v_be_bound.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + double bad_speculation = + std::max(1.0 - (frontend_bound + backend_bound + retiring), 0.0); + + double branch_mispredict = (v_branch_mispredict.to_double() / toplevel_sum) + + (0 * v_slots_or_info_thread_slots.to_double()); + double machine_clears = std::max(0.0, bad_speculation - branch_mispredict); + + 
// Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["branch_mispredict"], + Variant(std::max(branch_mispredict, 0.0)))); + ret.push_back(Entry(m_result_attrs["machine_clears"], + Variant(std::max(machine_clears, 0.0)))); + + return ret; +} + +std::size_t SapphireRapidsTopdown::get_num_expected_bad_speculation() const { + return 2; +} + +} // namespace topdown +} // namespace cali \ No newline at end of file From 77755903ce89bf221b1891611a7d4bdc37935924 Mon Sep 17 00:00:00 2001 From: Ian Lumsden Date: Wed, 9 Oct 2024 14:06:20 -0400 Subject: [PATCH 10/11] Disables multiplexing in topdown-counters --- CMakeLists.txt | 4 ++++ src/caliper/controllers/controllers.cpp | 19 ------------------- src/services/topdown/CMakeLists.txt | 2 +- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 70f21a9e..f1dd10d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,6 +84,10 @@ add_caliper_option(WITH_CRAYPAT "Enable CrayPAT region forwarding support" FAL add_caliper_option(WITH_LDMS "Enable LDMS forwarder" FALSE) add_caliper_option(WITH_PAPI_RDPMC "Declare that PAPI is built to use rdpmc for reading counters. Does nothing if PAPI support is not enabled." 
TRUE) +if (WITH_PAPI_RDPMC) + set(CALIPER_WITH_PAPI_RDPMC TRUE) +endif () + set(WITH_ARCH "" CACHE STRING "Enable features specific to the provided archspec CPU architecture name") if (NOT WITH_ARCH STREQUAL "") diff --git a/src/caliper/controllers/controllers.cpp b/src/caliper/controllers/controllers.cpp index cbb08bcc..80598ab2 100644 --- a/src/caliper/controllers/controllers.cpp +++ b/src/caliper/controllers/controllers.cpp @@ -207,25 +207,6 @@ const ConfigManager::ConfigInfo* builtin_controllers_table[] = { &cuda_activity_ &spot_controller_info, nullptr }; -// Compile-time string comparison -// Based on code from: -// https://gist.github.com/ac1dloop/4f7109e8856e5d28e769134bca7d6d7d -constexpr bool const_strcmp(const char* a, const char* b) -{ - // Iterate until one of the strings hits its NULL terminator - for (; *a || *b;) { - // Check if the current characters in the strings are equal - // If not equal, return false - // If equal, progress to the next character in the strings - if (*a++ != *b++) { - return false; - } - } - // If we reach here, every character from the strings were equal, - // so we return true - return true; -} - const char* builtin_base_option_specs = R"json( [ { diff --git a/src/services/topdown/CMakeLists.txt b/src/services/topdown/CMakeLists.txt index 5fc9c537..a5e59717 100644 --- a/src/services/topdown/CMakeLists.txt +++ b/src/services/topdown/CMakeLists.txt @@ -4,7 +4,7 @@ set(CALIPER_TOPDOWN_SOURCES HaswellTopdown.cpp) -if (CALIPER_WITH_PAPI_RDPMC) +if (WITH_PAPI_RDPMC) message(STATUS "PAPI uses rdpmc") if (CALIPER_HAVE_ARCH STREQUAL "sapphirerapids") message(WARNING "Trying to use rdpmc for topdown on Sapphire Rapids will likely result in invalid values!") From f72b2cf7131e6f6c2dad66ff477db680a3322d2e Mon Sep 17 00:00:00 2001 From: Ian Lumsden Date: Wed, 9 Oct 2024 17:07:08 -0400 Subject: [PATCH 11/11] Adds comments describing the expected behavior of the virtual methods in TopdownCalculator --- doc/sphinx/PythonSupport.rst | 45 ++ 
doc/sphinx/build.rst | 246 +++++---- doc/sphinx/index.rst | 1 + src/services/topdown/HaswellTopdown.cpp | 400 +++++++------- src/services/topdown/HaswellTopdown.h | 41 +- src/services/topdown/SapphireRapidsTopdown.h | 41 +- .../topdown/SapphireRapidsTopdown_rdpmc.cpp | 329 ++++++------ .../topdown/SapphireRapidsTopdown_read.cpp | 493 +++++++++--------- src/services/topdown/TopdownCalculator.cpp | 133 ++--- src/services/topdown/TopdownCalculator.h | 103 ++-- 10 files changed, 940 insertions(+), 892 deletions(-) create mode 100644 doc/sphinx/PythonSupport.rst diff --git a/doc/sphinx/PythonSupport.rst b/doc/sphinx/PythonSupport.rst new file mode 100644 index 00000000..cc36928e --- /dev/null +++ b/doc/sphinx/PythonSupport.rst @@ -0,0 +1,45 @@ +Python support +============== + +Caliper provides Python bindings based on `pybind11 `_ +for the annotation and :code:`ConfigManager` APIs. To build Caliper with Python support, enable +the :code:`WITH_PYTHON_BINDINGS` option in the CMake configuration: + +.. code-block:: sh + + $ cmake -DWITH_PYTHON_BINDINGS=On .. + +Using the Python module +----------------------- + +The Python module requires pybind11 and an installation of Python that both supports +pybind11 and provides development headers (e.g., :code:`Python.h`) and libraries +(e.g., :code:`libpython3.8.so`). + +The Caliper Python module is installed in either :code:`lib/pythonX.Y/site-packages/` and/or +:code:`lib64/pythonX.Y/site-packages` in the Caliper installation directory. In these paths, +:code:`X.Y` corresponds to the major and minor version numbers of the Python installation used. +Additionally, :code:`lib/` and :code:`lib64/` will be used in accordance with the configuration +of the Python installed. To better understand the rules for where Python modules are installed, +see `this thread `_ +from the Python Software Foundation Discuss. + +To use the Caliper Python module, simply add the directories above to :code:`PYTHONPATH` or +:code:`sys.path`. 
Note that the module will be automatically added to :code:`PYTHONPATH` when +loading the Caliper package with Spack if the :code:`python` variant is enabled. +The module can then be imported with :code:`import pycaliper`. + +Caliper Python API +------------------ + +The Caliper Python API supports a significant subset of the C and C++ annotation APIs. +The simplest options are the :code:`pycaliper.begin_region()` and :code:`pycaliper.end_region()` +functions. Caliper's Python API also provides the :code:`pycaliper.annotate_function` decorator +as a higher-level way of annotating functions. + +The Python API also supports the Caliper :code:`ConfigManager` API (:doc:`ConfigManagerAPI`). +The example in examples/apps/py-example.py demonstrates the annotation and +:code:`ConfigManager` APIs for Python: + +.. literalinclude:: ../../examples/apps/py-example.py + :language: Python \ No newline at end of file diff --git a/doc/sphinx/build.rst b/doc/sphinx/build.rst index 5c8d34e1..d3179afc 100644 --- a/doc/sphinx/build.rst +++ b/doc/sphinx/build.rst @@ -45,6 +45,9 @@ WITH_CUPTI WITH_FORTRAN Build the Fortran wrappers. +WITH_PYTHON_BINDINGS + Build the Python bindings. + WITH_GOTCHA Enable Gotcha support. Allows pthread, IO, and malloc/free tracking, and enables dynamic wrapping of MPI functions. @@ -75,6 +78,9 @@ WITH_OMPT WITH_PAPI Enable PAPI support. Set PAPI installation dir in PAPI_PREFIX. +WITH_PAPI_RDPMC + Specify that PAPI is built to use :code:`rdpmc` by default for reading counters. + WITH_ROCTX Build adapters to forward Caliper annotations to AMD's roctx annotation API. @@ -92,6 +98,10 @@ WITH_VTUNE Build adapters to forward Caliper annotations to Intel's VTune annotation API. Set Intel ITT API installation dir in ``ITT_PREFIX``. +WITH_ARCH + Specify the architecture for which you are building to enable + architecture-specific functionality (e.g., topdown calculations). + All options are off by default. On Linux, Gotcha is enabled by default.
Linking Caliper programs @@ -144,116 +154,126 @@ Feature and build option overview The following table shows the features, recipes, and services that are enabled with the given Caliper and spack build options. -+----------------+---------------+---------------------------+--------------------+ -| CMake option | Spack option | Enabled features/recipes | Enabled services | -+================+===============+===========================+====================+ -| WITH_ADIAK | +adiak | Import adiak metadata in | adiak_import, | -| | | most config recipes | adiak_export | -+----------------+---------------+---------------------------+--------------------+ -| WITH_MPI | +mpi | - mpi-report recipe | mpi, mpireport | -| | | - profile.mpi, | | -| | | mpi.message.count, | | -| | | mpi.message.size | | -| | | recipe options | | -| | | - Cross-process | | -| | | aggregation | | -+----------------+---------------+---------------------------+--------------------+ -| WITH_PAPI | +papi | - topdown.all, | papi, topdown | -| | | topdown.toplevel, | | -| | | topdown-counters.* | | -| | | recipe options for some | | -| | | x86 systems | | -| | | - PAPI counter collection | | -+----------------+---------------+---------------------------+--------------------+ -| WITH_LIBDW | +libdw | - source.module, | symbollookup | -| | | source.function, | | -| | | source.location | | -| | | recipe options | | -| | | - Symbol name lookup | | -+----------------+---------------+---------------------------+--------------------+ -| WITH_LIBPFM | +libpfm | PerfEvent counter | libpfm | -| | | collection and precise | | -| | | event sampling | | -+----------------+---------------+---------------------------+--------------------+ -| WITH_LIBUNWIND | +libunwind | - callpath option for | callpath | -| | | sample-report and | | -| | | event-trace recipes | | -| | | (requires libdw) | | -| | | - Call stack unwinding | | -+----------------+---------------+---------------------------+--------------------+ -| 
WITH_SAMPLER | +sampler | - sample-report, | sampler | -| | | hatchet-sample-profile | | -| | | recipes | | -| | | - sampling option for | | -| | | event-trace recipe | | -| | | - Linux sampling support | | -+----------------+---------------+---------------------------+--------------------+ -| WITH_CUPTI | +cuda | - cuda-activity-report | cupti, cuptitrace | -| | | cuda-activity-profile | | -| | | recipes | | -| | | - profile.cuda, | | -| | | cuda.gputime, | | -| | | cuda.memcpy recipe | | -| | | options | | -| | | - CUDA API profiling | | -| | | - CUDA activity tracing | | -+----------------+ +---------------------------+--------------------+ -| WITH_NVTX | | - nvtx recipe | nvtx | -| | | - Caliper-to-NVTX region | | -| | | forwarding | | -+----------------+---------------+---------------------------+--------------------+ -| WITH_ROCTRACER | +rocm | - rocm-activity-report, | roctracer | -| | | rocm-activity-profile | | -| | | recipes | | -| | | - profile.hip | | -| | | rocm.gputime, | | -| | | rocm.memcpy recipe | | -| | | options | | -| | | - ROCm/HIP API profiling | | -| | | - ROCm activity tracing | | -+----------------+ +---------------------------+--------------------+ -| WITH_ROCTX | | - roctx recipe | roctx | -| | | - Caliper-to-ROCTX region | | -| | | forwarding | | -+----------------+---------------+---------------------------+--------------------+ -| WITH_OMPT | not available | - openmp-report recipe | ompt | -| | yet | - openmp.times, | | -| | | openmp.threads, | | -| | | openmp.efficiency | | -| | | recipe options | | -| | | - OpenMP tools interface | | -| | | support (CPU only, no | | -| | | target offload) | | -+----------------+---------------+---------------------------+--------------------+ -| WITH_GOTCHA | +gotcha | - io.bytes.*, | io, pthread, | -| | | io.*.bandwidth, | sysalloc | -| | | mem.highwatermark, | | -| | | main_thread_only | | -| | | recipe options | | -| | | - Use Gotcha for MPI | | -| | | MPI function wrapping | | -| | | instead of 
PMPI | | -+----------------+---------------+---------------------------+--------------------+ -| WITH_UMPIRE | not available | umpire.totals, | umpire | -| | yet | umpire.allocators options | | -+----------------+---------------+---------------------------+--------------------+ -| WITH_VARIORUM | +variorum | Read variorum counters | variorum | -+----------------+---------------+---------------------------+--------------------+ -| WITH_PCP | not available | - mem.*.bandwidth, | pcp, pcp.memory | -| | yet | mem.*.bytes recipe | | -| | | options on some LLNL | | -| | | LC systems | | -| | | - Read Performance | | -| | | CoPilot counters | | -+----------------+---------------+---------------------------+--------------------+ -| WITH_VTUNE | not available | Intel ITT API annotation | vtune | -| | yet | forwarding | | -+----------------+---------------+---------------------------+--------------------+ -| WITH_CRAYPAT | not available | HPE CrayPAT API | craypat | -| | yet | annotation forwarding | | -+----------------+---------------+---------------------------+--------------------+ -| WITH_KOKKOS | +kokkos | Enable Kokkos tool API | kokkostime, | -| | | bindings | kokkoslookup | -+----------------+---------------+---------------------------+--------------------+ -| WITH_FORTRAN | +fortran | Enable Fortran API | | -+----------------+---------------+---------------------------+--------------------+ ++----------------------+---------------+---------------+---------------------------+--------------------+ +| CMake option | Default value | Spack option | Enabled features/recipes | Enabled services | ++======================+===============+===============+===========================+====================+ +| WITH_ADIAK | False | +adiak | Import adiak metadata in | adiak_import, | +| | | | most config recipes | adiak_export | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_MPI | False | +mpi | - mpi-report 
recipe | mpi, mpireport | +| | | | - profile.mpi, | | +| | | | mpi.message.count, | | +| | | | mpi.message.size | | +| | | | recipe options | | +| | | | - Cross-process | | +| | | | aggregation | | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_PAPI | False | +papi | - topdown.all, | papi, topdown | +| | | | topdown.toplevel, | | +| | | | topdown-counters.* | | +| | | | recipe options for some | | +| | | | x86 systems | | +| | | | - PAPI counter collection | | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_PAPI_RDPMC | True | not available | Topdown calculations | | +| | | yet | based on different | | +| | | | approaches to reading | | +| | | | counters in PAPI | | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_LIBDW | False | +libdw | - source.module, | symbollookup | +| | | | source.function, | | +| | | | source.location | | +| | | | recipe options | | +| | | | - Symbol name lookup | | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_LIBPFM | False | +libpfm | PerfEvent counter | libpfm | +| | | | collection and precise | | +| | | | event sampling | | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_LIBUNWIND | False | +libunwind | - callpath option for | callpath | +| | | | sample-report and | | +| | | | event-trace recipes | | +| | | | (requires libdw) | | +| | | | - Call stack unwinding | | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_SAMPLER | False | +sampler | - sample-report, | sampler | +| | | | hatchet-sample-profile | | +| | | | recipes | | +| | | | - sampling option for | | +| | | | event-trace recipe | | +| | | | - Linux sampling support | | 
++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_CUPTI | False | +cuda | - cuda-activity-report | cupti, cuptitrace | +| | | | cuda-activity-profile | | +| | | | recipes | | +| | | | - profile.cuda, | | +| | | | cuda.gputime, | | +| | | | cuda.memcpy recipe | | +| | | | options | | +| | | | - CUDA API profiling | | +| | | | - CUDA activity tracing | | ++----------------------+---------------+ +---------------------------+--------------------+ +| WITH_NVTX | False | | - nvtx recipe | nvtx | +| | | | - Caliper-to-NVTX region | | +| | | | forwarding | | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_ROCTRACER | False | +rocm | - rocm-activity-report, | roctracer | +| | | | rocm-activity-profile | | +| | | | recipes | | +| | | | - profile.hip | | +| | | | rocm.gputime, | | +| | | | rocm.memcpy recipe | | +| | | | options | | +| | | | - ROCm/HIP API profiling | | +| | | | - ROCm activity tracing | | ++----------------------+---------------+ +---------------------------+--------------------+ +| WITH_ROCTX | False | | - roctx recipe | roctx | +| | | | - Caliper-to-ROCTX region | | +| | | | forwarding | | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_OMPT | False | not available | - openmp-report recipe | ompt | +| | | yet | - openmp.times, | | +| | | | openmp.threads, | | +| | | | openmp.efficiency | | +| | | | recipe options | | +| | | | - OpenMP tools interface | | +| | | | support (CPU only, no | | +| | | | target offload) | | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_GOTCHA | True on | +gotcha | - io.bytes.*, | io, pthread, | +| | Linux; | | io.*.bandwidth, | sysalloc | +| | False | | mem.highwatermark, | | +| | otherwise | | main_thread_only | | +| | | | recipe options | | +| | | | - Use Gotcha 
for MPI | | +| | | | MPI function wrapping | | +| | | | instead of PMPI | | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_UMPIRE | False | not available | umpire.totals, | umpire | +| | | yet | umpire.allocators options | | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_VARIORUM | False | +variorum | Read variorum counters | variorum | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_PCP | False | not available | - mem.*.bandwidth, | pcp, pcp.memory | +| | | yet | mem.*.bytes recipe | | +| | | | options on some LLNL | | +| | | | LC systems | | +| | | | - Read Performance | | +| | | | CoPilot counters | | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_VTUNE | False | not available | Intel ITT API annotation | vtune | +| | | yet | forwarding | | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_CRAYPAT | False | not available | HPE CrayPAT API | craypat | +| | | yet | annotation forwarding | | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_KOKKOS | True | +kokkos | Enable Kokkos tool API | kokkostime, | +| | | | bindings | kokkoslookup | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_FORTRAN | False | +fortran | Enable Fortran API | | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_PYTHON_BINDINGS | False | +python | Enable Python API | | ++----------------------+---------------+---------------+---------------------------+--------------------+ +| WITH_ARCH | No default | not available | Enable microarchitecture- | | +| | | yet | specific 
features | | ++----------------------+---------------+---------------+---------------------------+--------------------+ diff --git a/doc/sphinx/index.rst b/doc/sphinx/index.rst index acf1dd16..692564f8 100644 --- a/doc/sphinx/index.rst +++ b/doc/sphinx/index.rst @@ -66,6 +66,7 @@ This section lists how-to articles for various use cases. SampleProfiling ThirdPartyTools FortranSupport + PythonSupport Reference documentation ------------------------------- diff --git a/src/services/topdown/HaswellTopdown.cpp b/src/services/topdown/HaswellTopdown.cpp index f149a6c5..f57acd04 100644 --- a/src/services/topdown/HaswellTopdown.cpp +++ b/src/services/topdown/HaswellTopdown.cpp @@ -2,249 +2,231 @@ #include -namespace cali { -namespace topdown { +namespace cali +{ +namespace topdown +{ HaswellTopdown::HaswellTopdown(IntelTopdownLevel level) : cali::topdown::TopdownCalculator( - level, - // top_counters - "CPU_CLK_THREAD_UNHALTED:THREAD_P" - ",IDQ_UOPS_NOT_DELIVERED:CORE" - ",INT_MISC:RECOVERY_CYCLES" - ",UOPS_ISSUED:ANY" - ",UOPS_RETIRED:RETIRE_SLOTS", - // all_counters - "BR_MISP_RETIRED:ALL_BRANCHES" - ",CPU_CLK_THREAD_UNHALTED:THREAD_P" - ",CYCLE_ACTIVITY:CYCLES_NO_EXECUTE" - ",CYCLE_ACTIVITY:STALLS_L1D_PENDING" - ",CYCLE_ACTIVITY:STALLS_L2_PENDING" - ",CYCLE_ACTIVITY:STALLS_LDM_PENDING" - ",IDQ_UOPS_NOT_DELIVERED:CORE" - ",IDQ_UOPS_NOT_DELIVERED:CYCLES_0_UOPS_DELIV_CORE" - ",INT_MISC:RECOVERY_CYCLES" - ",MACHINE_CLEARS:COUNT" - ",MEM_LOAD_UOPS_RETIRED:L3_HIT" - ",MEM_LOAD_UOPS_RETIRED:L3_MISS" - ",UOPS_EXECUTED:CORE_CYCLES_GE_1" - ",UOPS_EXECUTED:CORE_CYCLES_GE_2" - ",UOPS_ISSUED:ANY" - ",UOPS_RETIRED:RETIRE_SLOTS", - // res_top - {"retiring", "backend_bound", "frontend_bound", "bad_speculation"}, - // res_all - {"retiring", "backend_bound", "frontend_bound", "bad_speculation", - "branch_mispredict", "machine_clears", "frontend_latency", - "frontend_bandwidth", "memory_bound", "core_bound", "ext_mem_bound", - "l1_bound", "l2_bound", "l3_bound"}) {} - -bool 
HaswellTopdown::check_for_disabled_multiplex() const { return false; } - -std::vector -HaswellTopdown::compute_toplevel(const std::vector &rec) { - std::vector ret; - - Variant v_cpu_clk_unhalted_thread_p = - get_val_from_rec(rec, "CPU_CLK_THREAD_UNHALTED:THREAD_P"); - Variant v_uops_retired_retire_slots = - get_val_from_rec(rec, "UOPS_RETIRED:RETIRE_SLOTS"); - Variant v_uops_issued_any = get_val_from_rec(rec, "UOPS_ISSUED:ANY"); - Variant v_int_misc_recovery_cycles = - get_val_from_rec(rec, "INT_MISC:RECOVERY_CYCLES"); - Variant v_idq_uops_not_delivered_core = - get_val_from_rec(rec, "IDQ_UOPS_NOT_DELIVERED:CORE"); - - bool is_incomplete = v_cpu_clk_unhalted_thread_p.empty() || - v_uops_retired_retire_slots.empty() || - v_uops_issued_any.empty() || - v_int_misc_recovery_cycles.empty() || - v_idq_uops_not_delivered_core.empty(); - bool is_nonzero = v_cpu_clk_unhalted_thread_p.to_double() > 0.0 && - v_uops_retired_retire_slots.to_double() > 0.0 && - v_uops_issued_any.to_double() > 0.0 && - v_int_misc_recovery_cycles.to_double() > 0.0 && - v_idq_uops_not_delivered_core.to_double() > 0.0; - - double slots = 4.0 * v_cpu_clk_unhalted_thread_p.to_double(); - - if (is_incomplete || !is_nonzero || slots < 1.0) + level, + // top_counters + "CPU_CLK_THREAD_UNHALTED:THREAD_P" + ",IDQ_UOPS_NOT_DELIVERED:CORE" + ",INT_MISC:RECOVERY_CYCLES" + ",UOPS_ISSUED:ANY" + ",UOPS_RETIRED:RETIRE_SLOTS", + // all_counters + "BR_MISP_RETIRED:ALL_BRANCHES" + ",CPU_CLK_THREAD_UNHALTED:THREAD_P" + ",CYCLE_ACTIVITY:CYCLES_NO_EXECUTE" + ",CYCLE_ACTIVITY:STALLS_L1D_PENDING" + ",CYCLE_ACTIVITY:STALLS_L2_PENDING" + ",CYCLE_ACTIVITY:STALLS_LDM_PENDING" + ",IDQ_UOPS_NOT_DELIVERED:CORE" + ",IDQ_UOPS_NOT_DELIVERED:CYCLES_0_UOPS_DELIV_CORE" + ",INT_MISC:RECOVERY_CYCLES" + ",MACHINE_CLEARS:COUNT" + ",MEM_LOAD_UOPS_RETIRED:L3_HIT" + ",MEM_LOAD_UOPS_RETIRED:L3_MISS" + ",UOPS_EXECUTED:CORE_CYCLES_GE_1" + ",UOPS_EXECUTED:CORE_CYCLES_GE_2" + ",UOPS_ISSUED:ANY" + ",UOPS_RETIRED:RETIRE_SLOTS", + // res_top + { 
"retiring", "backend_bound", "frontend_bound", "bad_speculation" }, + // res_all + { "retiring", + "backend_bound", + "frontend_bound", + "bad_speculation", + "branch_mispredict", + "machine_clears", + "frontend_latency", + "frontend_bandwidth", + "memory_bound", + "core_bound", + "ext_mem_bound", + "l1_bound", + "l2_bound", + "l3_bound" } + ) +{} + +bool HaswellTopdown::check_for_disabled_multiplex() const +{ + return false; +} + +std::vector HaswellTopdown::compute_toplevel(const std::vector& rec) +{ + std::vector ret; + + Variant v_cpu_clk_unhalted_thread_p = get_val_from_rec(rec, "CPU_CLK_THREAD_UNHALTED:THREAD_P"); + Variant v_uops_retired_retire_slots = get_val_from_rec(rec, "UOPS_RETIRED:RETIRE_SLOTS"); + Variant v_uops_issued_any = get_val_from_rec(rec, "UOPS_ISSUED:ANY"); + Variant v_int_misc_recovery_cycles = get_val_from_rec(rec, "INT_MISC:RECOVERY_CYCLES"); + Variant v_idq_uops_not_delivered_core = get_val_from_rec(rec, "IDQ_UOPS_NOT_DELIVERED:CORE"); + + bool is_incomplete = v_cpu_clk_unhalted_thread_p.empty() || v_uops_retired_retire_slots.empty() + || v_uops_issued_any.empty() || v_int_misc_recovery_cycles.empty() + || v_idq_uops_not_delivered_core.empty(); + bool is_nonzero = v_cpu_clk_unhalted_thread_p.to_double() > 0.0 && v_uops_retired_retire_slots.to_double() > 0.0 + && v_uops_issued_any.to_double() > 0.0 && v_int_misc_recovery_cycles.to_double() > 0.0 + && v_idq_uops_not_delivered_core.to_double() > 0.0; + + double slots = 4.0 * v_cpu_clk_unhalted_thread_p.to_double(); + + if (is_incomplete || !is_nonzero || slots < 1.0) + return ret; + + double retiring = v_uops_retired_retire_slots.to_double() / slots; + double bad_speculation = (v_uops_issued_any.to_double() - v_uops_retired_retire_slots.to_double() + + 4.0 * v_int_misc_recovery_cycles.to_double()) + / slots; + double frontend_bound = v_idq_uops_not_delivered_core.to_double() / slots; + double backend_bound = 1.0 - (retiring + bad_speculation + frontend_bound); + + ret.reserve(4); + 
ret.push_back(Entry(m_result_attrs["retiring"], Variant(std::max(retiring, 0.0)))); + ret.push_back(Entry(m_result_attrs["backend_bound"], Variant(std::max(backend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["frontend_bound"], Variant(std::max(frontend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["bad_speculation"], Variant(std::max(bad_speculation, 0.0)))); + return ret; +} - double retiring = v_uops_retired_retire_slots.to_double() / slots; - double bad_speculation = - (v_uops_issued_any.to_double() - v_uops_retired_retire_slots.to_double() + - 4.0 * v_int_misc_recovery_cycles.to_double()) / - slots; - double frontend_bound = v_idq_uops_not_delivered_core.to_double() / slots; - double backend_bound = 1.0 - (retiring + bad_speculation + frontend_bound); - - ret.reserve(4); - ret.push_back( - Entry(m_result_attrs["retiring"], Variant(std::max(retiring, 0.0)))); - ret.push_back(Entry(m_result_attrs["backend_bound"], - Variant(std::max(backend_bound, 0.0)))); - ret.push_back(Entry(m_result_attrs["frontend_bound"], - Variant(std::max(frontend_bound, 0.0)))); - ret.push_back(Entry(m_result_attrs["bad_speculation"], - Variant(std::max(bad_speculation, 0.0)))); - - return ret; +std::size_t HaswellTopdown::get_num_expected_toplevel() const +{ + return 4; } -std::size_t HaswellTopdown::get_num_expected_toplevel() const { return 4; } +std::vector HaswellTopdown::compute_retiring(const std::vector& rec) +{ + return {}; +} -std::vector -HaswellTopdown::compute_retiring(const std::vector &rec) { - return {}; +std::size_t HaswellTopdown::get_num_expected_retiring() const +{ + return 0; } -std::size_t HaswellTopdown::get_num_expected_retiring() const { return 0; } - -std::vector -HaswellTopdown::compute_backend_bound(const std::vector &rec) { - std::vector ret; - - Variant v_cpu_clk_unhalted_thread_p = - get_val_from_rec(rec, "CPU_CLK_THREAD_UNHALTED:THREAD_P"); - Variant v_cycle_activity_stalls_ldm_pending = - get_val_from_rec(rec, 
"CYCLE_ACTIVITY:STALLS_LDM_PENDING"); - Variant v_cycle_activity_cycles_no_execute = - get_val_from_rec(rec, "CYCLE_ACTIVITY:CYCLES_NO_EXECUTE"); - Variant v_uops_executed_core_cycles_ge_1 = - get_val_from_rec(rec, "UOPS_EXECUTED:CORE_CYCLES_GE_1"); - Variant v_uops_executed_core_cycles_ge_2 = - get_val_from_rec(rec, "UOPS_EXECUTED:CORE_CYCLES_GE_2"); - Variant v_mem_load_uops_retired_l3_miss = - get_val_from_rec(rec, "MEM_LOAD_UOPS_RETIRED:L3_MISS"); - Variant v_mem_load_uops_retired_l3_hit = - get_val_from_rec(rec, "MEM_LOAD_UOPS_RETIRED:L3_HIT"); - Variant v_cycle_activity_stalls_l2_pending = - get_val_from_rec(rec, "CYCLE_ACTIVITY:STALLS_L2_PENDING"); - Variant v_cycle_activity_stalls_l1d_pending = - get_val_from_rec(rec, "CYCLE_ACTIVITY:STALLS_L1D_PENDING"); - - bool is_incomplete = v_cpu_clk_unhalted_thread_p.empty() || - v_cycle_activity_stalls_ldm_pending.empty() || - v_cycle_activity_cycles_no_execute.empty() || - v_uops_executed_core_cycles_ge_1.empty() || - v_uops_executed_core_cycles_ge_2.empty() || - v_mem_load_uops_retired_l3_miss.empty() || - v_mem_load_uops_retired_l3_hit.empty() || - v_cycle_activity_stalls_l2_pending.empty() || - v_cycle_activity_stalls_l1d_pending.empty(); - - double clocks = v_cpu_clk_unhalted_thread_p.to_double(); - - if (is_incomplete || !(clocks > 1.0)) - return ret; +std::vector HaswellTopdown::compute_backend_bound(const std::vector& rec) +{ + std::vector ret; + + Variant v_cpu_clk_unhalted_thread_p = get_val_from_rec(rec, "CPU_CLK_THREAD_UNHALTED:THREAD_P"); + Variant v_cycle_activity_stalls_ldm_pending = get_val_from_rec(rec, "CYCLE_ACTIVITY:STALLS_LDM_PENDING"); + Variant v_cycle_activity_cycles_no_execute = get_val_from_rec(rec, "CYCLE_ACTIVITY:CYCLES_NO_EXECUTE"); + Variant v_uops_executed_core_cycles_ge_1 = get_val_from_rec(rec, "UOPS_EXECUTED:CORE_CYCLES_GE_1"); + Variant v_uops_executed_core_cycles_ge_2 = get_val_from_rec(rec, "UOPS_EXECUTED:CORE_CYCLES_GE_2"); + Variant v_mem_load_uops_retired_l3_miss = 
get_val_from_rec(rec, "MEM_LOAD_UOPS_RETIRED:L3_MISS"); + Variant v_mem_load_uops_retired_l3_hit = get_val_from_rec(rec, "MEM_LOAD_UOPS_RETIRED:L3_HIT"); + Variant v_cycle_activity_stalls_l2_pending = get_val_from_rec(rec, "CYCLE_ACTIVITY:STALLS_L2_PENDING"); + Variant v_cycle_activity_stalls_l1d_pending = get_val_from_rec(rec, "CYCLE_ACTIVITY:STALLS_L1D_PENDING"); + + bool is_incomplete = v_cpu_clk_unhalted_thread_p.empty() || v_cycle_activity_stalls_ldm_pending.empty() + || v_cycle_activity_cycles_no_execute.empty() || v_uops_executed_core_cycles_ge_1.empty() + || v_uops_executed_core_cycles_ge_2.empty() || v_mem_load_uops_retired_l3_miss.empty() + || v_mem_load_uops_retired_l3_hit.empty() || v_cycle_activity_stalls_l2_pending.empty() + || v_cycle_activity_stalls_l1d_pending.empty(); + + double clocks = v_cpu_clk_unhalted_thread_p.to_double(); + + if (is_incomplete || !(clocks > 1.0)) + return ret; + + double memory_bound = v_cycle_activity_stalls_ldm_pending.to_double() / clocks; + double be_bound_at_exe = + (v_cycle_activity_cycles_no_execute.to_double() + v_uops_executed_core_cycles_ge_1.to_double() + - v_uops_executed_core_cycles_ge_2.to_double()) + / clocks; + double l3_tot = v_mem_load_uops_retired_l3_hit.to_double() + 7.0 * v_mem_load_uops_retired_l3_miss.to_double(); + double l3_hit_fraction = 0.0; + double l3_miss_fraction = 0.0; + if (l3_tot > 0.0) { + l3_hit_fraction = v_mem_load_uops_retired_l3_hit.to_double() / l3_tot; + l3_miss_fraction = v_mem_load_uops_retired_l3_miss.to_double() / l3_tot; + } + double ext_mem_bound = v_cycle_activity_stalls_l2_pending.to_double() * l3_miss_fraction / clocks; + double l1_bound = + (v_cycle_activity_stalls_ldm_pending.to_double() - v_cycle_activity_stalls_l1d_pending.to_double()) / clocks; + double l2_bound = + (v_cycle_activity_stalls_l1d_pending.to_double() - v_cycle_activity_stalls_l2_pending.to_double()) / clocks; + double l3_bound = v_cycle_activity_stalls_l2_pending.to_double() * l3_hit_fraction / clocks; + + 
ret.reserve(6); + ret.push_back(Entry(m_result_attrs["memory_bound"], Variant(memory_bound))); + ret.push_back(Entry(m_result_attrs["core_bound"], Variant(be_bound_at_exe - memory_bound))); + ret.push_back(Entry(m_result_attrs["ext_mem_bound"], Variant(ext_mem_bound))); + ret.push_back(Entry(m_result_attrs["l1_bound"], Variant(l1_bound))); + ret.push_back(Entry(m_result_attrs["l2_bound"], Variant(l2_bound))); + ret.push_back(Entry(m_result_attrs["l3_bound"], Variant(l3_bound))); - double memory_bound = - v_cycle_activity_stalls_ldm_pending.to_double() / clocks; - double be_bound_at_exe = (v_cycle_activity_cycles_no_execute.to_double() + - v_uops_executed_core_cycles_ge_1.to_double() - - v_uops_executed_core_cycles_ge_2.to_double()) / - clocks; - double l3_tot = v_mem_load_uops_retired_l3_hit.to_double() + - 7.0 * v_mem_load_uops_retired_l3_miss.to_double(); - double l3_hit_fraction = 0.0; - double l3_miss_fraction = 0.0; - if (l3_tot > 0.0) { - l3_hit_fraction = v_mem_load_uops_retired_l3_hit.to_double() / l3_tot; - l3_miss_fraction = v_mem_load_uops_retired_l3_miss.to_double() / l3_tot; - } - double ext_mem_bound = v_cycle_activity_stalls_l2_pending.to_double() * - l3_miss_fraction / clocks; - double l1_bound = (v_cycle_activity_stalls_ldm_pending.to_double() - - v_cycle_activity_stalls_l1d_pending.to_double()) / - clocks; - double l2_bound = (v_cycle_activity_stalls_l1d_pending.to_double() - - v_cycle_activity_stalls_l2_pending.to_double()) / - clocks; - double l3_bound = - v_cycle_activity_stalls_l2_pending.to_double() * l3_hit_fraction / clocks; - - ret.reserve(6); - ret.push_back(Entry(m_result_attrs["memory_bound"], Variant(memory_bound))); - ret.push_back(Entry(m_result_attrs["core_bound"], - Variant(be_bound_at_exe - memory_bound))); - ret.push_back(Entry(m_result_attrs["ext_mem_bound"], Variant(ext_mem_bound))); - ret.push_back(Entry(m_result_attrs["l1_bound"], Variant(l1_bound))); - ret.push_back(Entry(m_result_attrs["l2_bound"], Variant(l2_bound))); - 
ret.push_back(Entry(m_result_attrs["l3_bound"], Variant(l3_bound))); - - return ret; + return ret; } -std::size_t HaswellTopdown::get_num_expected_backend_bound() const { return 6; } +std::size_t HaswellTopdown::get_num_expected_backend_bound() const +{ + return 6; +} -std::vector -HaswellTopdown::compute_frontend_bound(const std::vector &rec) { - std::vector ret; +std::vector HaswellTopdown::compute_frontend_bound(const std::vector& rec) +{ + std::vector ret; - Variant v_cpu_clk_unhalted_thread_p = - get_val_from_rec(rec, "CPU_CLK_THREAD_UNHALTED:THREAD_P"); - Variant v_idq_uops_not_delivered = - get_val_from_rec(rec, "IDQ_UOPS_NOT_DELIVERED:CYCLES_0_UOPS_DELIV_CORE"); + Variant v_cpu_clk_unhalted_thread_p = get_val_from_rec(rec, "CPU_CLK_THREAD_UNHALTED:THREAD_P"); + Variant v_idq_uops_not_delivered = get_val_from_rec(rec, "IDQ_UOPS_NOT_DELIVERED:CYCLES_0_UOPS_DELIV_CORE"); - bool is_incomplete = - v_cpu_clk_unhalted_thread_p.empty() || v_idq_uops_not_delivered.empty(); + bool is_incomplete = v_cpu_clk_unhalted_thread_p.empty() || v_idq_uops_not_delivered.empty(); - double clocks = v_cpu_clk_unhalted_thread_p.to_double(); - double uops = v_idq_uops_not_delivered.to_double(); + double clocks = v_cpu_clk_unhalted_thread_p.to_double(); + double uops = v_idq_uops_not_delivered.to_double(); - if (is_incomplete || clocks < 1.0 || uops > clocks) - return ret; + if (is_incomplete || clocks < 1.0 || uops > clocks) + return ret; - double fe_latency = uops / clocks; + double fe_latency = uops / clocks; - ret.reserve(2); - ret.push_back(Entry(m_result_attrs["frontend_latency"], Variant(fe_latency))); - ret.push_back( - Entry(m_result_attrs["frontend_bandwidth"], Variant(1.0 - fe_latency))); + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["frontend_latency"], Variant(fe_latency))); + ret.push_back(Entry(m_result_attrs["frontend_bandwidth"], Variant(1.0 - fe_latency))); - return ret; + return ret; } -std::size_t HaswellTopdown::get_num_expected_frontend_bound() const { 
- return 2; +std::size_t HaswellTopdown::get_num_expected_frontend_bound() const +{ + return 2; } -std::vector -HaswellTopdown::compute_bad_speculation(const std::vector &rec) { - std::vector ret; +std::vector HaswellTopdown::compute_bad_speculation(const std::vector& rec) +{ + std::vector ret; - Variant v_br_misp_retired_all_branches = - get_val_from_rec(rec, "BR_MISP_RETIRED:ALL_BRANCHES"); - Variant v_machine_clears_count = - get_val_from_rec(rec, "MACHINE_CLEARS:COUNT"); + Variant v_br_misp_retired_all_branches = get_val_from_rec(rec, "BR_MISP_RETIRED:ALL_BRANCHES"); + Variant v_machine_clears_count = get_val_from_rec(rec, "MACHINE_CLEARS:COUNT"); - bool is_incomplete = - v_br_misp_retired_all_branches.empty() || v_machine_clears_count.empty(); + bool is_incomplete = v_br_misp_retired_all_branches.empty() || v_machine_clears_count.empty(); - double br_misp_retired_all_branches = - v_br_misp_retired_all_branches.to_double(); - double machine_clears_count = v_machine_clears_count.to_double(); + double br_misp_retired_all_branches = v_br_misp_retired_all_branches.to_double(); + double machine_clears_count = v_machine_clears_count.to_double(); - if (is_incomplete || - !(br_misp_retired_all_branches + machine_clears_count > 1.0)) - return ret; + if (is_incomplete || !(br_misp_retired_all_branches + machine_clears_count > 1.0)) + return ret; - double branch_mispredict = - br_misp_retired_all_branches / - (br_misp_retired_all_branches + machine_clears_count); + double branch_mispredict = br_misp_retired_all_branches / (br_misp_retired_all_branches + machine_clears_count); - ret.reserve(2); - ret.push_back( - Entry(m_result_attrs["branch_mispredict"], Variant(branch_mispredict))); - ret.push_back(Entry(m_result_attrs["machine_clears"], - Variant(1.0 - branch_mispredict))); + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["branch_mispredict"], Variant(branch_mispredict))); + ret.push_back(Entry(m_result_attrs["machine_clears"], Variant(1.0 - branch_mispredict))); 
- return ret; + return ret; } -std::size_t HaswellTopdown::get_num_expected_bad_speculation() const { - return 2; +std::size_t HaswellTopdown::get_num_expected_bad_speculation() const +{ + return 2; } } // namespace topdown diff --git a/src/services/topdown/HaswellTopdown.h b/src/services/topdown/HaswellTopdown.h index 5ca0a9be..01c99ebc 100644 --- a/src/services/topdown/HaswellTopdown.h +++ b/src/services/topdown/HaswellTopdown.h @@ -3,41 +3,40 @@ #include "TopdownCalculator.h" -namespace cali { -namespace topdown { +namespace cali +{ +namespace topdown +{ -class HaswellTopdown : public TopdownCalculator { +class HaswellTopdown : public TopdownCalculator +{ public: - HaswellTopdown(IntelTopdownLevel level); - virtual ~HaswellTopdown() = default; + HaswellTopdown(IntelTopdownLevel level); - virtual bool check_for_disabled_multiplex() const override; + virtual ~HaswellTopdown() = default; - virtual std::vector - compute_toplevel(const std::vector &rec) override; + virtual bool check_for_disabled_multiplex() const override; - virtual std::size_t get_num_expected_toplevel() const override; + virtual std::vector compute_toplevel(const std::vector& rec) override; - virtual std::vector - compute_retiring(const std::vector &rec) override; + virtual std::size_t get_num_expected_toplevel() const override; - virtual std::size_t get_num_expected_retiring() const override; + virtual std::vector compute_retiring(const std::vector& rec) override; - virtual std::vector - compute_backend_bound(const std::vector &rec) override; + virtual std::size_t get_num_expected_retiring() const override; - virtual std::size_t get_num_expected_backend_bound() const override; + virtual std::vector compute_backend_bound(const std::vector& rec) override; - virtual std::vector - compute_frontend_bound(const std::vector &rec) override; + virtual std::size_t get_num_expected_backend_bound() const override; - virtual std::size_t get_num_expected_frontend_bound() const override; + virtual std::vector 
compute_frontend_bound(const std::vector& rec) override; - virtual std::vector - compute_bad_speculation(const std::vector &rec) override; + virtual std::size_t get_num_expected_frontend_bound() const override; - virtual std::size_t get_num_expected_bad_speculation() const override; + virtual std::vector compute_bad_speculation(const std::vector& rec) override; + + virtual std::size_t get_num_expected_bad_speculation() const override; }; } // namespace topdown diff --git a/src/services/topdown/SapphireRapidsTopdown.h b/src/services/topdown/SapphireRapidsTopdown.h index 8fc75282..bdba3bd8 100644 --- a/src/services/topdown/SapphireRapidsTopdown.h +++ b/src/services/topdown/SapphireRapidsTopdown.h @@ -3,41 +3,40 @@ #include "TopdownCalculator.h" -namespace cali { -namespace topdown { +namespace cali +{ +namespace topdown +{ -class SapphireRapidsTopdown : public TopdownCalculator { +class SapphireRapidsTopdown : public TopdownCalculator +{ public: - SapphireRapidsTopdown(IntelTopdownLevel level); - virtual ~SapphireRapidsTopdown() = default; + SapphireRapidsTopdown(IntelTopdownLevel level); - virtual bool check_for_disabled_multiplex() const override; + virtual ~SapphireRapidsTopdown() = default; - virtual std::vector - compute_toplevel(const std::vector &rec) override; + virtual bool check_for_disabled_multiplex() const override; - virtual std::size_t get_num_expected_toplevel() const override; + virtual std::vector compute_toplevel(const std::vector& rec) override; - virtual std::vector - compute_retiring(const std::vector &rec) override; + virtual std::size_t get_num_expected_toplevel() const override; - virtual std::size_t get_num_expected_retiring() const override; + virtual std::vector compute_retiring(const std::vector& rec) override; - virtual std::vector - compute_backend_bound(const std::vector &rec) override; + virtual std::size_t get_num_expected_retiring() const override; - virtual std::size_t get_num_expected_backend_bound() const override; + virtual 
std::vector compute_backend_bound(const std::vector& rec) override; - virtual std::vector - compute_frontend_bound(const std::vector &rec) override; + virtual std::size_t get_num_expected_backend_bound() const override; - virtual std::size_t get_num_expected_frontend_bound() const override; + virtual std::vector compute_frontend_bound(const std::vector& rec) override; - virtual std::vector - compute_bad_speculation(const std::vector &rec) override; + virtual std::size_t get_num_expected_frontend_bound() const override; - virtual std::size_t get_num_expected_bad_speculation() const override; + virtual std::vector compute_bad_speculation(const std::vector& rec) override; + + virtual std::size_t get_num_expected_bad_speculation() const override; }; } // namespace topdown diff --git a/src/services/topdown/SapphireRapidsTopdown_rdpmc.cpp b/src/services/topdown/SapphireRapidsTopdown_rdpmc.cpp index a7e55bcf..79df5dbc 100644 --- a/src/services/topdown/SapphireRapidsTopdown_rdpmc.cpp +++ b/src/services/topdown/SapphireRapidsTopdown_rdpmc.cpp @@ -12,233 +12,222 @@ #define FETCH_LAT_OFFSET 6 #define MEM_BOUND_OFFSET 7 -static double get_tma_percent_from_rdpmc_value(uint64_t rdpmc_value, - uint64_t offset) { - return (double)((rdpmc_value >> (offset * 8)) & 0xff) / 0xff; +static double get_tma_percent_from_rdpmc_value(uint64_t rdpmc_value, uint64_t offset) +{ + return (double) ((rdpmc_value >> (offset * 8)) & 0xff) / 0xff; } -namespace cali { -namespace topdown { +namespace cali +{ +namespace topdown +{ SapphireRapidsTopdown::SapphireRapidsTopdown(IntelTopdownLevel level) : cali::topdown::TopdownCalculator( - level, - // top_counters - "perf::slots" - ",perf::topdown-retiring", - // all_counters - "perf::slots" - ",perf::topdown-retiring", - // res_top - {"retiring", "backend_bound", "frontend_bound", "bad_speculation"}, - // res_all - {"retiring", "backend_bound", "frontend_bound", "bad_speculation", - "branch_mispredict", "machine_clears", "frontend_latency", - 
"frontend_bandwidth", "memory_bound", "core_bound", "light_ops", - "heavy_ops"}) {} - -bool SapphireRapidsTopdown::check_for_disabled_multiplex() const { - return true; + level, + // top_counters + "perf::slots" + ",perf::topdown-retiring", + // all_counters + "perf::slots" + ",perf::topdown-retiring", + // res_top + { "retiring", "backend_bound", "frontend_bound", "bad_speculation" }, + // res_all + { "retiring", + "backend_bound", + "frontend_bound", + "bad_speculation", + "branch_mispredict", + "machine_clears", + "frontend_latency", + "frontend_bandwidth", + "memory_bound", + "core_bound", + "light_ops", + "heavy_ops" } + ) +{} + +bool SapphireRapidsTopdown::check_for_disabled_multiplex() const +{ + return true; } -std::vector -SapphireRapidsTopdown::compute_toplevel(const std::vector &rec) { - std::vector ret; +std::vector SapphireRapidsTopdown::compute_toplevel(const std::vector& rec) +{ + std::vector ret; - // Get PAPI metrics for toplevel calculations - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_tma_metrics = get_val_from_rec(rec, "perf::topdown-retiring"); + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_tma_metrics = get_val_from_rec(rec, "perf::topdown-retiring"); - // Check if any Variant is empty (use .empty()) - bool is_incomplete = - v_tma_metrics.empty() || v_slots_or_info_thread_slots.empty(); - // Check if all Variants are greater than 0 when casted to doubles (use - // .to_double()) - bool is_nonzero = v_tma_metrics.to_uint() > 0; + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_tma_metrics.empty() || v_slots_or_info_thread_slots.empty(); + // Check if all Variants are greater than 0 when casted to doubles (use + // .to_double()) + bool is_nonzero = v_tma_metrics.to_uint() > 0; - // Check if bad values were obtained - if (is_incomplete || !is_nonzero) - return ret; + // Check if bad 
values were obtained + if (is_incomplete || !is_nonzero) + return ret; + + uint64_t tma_metric_papi_rdpmc = v_tma_metrics.to_uint(); + + double retiring = get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, RETIRING_OFFSET); + double frontend_bound = get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, FE_BOUND_OFFSET); + double backend_bound = get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, BE_BOUND_OFFSET); + double bad_speculation = get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, BAD_SPEC_OFFSET); + + // Add toplevel metrics to vector of Entry + ret.reserve(4); + ret.push_back(Entry(m_result_attrs["retiring"], Variant(std::max(retiring, 0.0)))); + ret.push_back(Entry(m_result_attrs["backend_bound"], Variant(std::max(backend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["frontend_bound"], Variant(std::max(frontend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["bad_speculation"], Variant(std::max(bad_speculation, 0.0)))); - uint64_t tma_metric_papi_rdpmc = v_tma_metrics.to_uint(); - - double retiring = - get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, RETIRING_OFFSET); - double frontend_bound = - get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, FE_BOUND_OFFSET); - double backend_bound = - get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, BE_BOUND_OFFSET); - double bad_speculation = - get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, BAD_SPEC_OFFSET); - - // Add toplevel metrics to vector of Entry - ret.reserve(4); - ret.push_back( - Entry(m_result_attrs["retiring"], Variant(std::max(retiring, 0.0)))); - ret.push_back(Entry(m_result_attrs["backend_bound"], - Variant(std::max(backend_bound, 0.0)))); - ret.push_back(Entry(m_result_attrs["frontend_bound"], - Variant(std::max(frontend_bound, 0.0)))); - ret.push_back(Entry(m_result_attrs["bad_speculation"], - Variant(std::max(bad_speculation, 0.0)))); - - return ret; + return ret; } -std::size_t SapphireRapidsTopdown::get_num_expected_toplevel() const { - 
return 4; +std::size_t SapphireRapidsTopdown::get_num_expected_toplevel() const +{ + return 4; } -std::vector -SapphireRapidsTopdown::compute_retiring(const std::vector &rec) { - std::vector ret; +std::vector SapphireRapidsTopdown::compute_retiring(const std::vector& rec) +{ + std::vector ret; - // Get PAPI metrics for toplevel calculations - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_tma_metrics = get_val_from_rec(rec, "perf::topdown-retiring"); + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_tma_metrics = get_val_from_rec(rec, "perf::topdown-retiring"); - // Check if any Variant is empty (use .empty()) - bool is_incomplete = - v_tma_metrics.empty() || v_slots_or_info_thread_slots.empty(); + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_tma_metrics.empty() || v_slots_or_info_thread_slots.empty(); - // Check if bad values were obtained - if (is_incomplete) - return ret; + // Check if bad values were obtained + if (is_incomplete) + return ret; - uint64_t tma_metric_papi_rdpmc = v_tma_metrics.to_uint(); + uint64_t tma_metric_papi_rdpmc = v_tma_metrics.to_uint(); - double retiring = - get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, RETIRING_OFFSET); - double heavy_ops = - get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, HEAVY_OPS_OFFSET); - double light_ops = std::max(0.0, retiring - heavy_ops); + double retiring = get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, RETIRING_OFFSET); + double heavy_ops = get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, HEAVY_OPS_OFFSET); + double light_ops = std::max(0.0, retiring - heavy_ops); - // Add toplevel metrics to vector of Entry - ret.reserve(2); - ret.push_back( - Entry(m_result_attrs["heavy_ops"], Variant(std::max(heavy_ops, 0.0)))); - ret.push_back( - Entry(m_result_attrs["light_ops"], Variant(std::max(light_ops, 0.0)))); + // Add 
toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["heavy_ops"], Variant(std::max(heavy_ops, 0.0)))); + ret.push_back(Entry(m_result_attrs["light_ops"], Variant(std::max(light_ops, 0.0)))); - return ret; + return ret; } -std::size_t SapphireRapidsTopdown::get_num_expected_retiring() const { - return 2; +std::size_t SapphireRapidsTopdown::get_num_expected_retiring() const +{ + return 2; } -std::vector -SapphireRapidsTopdown::compute_backend_bound(const std::vector &rec) { - std::vector ret; +std::vector SapphireRapidsTopdown::compute_backend_bound(const std::vector& rec) +{ + std::vector ret; - // Get PAPI metrics for toplevel calculations - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_tma_metrics = get_val_from_rec(rec, "perf::topdown-retiring"); + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_tma_metrics = get_val_from_rec(rec, "perf::topdown-retiring"); - // Check if any Variant is empty (use .empty()) - bool is_incomplete = - v_tma_metrics.empty() || v_slots_or_info_thread_slots.empty(); + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_tma_metrics.empty() || v_slots_or_info_thread_slots.empty(); - // Check if bad values were obtained - if (is_incomplete) - return ret; + // Check if bad values were obtained + if (is_incomplete) + return ret; - uint64_t tma_metric_papi_rdpmc = v_tma_metrics.to_uint(); + uint64_t tma_metric_papi_rdpmc = v_tma_metrics.to_uint(); - double backend_bound = - get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, BE_BOUND_OFFSET); - double memory_bound = - get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, MEM_BOUND_OFFSET); - double core_bound = std::max(0.0, backend_bound - memory_bound); + double backend_bound = get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, BE_BOUND_OFFSET); + double memory_bound = 
get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, MEM_BOUND_OFFSET); + double core_bound = std::max(0.0, backend_bound - memory_bound); - // Add toplevel metrics to vector of Entry - ret.reserve(2); - ret.push_back(Entry(m_result_attrs["memory_bound"], - Variant(std::max(memory_bound, 0.0)))); - ret.push_back( - Entry(m_result_attrs["core_bound"], Variant(std::max(core_bound, 0.0)))); + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["memory_bound"], Variant(std::max(memory_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["core_bound"], Variant(std::max(core_bound, 0.0)))); - return ret; + return ret; } -std::size_t SapphireRapidsTopdown::get_num_expected_backend_bound() const { - return 2; +std::size_t SapphireRapidsTopdown::get_num_expected_backend_bound() const +{ + return 2; } -std::vector -SapphireRapidsTopdown::compute_frontend_bound(const std::vector &rec) { - std::vector ret; +std::vector SapphireRapidsTopdown::compute_frontend_bound(const std::vector& rec) +{ + std::vector ret; - // Get PAPI metrics for toplevel calculations - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_tma_metrics = get_val_from_rec(rec, "perf::topdown-retiring"); + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_tma_metrics = get_val_from_rec(rec, "perf::topdown-retiring"); - // Check if any Variant is empty (use .empty()) - bool is_incomplete = - v_tma_metrics.empty() || v_slots_or_info_thread_slots.empty(); + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_tma_metrics.empty() || v_slots_or_info_thread_slots.empty(); - // Check if bad values were obtained - if (is_incomplete) - return ret; + // Check if bad values were obtained + if (is_incomplete) + return ret; - uint64_t tma_metric_papi_rdpmc = v_tma_metrics.to_uint(); + uint64_t tma_metric_papi_rdpmc = 
v_tma_metrics.to_uint(); - double frontend_bound = - get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, FE_BOUND_OFFSET); - double fetch_latency = - get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, FETCH_LAT_OFFSET); - double fetch_bandwidth = std::max(0.0, frontend_bound - fetch_latency); + double frontend_bound = get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, FE_BOUND_OFFSET); + double fetch_latency = get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, FETCH_LAT_OFFSET); + double fetch_bandwidth = std::max(0.0, frontend_bound - fetch_latency); - // Add toplevel metrics to vector of Entry - ret.reserve(2); - ret.push_back(Entry(m_result_attrs["frontend_latency"], - Variant(std::max(fetch_latency, 0.0)))); - ret.push_back(Entry(m_result_attrs["frontend_bandwidth"], - Variant(std::max(fetch_bandwidth, 0.0)))); + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["frontend_latency"], Variant(std::max(fetch_latency, 0.0)))); + ret.push_back(Entry(m_result_attrs["frontend_bandwidth"], Variant(std::max(fetch_bandwidth, 0.0)))); - return ret; + return ret; } -std::size_t SapphireRapidsTopdown::get_num_expected_frontend_bound() const { - return 2; +std::size_t SapphireRapidsTopdown::get_num_expected_frontend_bound() const +{ + return 2; } -std::vector -SapphireRapidsTopdown::compute_bad_speculation(const std::vector &rec) { - std::vector ret; +std::vector SapphireRapidsTopdown::compute_bad_speculation(const std::vector& rec) +{ + std::vector ret; - // Get PAPI metrics for toplevel calculations - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_tma_metrics = get_val_from_rec(rec, "perf::topdown-retiring"); + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_tma_metrics = get_val_from_rec(rec, "perf::topdown-retiring"); - // Check if any Variant is empty (use .empty()) - bool 
is_incomplete = - v_tma_metrics.empty() || v_slots_or_info_thread_slots.empty(); + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_tma_metrics.empty() || v_slots_or_info_thread_slots.empty(); - // Check if bad values were obtained - if (is_incomplete) - return ret; + // Check if bad values were obtained + if (is_incomplete) + return ret; - uint64_t tma_metric_papi_rdpmc = v_tma_metrics.to_uint(); + uint64_t tma_metric_papi_rdpmc = v_tma_metrics.to_uint(); - double bad_speculation = - get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, BAD_SPEC_OFFSET); - double branch_mispredict = get_tma_percent_from_rdpmc_value( - tma_metric_papi_rdpmc, BR_MISPRED_OFFSET); - double machine_clears = std::max(0.0, bad_speculation - branch_mispredict); + double bad_speculation = get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, BAD_SPEC_OFFSET); + double branch_mispredict = get_tma_percent_from_rdpmc_value(tma_metric_papi_rdpmc, BR_MISPRED_OFFSET); + double machine_clears = std::max(0.0, bad_speculation - branch_mispredict); - // Add toplevel metrics to vector of Entry - ret.reserve(2); - ret.push_back(Entry(m_result_attrs["branch_mispredict"], - Variant(std::max(branch_mispredict, 0.0)))); - ret.push_back(Entry(m_result_attrs["machine_clears"], - Variant(std::max(machine_clears, 0.0)))); + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["branch_mispredict"], Variant(std::max(branch_mispredict, 0.0)))); + ret.push_back(Entry(m_result_attrs["machine_clears"], Variant(std::max(machine_clears, 0.0)))); - return ret; + return ret; } -std::size_t SapphireRapidsTopdown::get_num_expected_bad_speculation() const { - return 2; +std::size_t SapphireRapidsTopdown::get_num_expected_bad_speculation() const +{ + return 2; } } // namespace topdown diff --git a/src/services/topdown/SapphireRapidsTopdown_read.cpp b/src/services/topdown/SapphireRapidsTopdown_read.cpp index 1739e144..1e480505 100644 --- 
a/src/services/topdown/SapphireRapidsTopdown_read.cpp +++ b/src/services/topdown/SapphireRapidsTopdown_read.cpp @@ -2,301 +2,280 @@ #include -namespace cali { -namespace topdown { +namespace cali +{ +namespace topdown +{ SapphireRapidsTopdown::SapphireRapidsTopdown(IntelTopdownLevel level) : cali::topdown::TopdownCalculator( - level, - // top_counters - "perf::slots" - ",perf::topdown-retiring" - ",perf::topdown-bad-spec" - ",perf::topdown-fe-bound" - ",perf::topdown-be-bound" - ",INT_MISC:UOP_DROPPING", - // all_counters - "perf::slots" - ",perf::topdown-retiring" - ",perf::topdown-bad-spec" - ",perf::topdown-fe-bound" - ",perf::topdown-be-bound" - ",INT_MISC:UOP_DROPPING" - ",perf_raw::r8400" // topdown-heavy-ops - ",perf_raw::r8500" // topdown-br-mispredict - ",perf_raw::r8600" // topdown-fetch-lat - ",perf_raw::r8700", // topdown-mem-bound - // res_top - {"retiring", "backend_bound", "frontend_bound", "bad_speculation"}, - // res_all - {"retiring", "backend_bound", "frontend_bound", "bad_speculation", - "branch_mispredict", "machine_clears", "frontend_latency", - "frontend_bandwidth", "memory_bound", "core_bound", "light_ops", - "heavy_ops"}) {} - -bool SapphireRapidsTopdown::check_for_disabled_multiplex() const { - return true; + level, + // top_counters + "perf::slots" + ",perf::topdown-retiring" + ",perf::topdown-bad-spec" + ",perf::topdown-fe-bound" + ",perf::topdown-be-bound" + ",INT_MISC:UOP_DROPPING", + // all_counters + "perf::slots" + ",perf::topdown-retiring" + ",perf::topdown-bad-spec" + ",perf::topdown-fe-bound" + ",perf::topdown-be-bound" + ",INT_MISC:UOP_DROPPING" + ",perf_raw::r8400" // topdown-heavy-ops + ",perf_raw::r8500" // topdown-br-mispredict + ",perf_raw::r8600" // topdown-fetch-lat + ",perf_raw::r8700", // topdown-mem-bound + // res_top + { "retiring", "backend_bound", "frontend_bound", "bad_speculation" }, + // res_all + { "retiring", + "backend_bound", + "frontend_bound", + "bad_speculation", + "branch_mispredict", + "machine_clears", 
+ "frontend_latency", + "frontend_bandwidth", + "memory_bound", + "core_bound", + "light_ops", + "heavy_ops" } + ) +{} + +bool SapphireRapidsTopdown::check_for_disabled_multiplex() const +{ + return true; } -std::vector -SapphireRapidsTopdown::compute_toplevel(const std::vector &rec) { - std::vector ret; - - // Get PAPI metrics for toplevel calculations - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); - Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); - Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); - Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_int_misc_uop_dropping = - get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); - - // Check if any Variant is empty (use .empty()) - bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || - v_bad_spec.empty() || v_retiring.empty() || - v_int_misc_uop_dropping.empty() || - v_slots_or_info_thread_slots.empty(); - // Check if all Variants are greater than 0 when casted to doubles (use - // .to_double()) - bool is_nonzero = - v_fe_bound.to_double() > 0.0 && v_be_bound.to_double() > 0.0 && - v_bad_spec.to_double() > 0.0 && v_retiring.to_double() > 0.0 && - v_int_misc_uop_dropping.to_double() > 0.0 && - v_slots_or_info_thread_slots.to_double() > 0.0; - - // Check if bad values were obtained - if (is_incomplete || !is_nonzero) - return ret; +std::vector SapphireRapidsTopdown::compute_toplevel(const std::vector& rec) +{ + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); 
+ Variant v_int_misc_uop_dropping = get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || v_bad_spec.empty() || v_retiring.empty() + || v_int_misc_uop_dropping.empty() || v_slots_or_info_thread_slots.empty(); + // Check if all Variants are greater than 0 when casted to doubles (use + // .to_double()) + bool is_nonzero = v_fe_bound.to_double() > 0.0 && v_be_bound.to_double() > 0.0 && v_bad_spec.to_double() > 0.0 + && v_retiring.to_double() > 0.0 && v_int_misc_uop_dropping.to_double() > 0.0 + && v_slots_or_info_thread_slots.to_double() > 0.0; + + // Check if bad values were obtained + if (is_incomplete || !is_nonzero) + return ret; + + // Perform toplevel calcs + double toplevel_sum = + (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); + + double retiring = (v_retiring.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); + double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) + - (v_int_misc_uop_dropping.to_double() / v_slots_or_info_thread_slots.to_double()); + double backend_bound = (v_be_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); + double bad_speculation = std::max(1.0 - (frontend_bound + backend_bound + retiring), 0.0); + + // Add toplevel metrics to vector of Entry + ret.reserve(4); + ret.push_back(Entry(m_result_attrs["retiring"], Variant(std::max(retiring, 0.0)))); + ret.push_back(Entry(m_result_attrs["backend_bound"], Variant(std::max(backend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["frontend_bound"], Variant(std::max(frontend_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["bad_speculation"], Variant(std::max(bad_speculation, 0.0)))); - // Perform toplevel calcs - double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + - v_fe_bound.to_double() + v_be_bound.to_double()); - - double retiring 
= (v_retiring.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); - double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) - - (v_int_misc_uop_dropping.to_double() / - v_slots_or_info_thread_slots.to_double()); - double backend_bound = (v_be_bound.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); - double bad_speculation = - std::max(1.0 - (frontend_bound + backend_bound + retiring), 0.0); - - // Add toplevel metrics to vector of Entry - ret.reserve(4); - ret.push_back( - Entry(m_result_attrs["retiring"], Variant(std::max(retiring, 0.0)))); - ret.push_back(Entry(m_result_attrs["backend_bound"], - Variant(std::max(backend_bound, 0.0)))); - ret.push_back(Entry(m_result_attrs["frontend_bound"], - Variant(std::max(frontend_bound, 0.0)))); - ret.push_back(Entry(m_result_attrs["bad_speculation"], - Variant(std::max(bad_speculation, 0.0)))); - - return ret; + return ret; } -std::size_t SapphireRapidsTopdown::get_num_expected_toplevel() const { - return 4; +std::size_t SapphireRapidsTopdown::get_num_expected_toplevel() const +{ + return 4; } -std::vector -SapphireRapidsTopdown::compute_retiring(const std::vector &rec) { - std::vector ret; - - // Get PAPI metrics for toplevel calculations - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); - Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); - Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); - Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_heavy_ops = get_val_from_rec(rec, "perf_raw::r8400"); - - // Check if any Variant is empty (use .empty()) - bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || - v_bad_spec.empty() || v_retiring.empty() || - v_slots_or_info_thread_slots.empty() || - v_heavy_ops.empty(); - - // Check if bad values were obtained - if (is_incomplete) - return 
ret; +std::vector SapphireRapidsTopdown::compute_retiring(const std::vector& rec) +{ + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_heavy_ops = get_val_from_rec(rec, "perf_raw::r8400"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || v_bad_spec.empty() || v_retiring.empty() + || v_slots_or_info_thread_slots.empty() || v_heavy_ops.empty(); - double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + - v_fe_bound.to_double() + v_be_bound.to_double()); - // Copied from compute_toplevel - double retiring = (v_retiring.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); + // Check if bad values were obtained + if (is_incomplete) + return ret; - double heavy_ops = (v_heavy_ops.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); - double light_ops = std::max(0.0, retiring - heavy_ops); + double toplevel_sum = + (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); + // Copied from compute_toplevel + double retiring = (v_retiring.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); - // Add toplevel metrics to vector of Entry - ret.reserve(2); - ret.push_back( - Entry(m_result_attrs["heavy_ops"], Variant(std::max(heavy_ops, 0.0)))); - ret.push_back( - Entry(m_result_attrs["light_ops"], Variant(std::max(light_ops, 0.0)))); + double heavy_ops = (v_heavy_ops.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); + double light_ops = std::max(0.0, 
retiring - heavy_ops); - return ret; + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["heavy_ops"], Variant(std::max(heavy_ops, 0.0)))); + ret.push_back(Entry(m_result_attrs["light_ops"], Variant(std::max(light_ops, 0.0)))); + + return ret; } -std::size_t SapphireRapidsTopdown::get_num_expected_retiring() const { - return 2; +std::size_t SapphireRapidsTopdown::get_num_expected_retiring() const +{ + return 2; } -std::vector -SapphireRapidsTopdown::compute_backend_bound(const std::vector &rec) { - std::vector ret; - - // Get PAPI metrics for toplevel calculations - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); - Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); - Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); - Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_memory_bound = get_val_from_rec(rec, "perf_raw::r8700"); - - // Check if any Variant is empty (use .empty()) - bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || - v_bad_spec.empty() || v_retiring.empty() || - v_slots_or_info_thread_slots.empty() || - v_memory_bound.empty(); - - // Check if bad values were obtained - if (is_incomplete) - return ret; +std::vector SapphireRapidsTopdown::compute_backend_bound(const std::vector& rec) +{ + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_memory_bound = get_val_from_rec(rec, "perf_raw::r8700"); + + // Check if any Variant is empty (use 
.empty()) + bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || v_bad_spec.empty() || v_retiring.empty() + || v_slots_or_info_thread_slots.empty() || v_memory_bound.empty(); - double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + - v_fe_bound.to_double() + v_be_bound.to_double()); - // Copied from compute_toplevel - double backend_bound = (v_be_bound.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); + // Check if bad values were obtained + if (is_incomplete) + return ret; - double memory_bound = (v_memory_bound.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); - double core_bound = std::max(0.0, backend_bound - memory_bound); + double toplevel_sum = + (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); + // Copied from compute_toplevel + double backend_bound = (v_be_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); - // Add toplevel metrics to vector of Entry - ret.reserve(2); - ret.push_back(Entry(m_result_attrs["memory_bound"], - Variant(std::max(memory_bound, 0.0)))); - ret.push_back( - Entry(m_result_attrs["core_bound"], Variant(std::max(core_bound, 0.0)))); + double memory_bound = (v_memory_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); + double core_bound = std::max(0.0, backend_bound - memory_bound); - return ret; + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["memory_bound"], Variant(std::max(memory_bound, 0.0)))); + ret.push_back(Entry(m_result_attrs["core_bound"], Variant(std::max(core_bound, 0.0)))); + + return ret; } -std::size_t SapphireRapidsTopdown::get_num_expected_backend_bound() const { - return 2; +std::size_t SapphireRapidsTopdown::get_num_expected_backend_bound() const +{ + return 2; } -std::vector -SapphireRapidsTopdown::compute_frontend_bound(const std::vector &rec) { - std::vector 
ret; - - // Get PAPI metrics for toplevel calculations - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); - Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); - Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); - Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); - Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_int_misc_uop_dropping = - get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); - Variant v_fetch_latency = get_val_from_rec(rec, "perf_raw::r8600"); - - // Check if any Variant is empty (use .empty()) - bool is_incomplete = - v_fe_bound.empty() || v_be_bound.empty() || v_bad_spec.empty() || - v_retiring.empty() || v_int_misc_uop_dropping.empty() || - v_slots_or_info_thread_slots.empty() || v_fetch_latency.empty(); - - // Check if bad values were obtained - if (is_incomplete) - return ret; +std::vector SapphireRapidsTopdown::compute_frontend_bound(const std::vector& rec) +{ + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_int_misc_uop_dropping = get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); + Variant v_fetch_latency = get_val_from_rec(rec, "perf_raw::r8600"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || v_bad_spec.empty() || v_retiring.empty() + || v_int_misc_uop_dropping.empty() || v_slots_or_info_thread_slots.empty() + || v_fetch_latency.empty(); - double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + - v_fe_bound.to_double() + v_be_bound.to_double()); - // 
Copied from compute_toplevel - double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) - - (v_int_misc_uop_dropping.to_double() / - v_slots_or_info_thread_slots.to_double()); + // Check if bad values were obtained + if (is_incomplete) + return ret; - double fetch_latency = (v_fetch_latency.to_double() / toplevel_sum) - - (v_int_misc_uop_dropping.to_double() / - v_slots_or_info_thread_slots.to_double()); + double toplevel_sum = + (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); + // Copied from compute_toplevel + double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) + - (v_int_misc_uop_dropping.to_double() / v_slots_or_info_thread_slots.to_double()); - double fetch_bandwidth = std::max(0.0, frontend_bound - fetch_latency); + double fetch_latency = (v_fetch_latency.to_double() / toplevel_sum) + - (v_int_misc_uop_dropping.to_double() / v_slots_or_info_thread_slots.to_double()); - // Add toplevel metrics to vector of Entry - ret.reserve(2); - ret.push_back(Entry(m_result_attrs["frontend_latency"], - Variant(std::max(fetch_latency, 0.0)))); - ret.push_back(Entry(m_result_attrs["frontend_bandwidth"], - Variant(std::max(fetch_bandwidth, 0.0)))); + double fetch_bandwidth = std::max(0.0, frontend_bound - fetch_latency); - return ret; + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["frontend_latency"], Variant(std::max(fetch_latency, 0.0)))); + ret.push_back(Entry(m_result_attrs["frontend_bandwidth"], Variant(std::max(fetch_bandwidth, 0.0)))); + + return ret; } -std::size_t SapphireRapidsTopdown::get_num_expected_frontend_bound() const { - return 2; +std::size_t SapphireRapidsTopdown::get_num_expected_frontend_bound() const +{ + return 2; } -std::vector -SapphireRapidsTopdown::compute_bad_speculation(const std::vector &rec) { - std::vector ret; - - // Get PAPI metrics for toplevel calculations - Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, 
"perf::slots"); - Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); - Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); - Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); - Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); - Variant v_int_misc_uop_dropping = - get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); - Variant v_branch_mispredict = get_val_from_rec(rec, "perf_raw::r8500"); - - // Check if any Variant is empty (use .empty()) - bool is_incomplete = - v_fe_bound.empty() || v_be_bound.empty() || v_bad_spec.empty() || - v_retiring.empty() || v_int_misc_uop_dropping.empty() || - v_slots_or_info_thread_slots.empty() || v_branch_mispredict.empty(); - - // Check if bad values were obtained - if (is_incomplete) - return ret; +std::vector SapphireRapidsTopdown::compute_bad_speculation(const std::vector& rec) +{ + std::vector ret; + + // Get PAPI metrics for toplevel calculations + Variant v_slots_or_info_thread_slots = get_val_from_rec(rec, "perf::slots"); + Variant v_retiring = get_val_from_rec(rec, "perf::topdown-retiring"); + Variant v_bad_spec = get_val_from_rec(rec, "perf::topdown-bad-spec"); + Variant v_fe_bound = get_val_from_rec(rec, "perf::topdown-fe-bound"); + Variant v_be_bound = get_val_from_rec(rec, "perf::topdown-be-bound"); + Variant v_int_misc_uop_dropping = get_val_from_rec(rec, "INT_MISC:UOP_DROPPING"); + Variant v_branch_mispredict = get_val_from_rec(rec, "perf_raw::r8500"); + + // Check if any Variant is empty (use .empty()) + bool is_incomplete = v_fe_bound.empty() || v_be_bound.empty() || v_bad_spec.empty() || v_retiring.empty() + || v_int_misc_uop_dropping.empty() || v_slots_or_info_thread_slots.empty() + || v_branch_mispredict.empty(); + + // Check if bad values were obtained + if (is_incomplete) + return ret; + + // Perform toplevel calcs + double toplevel_sum = + (v_retiring.to_double() + v_bad_spec.to_double() + v_fe_bound.to_double() + v_be_bound.to_double()); + 
+ double retiring = (v_retiring.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); + double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) + - (v_int_misc_uop_dropping.to_double() / v_slots_or_info_thread_slots.to_double()); + double backend_bound = (v_be_bound.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); + double bad_speculation = std::max(1.0 - (frontend_bound + backend_bound + retiring), 0.0); + + double branch_mispredict = + (v_branch_mispredict.to_double() / toplevel_sum) + (0 * v_slots_or_info_thread_slots.to_double()); + double machine_clears = std::max(0.0, bad_speculation - branch_mispredict); + + // Add toplevel metrics to vector of Entry + ret.reserve(2); + ret.push_back(Entry(m_result_attrs["branch_mispredict"], Variant(std::max(branch_mispredict, 0.0)))); + ret.push_back(Entry(m_result_attrs["machine_clears"], Variant(std::max(machine_clears, 0.0)))); - // Perform toplevel calcs - double toplevel_sum = (v_retiring.to_double() + v_bad_spec.to_double() + - v_fe_bound.to_double() + v_be_bound.to_double()); - - double retiring = (v_retiring.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); - double frontend_bound = (v_fe_bound.to_double() / toplevel_sum) - - (v_int_misc_uop_dropping.to_double() / - v_slots_or_info_thread_slots.to_double()); - double backend_bound = (v_be_bound.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); - double bad_speculation = - std::max(1.0 - (frontend_bound + backend_bound + retiring), 0.0); - - double branch_mispredict = (v_branch_mispredict.to_double() / toplevel_sum) + - (0 * v_slots_or_info_thread_slots.to_double()); - double machine_clears = std::max(0.0, bad_speculation - branch_mispredict); - - // Add toplevel metrics to vector of Entry - ret.reserve(2); - ret.push_back(Entry(m_result_attrs["branch_mispredict"], - Variant(std::max(branch_mispredict, 0.0)))); - 
ret.push_back(Entry(m_result_attrs["machine_clears"], - Variant(std::max(machine_clears, 0.0)))); - - return ret; + return ret; } -std::size_t SapphireRapidsTopdown::get_num_expected_bad_speculation() const { - return 2; +std::size_t SapphireRapidsTopdown::get_num_expected_bad_speculation() const +{ + return 2; } } // namespace topdown diff --git a/src/services/topdown/TopdownCalculator.cpp b/src/services/topdown/TopdownCalculator.cpp index bbfa386f..ab5ab271 100644 --- a/src/services/topdown/TopdownCalculator.cpp +++ b/src/services/topdown/TopdownCalculator.cpp @@ -5,87 +5,96 @@ #include -namespace cali { -namespace topdown { +namespace cali +{ +namespace topdown +{ -Variant TopdownCalculator::get_val_from_rec(const std::vector &rec, - const char *name) { - Variant ret; +Variant TopdownCalculator::get_val_from_rec(const std::vector& rec, const char* name) +{ + Variant ret; - auto c_it = m_counter_attrs.find(name); - if (c_it == m_counter_attrs.end()) - return ret; + auto c_it = m_counter_attrs.find(name); + if (c_it == m_counter_attrs.end()) + return ret; - cali_id_t attr_id = c_it->second.id(); + cali_id_t attr_id = c_it->second.id(); - auto it = std::find_if(rec.begin(), rec.end(), [attr_id](const Entry &e) { - return e.attribute() == attr_id; - }); + auto it = std::find_if(rec.begin(), rec.end(), [attr_id](const Entry& e) { return e.attribute() == attr_id; }); - if (it != rec.end()) - ret = it->value(); - else - ++m_counters_not_found[std::string(name)]; + if (it != rec.end()) + ret = it->value(); + else + ++m_counters_not_found[std::string(name)]; - return ret; + return ret; } -TopdownCalculator::TopdownCalculator(IntelTopdownLevel level, - const char *top_counters, - const char *all_counters, - std::vector &&res_top, - std::vector &&res_all) - : m_level(level), m_top_counters(top_counters), - m_all_counters(all_counters), m_res_top(res_top), m_res_all(res_all) {} - -TopdownCalculator::TopdownCalculator(IntelTopdownLevel level) - : m_level(level) {} - -bool 
TopdownCalculator::find_counter_attrs(CaliperMetadataAccessInterface &db) { - const char *list = (m_level == All ? m_all_counters : m_top_counters); - auto counters = StringConverter(list).to_stringlist(); - - for (const auto &s : counters) { - Attribute attr = db.get_attribute(std::string("sum#papi.") + s); - - if (!attr) - attr = db.get_attribute(std::string("papi.") + s); - if (!attr) { - Log(0).stream() << "topdown: " << s << " counter attribute not found!" - << std::endl; - return false; +TopdownCalculator::TopdownCalculator( + IntelTopdownLevel level, + const char* top_counters, + const char* all_counters, + std::vector&& res_top, + std::vector&& res_all +) + : m_level(level), m_top_counters(top_counters), m_all_counters(all_counters), m_res_top(res_top), m_res_all(res_all) +{} + +TopdownCalculator::TopdownCalculator(IntelTopdownLevel level) : m_level(level) +{} + +bool TopdownCalculator::find_counter_attrs(CaliperMetadataAccessInterface& db) +{ + const char* list = (m_level == All ? m_all_counters : m_top_counters); + auto counters = StringConverter(list).to_stringlist(); + + for (const auto& s : counters) { + Attribute attr = db.get_attribute(std::string("sum#papi.") + s); + + if (!attr) + attr = db.get_attribute(std::string("papi.") + s); + if (!attr) { + Log(0).stream() << "topdown: " << s << " counter attribute not found!" << std::endl; + return false; + } + + m_counter_attrs[s] = attr; } - m_counter_attrs[s] = attr; - } - - return true; + return true; } -void TopdownCalculator::make_result_attrs(CaliperMetadataAccessInterface &db) { - std::vector &res = (m_level == Top ? m_res_top : m_res_all); +void TopdownCalculator::make_result_attrs(CaliperMetadataAccessInterface& db) +{ + std::vector& res = (m_level == Top ? 
m_res_top : m_res_all); - for (const char *s : res) { - m_result_attrs[std::string(s)] = - db.create_attribute(std::string("topdown.") + s, CALI_TYPE_DOUBLE, - CALI_ATTR_ASVALUE | CALI_ATTR_SKIP_EVENTS); - } + for (const char* s : res) { + m_result_attrs[std::string(s)] = db.create_attribute( + std::string("topdown.") + s, + CALI_TYPE_DOUBLE, + CALI_ATTR_ASVALUE | CALI_ATTR_SKIP_EVENTS + ); + } } -const std::map & -TopdownCalculator::get_counters_not_found() const { - return m_counters_not_found; +const std::map& TopdownCalculator::get_counters_not_found() const +{ + return m_counters_not_found; } -const char *TopdownCalculator::get_counters() const { - if (m_level == All) { - return m_all_counters; - } else { - return m_top_counters; - } +const char* TopdownCalculator::get_counters() const +{ + if (m_level == All) { + return m_all_counters; + } else { + return m_top_counters; + } } -IntelTopdownLevel TopdownCalculator::get_level() const { return m_level; } +IntelTopdownLevel TopdownCalculator::get_level() const +{ + return m_level; +} } // namespace topdown } // namespace cali \ No newline at end of file diff --git a/src/services/topdown/TopdownCalculator.h b/src/services/topdown/TopdownCalculator.h index 9841580e..0bf29264 100644 --- a/src/services/topdown/TopdownCalculator.h +++ b/src/services/topdown/TopdownCalculator.h @@ -19,74 +19,99 @@ */ // clang-format on -namespace cali { -namespace topdown { +namespace cali +{ +namespace topdown +{ enum IntelTopdownLevel { All = 1, Top = 2 }; -class TopdownCalculator { +class TopdownCalculator +{ protected: - IntelTopdownLevel m_level; - const char *m_top_counters; - const char *m_all_counters; + IntelTopdownLevel m_level; - std::vector m_res_top; - std::vector m_res_all; + const char* m_top_counters; + const char* m_all_counters; - std::map m_counter_attrs; - std::map m_result_attrs; + std::vector m_res_top; + std::vector m_res_all; - std::map m_counters_not_found; + std::map m_counter_attrs; + std::map m_result_attrs; 
- Variant get_val_from_rec(const std::vector &rec, const char *name); + std::map m_counters_not_found; - TopdownCalculator(IntelTopdownLevel level, const char *top_counters, - const char *all_counters, - std::vector &&res_top, - std::vector &&res_all); + Variant get_val_from_rec(const std::vector& rec, const char* name); + + TopdownCalculator( + IntelTopdownLevel level, + const char* top_counters, + const char* all_counters, + std::vector&& res_top, + std::vector&& res_all + ); public: - TopdownCalculator(IntelTopdownLevel level); - virtual ~TopdownCalculator() = default; + TopdownCalculator(IntelTopdownLevel level); + + virtual ~TopdownCalculator() = default; - virtual bool check_for_disabled_multiplex() const = 0; + // Returns true if PAPI multiplexing cannot be used for the + // counters and/or architecture needed for the subclass + virtual bool check_for_disabled_multiplex() const = 0; - virtual std::vector - compute_toplevel(const std::vector &rec) = 0; + // Computes the L1 topdown metrics using the counters contained + // in the Caliper Entries. 
+ virtual std::vector compute_toplevel(const std::vector& rec) = 0; - virtual std::size_t get_num_expected_toplevel() const = 0; + // Returns the expected size of the vector returned from + // compute_toplevel + virtual std::size_t get_num_expected_toplevel() const = 0; - virtual std::vector - compute_retiring(const std::vector &rec) = 0; + // Computes the topdown metrics beneath "Retiring" in the + // topdown hierarchy for the given architecture + virtual std::vector compute_retiring(const std::vector& rec) = 0; - virtual std::size_t get_num_expected_retiring() const = 0; + // Returns the expected size of the vector returned from + // compute_retiring + virtual std::size_t get_num_expected_retiring() const = 0; - virtual std::vector - compute_backend_bound(const std::vector &rec) = 0; + // Computes the topdown metrics beneath "Backend bound" in the + // topdown hierarchy for the given architecture + virtual std::vector compute_backend_bound(const std::vector& rec) = 0; - virtual std::size_t get_num_expected_backend_bound() const = 0; + // Returns the expected size of the vector returned from + // compute_backend_bound + virtual std::size_t get_num_expected_backend_bound() const = 0; - virtual std::vector - compute_frontend_bound(const std::vector &rec) = 0; + // Computes the topdown metrics beneath "Frontend bound" in the + // topdown hierarchy for the given architecture + virtual std::vector compute_frontend_bound(const std::vector& rec) = 0; - virtual std::size_t get_num_expected_frontend_bound() const = 0; + // Returns the expected size of the vector returned from + // compute_frontend_bound + virtual std::size_t get_num_expected_frontend_bound() const = 0; - virtual std::vector - compute_bad_speculation(const std::vector &rec) = 0; + // Computes the topdown metrics beneath "Bad speculation" in the + // topdown hierarchy for the given architecture + virtual std::vector compute_bad_speculation(const std::vector& rec) = 0; - virtual std::size_t
get_num_expected_bad_speculation() const = 0; + // Returns the expected size of the vector returned from + // compute_bad_speculation + virtual std::size_t get_num_expected_bad_speculation() const = 0; - bool find_counter_attrs(CaliperMetadataAccessInterface &db); + bool find_counter_attrs(CaliperMetadataAccessInterface& db); - void make_result_attrs(CaliperMetadataAccessInterface &db); + void make_result_attrs(CaliperMetadataAccessInterface& db); - const std::map &get_counters_not_found() const; + const std::map& get_counters_not_found() const; - const char *get_counters() const; + const char* get_counters() const; - IntelTopdownLevel get_level() const; + IntelTopdownLevel get_level() const; }; } // namespace topdown