From 956c0f079d3b4259e89d197102a352d7dfbcc342 Mon Sep 17 00:00:00 2001 From: Shivam Raikundalia Date: Tue, 5 Nov 2024 16:48:19 -0800 Subject: [PATCH] Add Bandwidth Calculations to RoCM Traces (#1010) Summary: Pull Request resolved: https://github.com/pytorch/kineto/pull/1010 Takes the metadata from the HIP launch for memset and memcpy and forwards it to kernel events. Afterwards use duration to calculate bandwidth. Also, make sure global vars are thread safe and anonymous. Reviewed By: aaronenyeshi Differential Revision: D65452222 fbshipit-source-id: 3e516126d4d6bf1baf9d71e2ef87d122e946649b --- libkineto/src/RoctracerActivity_inl.h | 34 +++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/libkineto/src/RoctracerActivity_inl.h b/libkineto/src/RoctracerActivity_inl.h index f2ee41167..56d54f6a9 100644 --- a/libkineto/src/RoctracerActivity_inl.h +++ b/libkineto/src/RoctracerActivity_inl.h @@ -11,6 +11,7 @@ #include "RoctracerActivity.h" #include +#include #include #include "Demangle.h" @@ -20,8 +21,11 @@ namespace KINETO_NAMESPACE { using namespace libkineto; -static std::unordered_map correlationToGrid; -static std::unordered_map correlationToBlock; +namespace { +thread_local std::unordered_map correlationToGrid; +thread_local std::unordered_map correlationToBlock; +thread_local std::unordered_map correlationToSize; +} // namespace const char* getGpuActivityKindString(uint32_t kind) { switch (kind) { @@ -94,11 +98,29 @@ inline void GpuActivity::log(ActivityLogger& logger) const { logger.handleActivity(*this); } +static inline std::string bandwidth(size_t bytes, uint64_t duration) { + return duration == 0 ? 
"\"N/A\"" : fmt::format("{}", bytes * 1.0 / duration); +} + inline const std::string GpuActivity::metadataJson() const { const auto& gpuActivity = raw(); // clang-format off + + // if memcpy or memset, add size + if (correlationToSize.count(gpuActivity.id) > 0) { + size_t size = correlationToSize[gpuActivity.id]; + std::string bandwidth_gib = (bandwidth(size, gpuActivity.end - gpuActivity.begin)); + return fmt::format(R"JSON( + "device": {}, "stream": {}, + "correlation": {}, "kind": "{}", + "bytes": {}, "memory bandwidth (GB/s)": {})JSON", + gpuActivity.device, gpuActivity.queue, + gpuActivity.id, getGpuActivityKindString(gpuActivity.kind), + size, bandwidth_gib); + } - if (correlationToGrid.count(gpuActivity.id) > 0) { + // if compute kernel, add grid and block + else if (correlationToGrid.count(gpuActivity.id) > 0) { return fmt::format(R"JSON( "device": {}, "stream": {}, "correlation": {}, "kind": "{}", @@ -189,9 +211,10 @@ inline const std::string RuntimeActivity::metadataJson() template <> inline const std::string RuntimeActivity::metadataJson() const { + correlationToSize[raw().id] = raw().size; return fmt::format( R"JSON( - "cid": {}, "correlation": {}, "src": "{}", "dst": "{}", "size": "{}", "kind": "{}")JSON", + "cid": {}, "correlation": {}, "src": "{}", "dst": "{}", "bytes": "{}", "kind": "{}")JSON", raw().cid, raw().id, raw().src, @@ -203,11 +226,12 @@ inline const std::string RuntimeActivity::metadataJson() template <> inline const std::string RuntimeActivity::metadataJson() const { + correlationToSize[raw().id] = raw().size; std::string size = ""; if (raw().cid == HIP_API_ID_hipMalloc) { size = fmt::format( R"JSON( - "size": {}, )JSON", + "bytes": {}, )JSON", raw().size); } return fmt::format(