Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Rework roctracer shutdown flushing #833

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 50 additions & 1 deletion libkineto/src/RoctracerLogger.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include <cstring>
#include <chrono>
#include <time.h>
#include <mutex>
#include <unistd.h>

#include "ThreadUtil.h"

Expand All @@ -24,6 +26,20 @@ using namespace std::chrono;

constexpr size_t kBufSize(2 * 1024 * 1024);

/// Bookkeeping used to decide when all activity records have been flushed.
///
/// Two high-water marks are kept:
///  - maxCorrelationId_: the largest correlation id handed to
///    reportCorrelation() (called from the kernel-launch API callbacks).
///  - maxCompletedCorrelationId_: the largest correlation id observed in
///    delivered activity records.
/// stopLogging() polls until the second mark catches up with the first.
class Flush
{
public:
  /// Guards maxCompletedCorrelationId_ (taken by the activity callback and by
  /// the shutdown polling loop).
  std::mutex mutex_;

  /// Largest correlation id reported so far. Explicitly zero-initialized: a
  /// default-constructed std::atomic holds an indeterminate value before
  /// C++20 unless the object happens to have static storage duration.
  std::atomic<uint64_t> maxCorrelationId_ {0};

  /// Largest correlation id seen in completed records; access under mutex_.
  uint64_t maxCompletedCorrelationId_ {0};

  /// Raise maxCorrelationId_ to `cid` if `cid` is larger; never lowers it.
  /// Lock-free, so it can be called concurrently from several threads.
  void reportCorrelation(const uint64_t &cid) {
    uint64_t prev = maxCorrelationId_.load();
    // On failure compare_exchange_weak reloads `prev` with the current value,
    // so the loop re-evaluates `prev < cid` against the freshest maximum and
    // stops as soon as another thread has already published a larger id.
    while (prev < cid && !maxCorrelationId_.compare_exchange_weak(prev, cid))
    {}
  }
};
static Flush s_flush;

RoctracerLogger& RoctracerLogger::singleton() {
static RoctracerLogger instance;
return instance;
Expand Down Expand Up @@ -91,6 +107,7 @@ void RoctracerLogger::api_callback(uint32_t domain, uint32_t cid, const void* ca
case HIP_API_ID_hipExtLaunchKernel:
case HIP_API_ID_hipLaunchCooperativeKernel: // Should work here
{
s_flush.reportCorrelation(data->correlation_id);
auto &args = data->args.hipLaunchKernel;
dis->kernelRows_.emplace_back(data->correlation_id,
domain,
Expand All @@ -116,6 +133,7 @@ void RoctracerLogger::api_callback(uint32_t domain, uint32_t cid, const void* ca
case HIP_API_ID_hipModuleLaunchKernel:
case HIP_API_ID_hipExtModuleLaunchKernel:
{
s_flush.reportCorrelation(data->correlation_id);
auto &args = data->args.hipModuleLaunchKernel;
dis->kernelRows_.emplace_back(data->correlation_id,
domain,
Expand Down Expand Up @@ -252,6 +270,18 @@ void RoctracerLogger::activity_callback(const char* begin, const char* end, void
auto &gpuTraceBuffers = singleton().gpuTraceBuffers_;
memcpy(buffer, begin, size);
gpuTraceBuffers->emplace_back(buffer, size);

// Log latest completed correlation id. Used to ensure we have flushed all data on stop
std::unique_lock<std::mutex> lock(s_flush.mutex_);
const roctracer_record_t* record = (const roctracer_record_t*)(begin);
const roctracer_record_t* end_record = (const roctracer_record_t*)(end);

while (record < end_record) {
if (record->correlation_id > s_flush.maxCompletedCorrelationId_) {
s_flush.maxCompletedCorrelationId_ = record->correlation_id;
}
roctracer_next_record(record, &record);
}
}

void RoctracerLogger::startLogging() {
Expand Down Expand Up @@ -312,12 +342,31 @@ void RoctracerLogger::startLogging() {
}

externalCorrelationEnabled_ = true;
logging_ = true;
roctracer_start();
}

// Stop tracing after draining outstanding activity records.
//
// No-op unless logging_ is set (startLogging() sets it), so repeated calls are
// harmless. Tracing is stopped only AFTER the flush wait: stopping first would
// halt collection while records for already-launched kernels may still be
// pending.
void RoctracerLogger::stopLogging() {
  if (!logging_)
    return;
  logging_ = false;

  roctracer_flush_activity_expl(hccPool_);

  // Snapshot the highest correlation id reported by the launch callbacks; the
  // activity callback must reach this id before the flush is complete.
  const uint64_t correlationId = s_flush.maxCorrelationId_.load();

  // Bound the wait. If a reported launch never produces an activity record
  // the completed id would never catch up, and an unbounded loop would hang
  // shutdown. After kMaxFlushAttempts polls we give up and stop anyway.
  constexpr int kMaxFlushAttempts = 50;
  constexpr unsigned int kFlushPollUs = 1000;

  std::unique_lock<std::mutex> lock(s_flush.mutex_);
  for (int attempt = 0;
       attempt < kMaxFlushAttempts &&
       s_flush.maxCompletedCorrelationId_ < correlationId;
       ++attempt) {
    // Drop the lock while flushing/sleeping so the activity callback can
    // take it and advance maxCompletedCorrelationId_.
    lock.unlock();
    roctracer_flush_activity_expl(hccPool_);
    usleep(kFlushPollUs);
    lock.lock();
  }
  // Release before stopping: the activity callback acquires the same mutex,
  // so it must not be held across the call into roctracer.
  lock.unlock();

  roctracer_stop();
}

void RoctracerLogger::endTracing() {
Expand Down
1 change: 1 addition & 0 deletions libkineto/src/RoctracerLogger.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ class RoctracerLogger {

std::unique_ptr<std::list<RoctracerActivityBuffer>> gpuTraceBuffers_;
bool externalCorrelationEnabled_{true};
bool logging_{false};

friend class onnxruntime::profiling::RocmProfiler;
friend class libkineto::RoctracerActivityApi;
Expand Down