Skip to content

Commit

Permalink
Rework roctracer shutdown flushing (#833)
Browse files Browse the repository at this point in the history
Summary:
Reworked the roctracer flush on shutdown. This removes a race condition that was introduced while removing the original race condition. :)
The previous implementation could deadlock when the op buffer had filled and flushed on its own immediately before shutdown.

This new approach is simpler, but it has to continuously log the correlation ids of completed async ops. This is done on the roctracer-supplied callback thread, so it is not an overhead/performance issue. The previous attempt was over-optimized at the cost of not working correctly. Oops.

Pull Request resolved: #833

Reviewed By: xuzhao9

Differential Revision: D51473804

Pulled By: aaronenyeshi

fbshipit-source-id: 7a7bea1356aea7e9a719fffe8e50735d072843f6
  • Loading branch information
mwootton authored and facebook-github-bot committed Nov 21, 2023
1 parent b2db477 commit 490f305
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 1 deletion.
51 changes: 50 additions & 1 deletion libkineto/src/RoctracerLogger.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include <cstring>
#include <chrono>
#include <time.h>
#include <mutex>
#include <unistd.h>

#include "ThreadUtil.h"

Expand All @@ -24,6 +26,20 @@ using namespace std::chrono;

constexpr size_t kBufSize(2 * 1024 * 1024);

// Bookkeeping used to decide, when tracing stops, whether the activity
// callback has already delivered the record for the most recently reported
// correlation id.
class Flush
{
public:
  // Held by the activity callback while it updates
  // maxCompletedCorrelationId_, and by stopLogging() while it reads it.
  std::mutex mutex_;
  // Running maximum of every id passed to reportCorrelation(). Explicitly
  // zero-initialized: before C++20 a default-constructed std::atomic holds an
  // indeterminate value, so the compare-exchange loop below would otherwise
  // start from garbage for any instance without static storage duration.
  std::atomic<uint64_t> maxCorrelationId_ {0};
  // Largest correlation id seen in a completed activity record.
  // Not atomic; access it only with mutex_ held.
  uint64_t maxCompletedCorrelationId_ {0};

  // Raises maxCorrelationId_ to cid if cid is larger; otherwise leaves it
  // unchanged. Lock-free, so it can be called from the API callback without
  // taking mutex_.
  void reportCorrelation(const uint64_t &cid) {
    uint64_t prev = maxCorrelationId_.load();
    // On failure compare_exchange_weak reloads prev with the current value,
    // so the loop ends once the stored maximum is >= cid or our store wins.
    while (prev < cid && !maxCorrelationId_.compare_exchange_weak(prev, cid))
    {}
  }
};
static Flush s_flush;

RoctracerLogger& RoctracerLogger::singleton() {
static RoctracerLogger instance;
return instance;
Expand Down Expand Up @@ -91,6 +107,7 @@ void RoctracerLogger::api_callback(uint32_t domain, uint32_t cid, const void* ca
case HIP_API_ID_hipExtLaunchKernel:
case HIP_API_ID_hipLaunchCooperativeKernel: // Should work here
{
s_flush.reportCorrelation(data->correlation_id);
auto &args = data->args.hipLaunchKernel;
dis->kernelRows_.emplace_back(data->correlation_id,
domain,
Expand All @@ -116,6 +133,7 @@ void RoctracerLogger::api_callback(uint32_t domain, uint32_t cid, const void* ca
case HIP_API_ID_hipModuleLaunchKernel:
case HIP_API_ID_hipExtModuleLaunchKernel:
{
s_flush.reportCorrelation(data->correlation_id);
auto &args = data->args.hipModuleLaunchKernel;
dis->kernelRows_.emplace_back(data->correlation_id,
domain,
Expand Down Expand Up @@ -252,6 +270,18 @@ void RoctracerLogger::activity_callback(const char* begin, const char* end, void
auto &gpuTraceBuffers = singleton().gpuTraceBuffers_;
memcpy(buffer, begin, size);
gpuTraceBuffers->emplace_back(buffer, size);

// Log latest completed correlation id. Used to ensure we have flushed all data on stop
std::unique_lock<std::mutex> lock(s_flush.mutex_);
const roctracer_record_t* record = (const roctracer_record_t*)(begin);
const roctracer_record_t* end_record = (const roctracer_record_t*)(end);

while (record < end_record) {
if (record->correlation_id > s_flush.maxCompletedCorrelationId_) {
s_flush.maxCompletedCorrelationId_ = record->correlation_id;
}
roctracer_next_record(record, &record);
}
}

void RoctracerLogger::startLogging() {
Expand Down Expand Up @@ -312,12 +342,31 @@ void RoctracerLogger::startLogging() {
}

externalCorrelationEnabled_ = true;
logging_ = true;
roctracer_start();
}

// Stops tracing. Before stopping, repeatedly flushes the activity pool until
// the activity callback has recorded a completed correlation id at least as
// large as the highest id reported by the API callback, or until the poll
// budget is exhausted. Does nothing if logging is not currently active.
void RoctracerLogger::stopLogging() {
  // logging_ is set by startLogging(); without it there is nothing to flush
  // or stop, and a repeated call must not stop the tracer a second time.
  if (!logging_)
    return;
  logging_ = false;

  roctracer_flush_activity_expl(hccPool_);

  // s_flush.mutex_ guards maxCompletedCorrelationId_, which the activity
  // callback updates on another thread.
  std::unique_lock<std::mutex> lock(s_flush.mutex_);

  // Snapshot the running max: this is the id the flush has to catch up to.
  const uint64_t lastReportedId = s_flush.maxCorrelationId_.load();

  // Upper bound on flush attempts (about one second at 1 ms per poll). An id
  // can be reported without an activity record ever arriving for it; without
  // a bound that case would spin here forever.
  constexpr int kMaxFlushPolls = 1000;

  // Poll on the worker finding the final correlation id.
  for (int polls = 0;
       s_flush.maxCompletedCorrelationId_ < lastReportedId && polls < kMaxFlushPolls;
       ++polls) {
    // Release the lock while flushing and sleeping so the activity callback
    // can take it and advance maxCompletedCorrelationId_.
    lock.unlock();
    roctracer_flush_activity_expl(hccPool_);
    usleep(1000);
    lock.lock();
  }

  // Stop only after the flush; stopping first would defeat the drain above.
  roctracer_stop();
}

void RoctracerLogger::endTracing() {
Expand Down
1 change: 1 addition & 0 deletions libkineto/src/RoctracerLogger.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ class RoctracerLogger {

std::unique_ptr<std::list<RoctracerActivityBuffer>> gpuTraceBuffers_;
bool externalCorrelationEnabled_{true};
bool logging_{false};

friend class onnxruntime::profiling::RocmProfiler;
friend class libkineto::RoctracerActivityApi;
Expand Down

0 comments on commit 490f305

Please sign in to comment.