Skip to content

Commit

Permalink
Make CUPTI lazy init (#825)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #825

Make the Kineto profiler lazily initialize the CUPTI library.

The following will change:
- Only the legacy event profiler will register callbacks on RESOURCE_CONTEXT_CREATED and RESOURCE_CONTEXT_DESTROYED.
- Move the initialization of the profiler (libkineto::api().initProfilerIfRegistered()) so that it happens lazily: when prepare profiler is called, when profilerStep is called, or during the torch init pybinding (see D50894961).
- Remove the previous logic to init on fork (i.e. ENABLE_KINETO_ON_FORK), which also removes the call to pthread_atfork.
- Unsubscribe the CUPTI Subscriber after teardown.
- Clean up verbose logging

Test Plan: CI

Reviewed By: satgera

Differential Revision: D50471345

Pulled By: aaronenyeshi

fbshipit-source-id: 609f300c77c120ae94dcbcdf48d5720efec9e0e6
  • Loading branch information
aaronenyeshi authored and facebook-github-bot committed Nov 15, 2023
1 parent 33fdaad commit 2cb8872
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 73 deletions.
37 changes: 14 additions & 23 deletions libkineto/src/CuptiActivityApi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,13 @@ inline bool cuptiLazyInit_() {

inline void reenableCuptiCallbacks_(std::shared_ptr<CuptiCallbackApi>& cbapi_) {
// Re-enable callbacks from the past if they exist.
LOG(INFO) << "Re-enable previous CUPTI callbacks - Starting";
VLOG(1) << " CUPTI subscriber before reinit:" << cbapi_->getCuptiSubscriber();
cbapi_->initCallbackApi();
if (cbapi_->initSuccess()) {
VLOG(1) << " CUPTI subscriber after reinit:" << cbapi_->getCuptiSubscriber();
bool status = cbapi_->reenableCallbacks();
if (!status) {
LOG(WARNING) << "Re-enable previous CUPTI callbacks - Failed to reenableCallbacks";
} else {
LOG(INFO) << "Re-enable previous CUPTI callbacks - Successful";
}
VLOG(1) << "Re-enable previous CUPTI callbacks - Starting";
bool status = cbapi_->reenableCallbacks();
LOG(INFO) << " CUPTI subscriber after enable:" << cbapi_->getCuptiSubscriber();
if (!status) {
LOG(WARNING) << "Re-enable previous CUPTI callbacks - Failed to reenableCallbacks";
} else {
LOG(WARNING) << "Re-enable previous CUPTI callbacks - Failed to initCallbackApi";
VLOG(1) << "Re-enable previous CUPTI callbacks - Successful";
}
}
#endif
Expand Down Expand Up @@ -306,10 +300,13 @@ void CuptiActivityApi::bufferCompleted(
void CuptiActivityApi::enableCuptiActivities(
const std::set<ActivityType>& selected_activities) {
#ifdef HAS_CUPTI
// Lazily support re-init of CUPTI Callbacks, if they were finalized before.
auto cbapi_ = CuptiCallbackApi::singleton();
if (!tracingEnabled_ && !cbapi_->initSuccess() && cuptiLazyInit_()) {
reenableCuptiCallbacks_(cbapi_);
if (!tracingEnabled_ && !cbapi_->initStatus()) {
cbapi_->initCallbackApi();
// Lazily support init of CUPTI Callbacks.
if (cuptiLazyInit_()) {
reenableCuptiCallbacks_(cbapi_);
}
}
cbapi_.reset();

Expand Down Expand Up @@ -396,13 +393,6 @@ void CuptiActivityApi::teardownContext() {
// PyTorch Profiler is synchronous, so teardown needs to be run async in this thread.
std::thread teardownThread([&] {
auto cbapi_ = CuptiCallbackApi::singleton();
if (!cbapi_->initSuccess()) {
cbapi_->initCallbackApi();
if (!cbapi_->initSuccess()) {
LOG(WARNING) << "CUPTI Callback failed to init, skipping teardown";
return;
}
}
// Subscribe callbacks to call cuptiFinalize in the exit callback of these APIs
bool status = cbapi_->enableCallbackDomain(CUPTI_CB_DOMAIN_RUNTIME_API);
status = status && cbapi_->enableCallbackDomain(CUPTI_CB_DOMAIN_DRIVER_API);
Expand All @@ -414,7 +404,7 @@ void CuptiActivityApi::teardownContext() {
// Force Flush before finalize
CUPTI_CALL(cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));

LOG(INFO) << " CUPTI subscriber before finalize:" << cbapi_->getCuptiSubscriber();
VLOG(1) << " CUPTI subscriber before finalize:" << cbapi_->getCuptiSubscriber();
teardownCupti_ = 1;
std::unique_lock<std::mutex> lck(finalizeMutex_);
finalizeCond_.wait(lck, [&]{return teardownCupti_ == 0;});
Expand All @@ -427,6 +417,7 @@ void CuptiActivityApi::teardownContext() {
// Remove the callbacks used specifically for cuptiFinalize
cbapi_->disableCallbackDomain(CUPTI_CB_DOMAIN_RUNTIME_API);
cbapi_->disableCallbackDomain(CUPTI_CB_DOMAIN_DRIVER_API);
cbapi_->deinitCallbackApi();

// Re-init CUPTI Callbacks if Lazy Re-init is not enabled.
if (!cuptiLazyInit_()) {
Expand Down
29 changes: 28 additions & 1 deletion libkineto/src/CuptiCallbackApi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,19 +165,43 @@ std::shared_ptr<CuptiCallbackApi> CuptiCallbackApi::singleton() {

// Subscribes this object to CUPTI callbacks, unless it is already subscribed.
//
// When HAS_CUPTI is defined: returns immediately if initSuccess_ is already
// set; otherwise calls cuptiSubscribe with callback_switchboard as the
// callback, stores the result in lastCuptiStatus_, logs on failure, and sets
// initSuccess_ to whether the call returned CUPTI_SUCCESS.
// When HAS_CUPTI is not defined the body is empty and the call does nothing.
void CuptiCallbackApi::initCallbackApi() {
#ifdef HAS_CUPTI
// Already subscribed: nothing to do, so repeated calls are cheap.
if (initSuccess_) {
return;
}

// This value is overwritten by the assignment that follows.
lastCuptiStatus_ = CUPTI_ERROR_UNKNOWN;
// Register callback_switchboard as the subscriber callback; nullptr is passed
// as the final argument and the handle is written to subscriber_.
lastCuptiStatus_ = CUPTI_CALL_NOWARN(
cuptiSubscribe(&subscriber_,
(CUpti_CallbackFunc)callback_switchboard,
nullptr));
if (lastCuptiStatus_ != CUPTI_SUCCESS) {
// NOTE(review): the VLOG(1) and LOG(WARNING) lines below report the same
// failure; this looks like the removed and added sides of a diff shown
// together -- confirm which one is meant to remain.
VLOG(1) << "Failed cuptiSubscribe, status: " << lastCuptiStatus_;
LOG(WARNING) << "Failed cuptiSubscribe, status: " << lastCuptiStatus_;
LOG(WARNING) << "CUPTI initialization failed - "
<< "CUDA profiler activities will be missing";
// For the insufficient-privileges status, point the user at NVIDIA's
// troubleshooting page.
if (lastCuptiStatus_ == CUPTI_ERROR_INSUFFICIENT_PRIVILEGES) {
LOG(INFO) << "For CUPTI_ERROR_INSUFFICIENT_PRIVILEGES, refer to "
<< "https://developer.nvidia.com/nvidia-development-tools-solutions-err-nvgpuctrperm-cupti";
}
}

// Record the outcome; the early return at the top reads this flag.
initSuccess_ = (lastCuptiStatus_ == CUPTI_SUCCESS);
#endif
}

// Releases the CUPTI subscription held in subscriber_.
//
// When HAS_CUPTI is defined and initSuccess_ is set: calls cuptiUnsubscribe,
// stores the result in lastCuptiStatus_, logs a warning if it is not
// CUPTI_SUCCESS, and clears initSuccess_ whether or not the call succeeded.
// If initSuccess_ is not set, or HAS_CUPTI is not defined, nothing happens.
void CuptiCallbackApi::deinitCallbackApi() {
#ifdef HAS_CUPTI
  if (initSuccess_) {
    lastCuptiStatus_ = CUPTI_CALL_NOWARN(cuptiUnsubscribe(subscriber_));
    const bool unsubscribed = (lastCuptiStatus_ == CUPTI_SUCCESS);
    if (!unsubscribed) {
      LOG(WARNING) << "Failed cuptiUnsubscribe, status: " << lastCuptiStatus_;
    }
    // Cleared unconditionally, so a later initCallbackApi() subscribes again.
    initSuccess_ = false;
  }
#endif
}

CuptiCallbackApi::CallbackList* CuptiCallbackApi::CallbackTable::lookup(
CUpti_CallbackDomain domain, CuptiCallBackID cbid) {
size_t idx;
Expand Down Expand Up @@ -262,6 +286,7 @@ bool CuptiCallbackApi::deleteCallback(
bool CuptiCallbackApi::enableCallback(
CUpti_CallbackDomain domain, CUpti_CallbackId cbid) {
#ifdef HAS_CUPTI
initCallbackApi();
if (initSuccess_) {
lastCuptiStatus_ = CUPTI_CALL_NOWARN(
cuptiEnableCallback(1, subscriber_, domain, cbid));
Expand All @@ -288,6 +313,7 @@ bool CuptiCallbackApi::disableCallback(
bool CuptiCallbackApi::enableCallbackDomain(
CUpti_CallbackDomain domain) {
#ifdef HAS_CUPTI
initCallbackApi();
if (initSuccess_) {
lastCuptiStatus_ = CUPTI_CALL_NOWARN(
cuptiEnableDomain(1, subscriber_, domain));
Expand All @@ -313,6 +339,7 @@ bool CuptiCallbackApi::disableCallbackDomain(

bool CuptiCallbackApi::reenableCallbacks() {
#ifdef HAS_CUPTI
initCallbackApi();
if (initSuccess_) {
for (auto& cbpair : enabledCallbacks_) {
if ((uint32_t)cbpair.second == MAX_CUPTI_CALLBACK_ID_ALL) {
Expand Down
3 changes: 2 additions & 1 deletion libkineto/src/CuptiCallbackApi.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,9 @@ class CuptiCallbackApi {
static std::shared_ptr<CuptiCallbackApi> singleton();

void initCallbackApi();
void deinitCallbackApi();

bool initSuccess() const {
bool initStatus() const {
return initSuccess_;
}

Expand Down
82 changes: 34 additions & 48 deletions libkineto/src/init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
namespace KINETO_NAMESPACE {

#ifdef HAS_CUPTI
static bool initialized = false;
static std::mutex initMutex;

bool enableEventProfiler() {
Expand All @@ -46,25 +45,14 @@ static void initProfilers(
VLOG(0) << "CUDA Context created";
std::lock_guard<std::mutex> lock(initMutex);

if (!initialized) {
libkineto::api().initProfilerIfRegistered();
initialized = true;
VLOG(0) << "libkineto profilers activated";
}

if (!enableEventProfiler()) {
VLOG(0) << "Kineto EventProfiler disabled, skipping start";
return;
} else {
CUpti_ResourceData* d = (CUpti_ResourceData*)cbInfo;
CUcontext ctx = d->context;
ConfigLoader& config_loader = libkineto::api().configLoader();
config_loader.initBaseConfig();
auto config = config_loader.getConfigCopy();
if (config->eventProfilerEnabled()) {
EventProfilerController::start(ctx, config_loader);
LOG(INFO) << "Kineto EventProfiler started";
}
CUpti_ResourceData* d = (CUpti_ResourceData*)cbInfo;
CUcontext ctx = d->context;
ConfigLoader& config_loader = libkineto::api().configLoader();
config_loader.initBaseConfig();
auto config = config_loader.getConfigCopy();
if (config->eventProfilerEnabled()) {
EventProfilerController::start(ctx, config_loader);
LOG(INFO) << "Kineto EventProfiler started";
}
}

Expand All @@ -88,12 +76,10 @@ static void stopProfiler(
VLOG(0) << "CUDA Context destroyed";
std::lock_guard<std::mutex> lock(initMutex);

if (enableEventProfiler()) {
CUpti_ResourceData* d = (CUpti_ResourceData*)cbInfo;
CUcontext ctx = d->context;
EventProfilerController::stopIfEnabled(ctx);
LOG(INFO) << "Kineto EventProfiler stopped";
}
CUpti_ResourceData* d = (CUpti_ResourceData*)cbInfo;
CUcontext ctx = d->context;
EventProfilerController::stopIfEnabled(ctx);
LOG(INFO) << "Kineto EventProfiler stopped";
}

static std::unique_ptr<CuptiRangeProfilerInit> rangeProfilerInit;
Expand Down Expand Up @@ -124,31 +110,29 @@ void libkineto_init(bool cpuOnly, bool logOnError) {
#endif

#ifdef HAS_CUPTI
if (!cpuOnly) {
bool initRangeProfiler = false;
if (!cpuOnly && enableEventProfiler() ) {
// libcupti will be lazily loaded on this call.
// If it is not available (e.g. CUDA is not installed),
// then this call will return an error and we just abort init.
auto cbapi = CuptiCallbackApi::singleton();
cbapi->initCallbackApi();
bool status = false;
bool initRangeProfiler = true;

if (cbapi->initSuccess()){
const CUpti_CallbackDomain domain = CUPTI_CB_DOMAIN_RESOURCE;
status = cbapi->registerCallback(
domain, CuptiCallbackApi::RESOURCE_CONTEXT_CREATED, initProfilers);
status = status && cbapi->registerCallback(
domain, CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED, stopProfiler);

if (status) {
status = cbapi->enableCallback(
domain, CuptiCallbackApi::RESOURCE_CONTEXT_CREATED);
status = status && cbapi->enableCallback(
domain, CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED);
}
initRangeProfiler = true;

const CUpti_CallbackDomain domain = CUPTI_CB_DOMAIN_RESOURCE;
status = cbapi->registerCallback(
domain, CuptiCallbackApi::RESOURCE_CONTEXT_CREATED, initProfilers);
status = status && cbapi->registerCallback(
domain, CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED, stopProfiler);

if (status) {
status = cbapi->enableCallback(
domain, CuptiCallbackApi::RESOURCE_CONTEXT_CREATED);
status = status && cbapi->enableCallback(
domain, CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED);
}

if (!cbapi->initSuccess() || !status) {
if (!cbapi->initStatus() || !status) {
initRangeProfiler = false;
cpuOnly = true;
if (logOnError) {
Expand All @@ -159,11 +143,13 @@ void libkineto_init(bool cpuOnly, bool logOnError) {
<< "https://developer.nvidia.com/nvidia-development-tools-solutions-err-nvgpuctrperm-cupti";
}
}
} else {
VLOG(0) << "Kineto EventProfiler disabled, skipping it";
}

// initialize CUPTI Range Profiler API
if (initRangeProfiler) {
rangeProfilerInit = std::make_unique<CuptiRangeProfilerInit>();
}
// initialize CUPTI Range Profiler API
if (initRangeProfiler) {
rangeProfilerInit = std::make_unique<CuptiRangeProfilerInit>();
}

if (shouldPreloadCuptiInstrumentation()) {
Expand Down

0 comments on commit 2cb8872

Please sign in to comment.