Skip to content

Commit

Permalink
lyp_clang_tidy: warning uint64_t->int (DeepLink-org#518)
Browse files Browse the repository at this point in the history
* clang_tidy:torch_dipu/csrc_dipu/profiler/CorrelationIDManager.cpp
                                         CorrelationIDManager.h

* clang_tidy dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.cpp .h

* clang_tidy:torch_dipu/csrc_dipu/profiler/profiler.cpp

* clang_tidy:torch_dipu/csrc_dipu/profiler/patch.cpp

* clang_tidy:torch_dipu/csrc_dipu/profiler/patch.cpp --v2

* clang_tidy:dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBFCachingAllocator.cpp

* clang_tidy:dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUBFCachingAllocator.cpp -v2

* clang_tidy: dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEvent.h

* clang_tidy: torch_dipu/csrc_dipu/profiler/profiler.h --v2

* clang_tidy: torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.cpp --v2

* clang_tidy: torch_dipu/csrc_dipu/profiler/CorrelationIDManager.cpp .h --v2

* clang_tidy: magic number; const_cast

* clang_tidy: fix some review issues

* clang_tidy: modify format by using run_format.sh
  • Loading branch information
lyp-liuyipeng authored Dec 18, 2023
1 parent db40164 commit 77585a6
Show file tree
Hide file tree
Showing 8 changed files with 172 additions and 135 deletions.
10 changes: 7 additions & 3 deletions dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,18 @@ void CorrelationIDManager::popCorrelationID(
type_.pop_back();
}

uint64_t CorrelationIDManager::getCorrelationID() const {
uint64_t CorrelationIDManager::getCorrelationID() {
DeviceActivityInterface::CorrelationFlowType type = type_.back();
return external_ids_[type].back();
}

thread_local std::deque<uint64_t> CorrelationIDManager::external_ids_
[DeviceActivityInterface::CorrelationFlowType::End];
thread_local std::array<std::deque<uint64_t>,
DeviceActivityInterface::CorrelationFlowType::End>
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
CorrelationIDManager::external_ids_;

thread_local std::deque<DeviceActivityInterface::CorrelationFlowType>
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
CorrelationIDManager::type_;

} // namespace profile
Expand Down
21 changes: 13 additions & 8 deletions dipu/torch_dipu/csrc_dipu/profiler/CorrelationIDManager.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#pragma once

#include <DeviceActivityInterface.h>
#include <cstdint>
#include <deque>
#include <stdint.h>

#include "DeviceActivityInterface.h"

namespace dipu {
namespace profile {
Expand All @@ -15,21 +16,25 @@ class CorrelationIDManager {
// CorrelationIDManager designed as a singleton
static CorrelationIDManager& instance();

void pushCorrelationID(
static void pushCorrelationID(
uint64_t id,
libkineto::DeviceActivityInterface::CorrelationFlowType type);
void popCorrelationID(
static void popCorrelationID(
libkineto::DeviceActivityInterface::CorrelationFlowType type);
uint64_t getCorrelationID() const;
static uint64_t getCorrelationID();

private:
CorrelationIDManager() = default;

private:
thread_local static std::deque<uint64_t> external_ids_
[libkineto::DeviceActivityInterface::CorrelationFlowType::End];
thread_local static std::array<
std::deque<uint64_t>,
libkineto::DeviceActivityInterface::CorrelationFlowType::End>
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
external_ids_;

thread_local static std::deque<
libkineto::DeviceActivityInterface::CorrelationFlowType>
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
type_;
};

Expand Down
32 changes: 16 additions & 16 deletions dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
namespace dipu {
namespace profile {

using libkineto::DeviceActivityInterface;
using libkineto::GenericTraceActivity;

DIPUDeviceActivity::~DIPUDeviceActivity() {
Expand All @@ -34,12 +33,12 @@ void DIPUDeviceActivity::popCorrelationID(
}

void DIPUDeviceActivity::enableActivities(
const std::set<libkineto::ActivityType>& selectedActivities) {}
const std::set<libkineto::ActivityType>& selected_activities) {}

void DIPUDeviceActivity::disableActivities(
const std::set<libkineto::ActivityType>& selectedActivities) {
if (selectedActivities.find(libkineto::ActivityType::CONCURRENT_KERNEL) !=
selectedActivities.end()) {
const std::set<libkineto::ActivityType>& selected_activities) {
if (selected_activities.find(libkineto::ActivityType::CONCURRENT_KERNEL) !=
selected_activities.end()) {
setProfileOpen(false);
}
}
Expand All @@ -52,18 +51,18 @@ void DIPUDeviceActivity::clearActivities() {

int32_t DIPUDeviceActivity::processActivities(
libkineto::ActivityLogger& logger,
std::function<const libkineto::ITraceActivity*(int32_t)> linkedActivity,
int64_t startTime, int64_t endTime) {
std::function<const libkineto::ITraceActivity*(int32_t)> linked_activity,
int64_t start_time, int64_t end_time) {
FlushAllRecords();

constexpr size_t kMillisecondPerSecond = 1000;
auto records = RecordsImpl::get().getAllRecordList();
for (const auto& record : records) {
GenericTraceActivity act;
act.startTime = record.begin / 1000;
act.endTime = record.end / 1000;
act.id = record.opId;
act.device = record.pid;
act.resource = record.threadIdx;
act.startTime = static_cast<int64_t>(record.begin / kMillisecondPerSecond);
act.endTime = static_cast<int64_t>(record.end / kMillisecondPerSecond);
act.id = static_cast<int32_t>(record.opId);
act.device = static_cast<int32_t>(record.pid);
act.resource = static_cast<int32_t>(record.threadIdx);
act.flow.id = record.opId;
if (record.isKernel) {
act.activityType = libkineto::ActivityType::CONCURRENT_KERNEL;
Expand All @@ -76,17 +75,17 @@ int32_t DIPUDeviceActivity::processActivities(
act.flow.id = record.opId;
act.flow.type = libkineto::kLinkAsyncCpuGpu;
auto link_cor_id = record.linkCorrelationId;
act.linked = linkedActivity(link_cor_id);
act.linked = linked_activity(static_cast<int32_t>(link_cor_id));
logger.handleGenericActivity(act);
}

std::map<std::pair<int64_t, int64_t>, libkineto::ResourceInfo>
resource_infos = RecordsImpl::get().getResourceInfo();
for (const auto& kv : resource_infos) {
logger.handleResourceInfo(kv.second, startTime);
logger.handleResourceInfo(kv.second, start_time);
}

return records.size();
return static_cast<int32_t>(records.size());
}

void DIPUDeviceActivity::teardownContext() {}
Expand All @@ -98,6 +97,7 @@ void DIPUDeviceActivity::setMaxBufferSize(int32_t size) {}

namespace libkineto {

// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
DeviceActivityInterface* device_activity_singleton =
&dipu::profile::DIPUDeviceActivity::instance();

Expand Down
1 change: 0 additions & 1 deletion dipu/torch_dipu/csrc_dipu/profiler/DIPUDeviceActivity.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ class DIPUDeviceActivity : public libkineto::DeviceActivityInterface {
private:
DIPUDeviceActivity() = default;

private:
std::unordered_map<uint64_t, std::unique_ptr<libkineto::GenericTraceActivity>>
cpu_activities_;
std::unordered_map<uint64_t, std::unique_ptr<libkineto::GenericTraceActivity>>
Expand Down
124 changes: 68 additions & 56 deletions dipu/torch_dipu/csrc_dipu/profiler/patch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ ApproximateClockToUnixTimeConverter::measurePair() {
}

ApproximateClockToUnixTimeConverter::time_pairs
// NOLINTNEXTLINE(readability-convert-member-functions-to-static)
ApproximateClockToUnixTimeConverter::measurePairs() {
static constexpr auto n_warmup = 5;
for (C10_UNUSED const auto _ : c10::irange(n_warmup)) {
Expand All @@ -58,7 +59,8 @@ ApproximateClockToUnixTimeConverter::makeConverter() {
for (const auto i : c10::irange(replicates)) {
auto delta_ns = end_times[i].t_ - start_times_[i].t_;
auto delta_approx = end_times[i].approx_t_ - start_times_[i].approx_t_;
scale_factors[i] = (double)delta_ns / (double)delta_approx;
scale_factors[i] =
static_cast<double>(delta_ns) / static_cast<double>(delta_approx);
}
std::sort(scale_factors.begin(), scale_factors.end());
long double scale_factor = scale_factors[replicates / 2 + 1];
Expand All @@ -76,14 +78,18 @@ ApproximateClockToUnixTimeConverter::makeConverter() {
for (const auto i : c10::irange(replicates)) {
auto dt = start_times_[i].t_ - t0;
auto dt_approx =
(double)(start_times_[i].approx_t_ - t0_approx) * scale_factor;
t0_correction[i] = dt - (time_t)dt_approx;
static_cast<double>(start_times_[i].approx_t_ - t0_approx) *
scale_factor;
t0_correction[i] = static_cast<double>(dt - static_cast<time_t>(dt_approx));
}
t0 += t0_correction[t0_correction.size() / 2 + 1];
t0 += static_cast<time_t>(t0_correction[t0_correction.size() / 2 + 1]);

return [=](approx_time_t t_approx) {
// See above for why this is more stable than `A * t_approx + B`.
auto result = (time_t)((double)(t_approx - t0_approx) * scale_factor) + t0;
auto result =
static_cast<time_t>(static_cast<double>(t_approx - t0_approx) *
scale_factor) +
t0;
return result;
};
}
Expand All @@ -98,12 +104,13 @@ namespace linux_perf {
/*
* Syscall wrapper for perf_event_open(2)
*/
inline long perf_event_open(struct perf_event_attr* hw_event, pid_t pid,
int cpu, int group_fd, unsigned long flags) {
inline int64_t perf_event_open(struct perf_event_attr* hw_event, pid_t pid,
int cpu, int group_fd, uint64_t flags) {
return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}

// TODO sync with Kineto level abstract events in profiler/events.h
// TODO(caikun-pjlab): sync with Kineto level abstract events in
// profiler/events.h
static const std::unordered_map<
std::string, std::pair<perf_type_id, /* perf event type */ uint32_t>>
EventTable{{"cycles",
Expand Down Expand Up @@ -156,7 +163,7 @@ void PerfEvent::Init() {
pid_t pid = getpid(); // this pid
int cpu = -1; // all cpus
int group_fd = -1;
unsigned long flags = 0;
uint64_t flags = 0;

fd_ = static_cast<int>(perf_event_open(&attr, pid, cpu, group_fd, flags));
if (fd_ == -1) {
Expand All @@ -168,7 +175,7 @@ void PerfEvent::Init() {

uint64_t PerfEvent::ReadCounter() const {
PerfCounter counter{};
long n = read(fd_, &counter, sizeof(PerfCounter));
int64_t n = read(fd_, &counter, sizeof(PerfCounter));
TORCH_CHECK(n == sizeof(counter),
"Read failed for Perf event fd, event : ", name_,
", error: ", std::strerror(errno));
Expand Down Expand Up @@ -197,7 +204,7 @@ void PerfProfiler::Configure(std::vector<std::string>& event_names) {
events_.back().Init();
}

// TODO
// TODO(caikun-pjlab):
// Reset pthreadpool here to make sure we can attach to new children
// threads
}
Expand Down Expand Up @@ -265,7 +272,7 @@ activity_t* TraceWrapper::addCPUActivity(
auto& act = libkineto::CpuTraceBuffer::toRef(cpu_trace_->activities.back());
act.device = device_and_resource.device;
act.resource = device_and_resource.resource;
act.id = correlation_id;
act.id = static_cast<int32_t>(correlation_id);
act.startTime = start_time;
if (type != libkineto::ActivityType::CPU_INSTANT_EVENT) {
act.endTime = end_time;
Expand Down Expand Up @@ -318,9 +325,11 @@ void ActivityTraceWrapper::save(const std::string& path) {

// Attaches a key/value metadata pair to a trace activity.
// The activity is received as a const pointer, but the underlying
// addMetadata() member is non-const, so a const_cast is needed to call it
// (hence the NOLINT suppressing the const-cast check).
void addMetadata(const activity_t* activity, const std::string& key,
const std::string& value) {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
const_cast<activity_t*>(activity)->addMetadata(key, value);
}

// NOLINTNEXTLINE(readability-const-return-type)
const DeviceAndResource kineto_ids() {
#ifdef USE_KINETO
return {/*device=*/libkineto::processId(),
Expand All @@ -330,14 +339,14 @@ const DeviceAndResource kineto_ids() {
#endif // USE_KINETO
}

struct RegisterLibKinetoClient {
const struct RegisterLibKinetoClient {
RegisterLibKinetoClient() { libkineto::api(); }
} register_libkineto_client;

} // namespace kineto

namespace {
static constexpr TensorImplAddress NoTensorImpl{nullptr};
constexpr TensorImplAddress NoTensorImpl{nullptr};

struct RawTensorInfo {
TensorImplAddress impl_;
Expand Down Expand Up @@ -378,10 +387,51 @@ struct RawTensors {
}

template <typename T>
void operator()(T&) {}
void operator()(T& t) {}

std::vector<RawTensorInfo> tensors_;
};

// Flattens every tensor reference reachable from the profiler results into a
// single uniform list of RawTensorInfo, for later unique-ID assignment.
// Sources visited: torch-op inputs, nn.Module parameters (and their grads),
// and optimizer parameters plus their per-parameter state tensors.
// sorted_results: profiler events to scan, visited in order.
// tensors: output parameter; overwritten with the accumulated list.
void FlattenToUniformRepresentation(
std::vector<std::shared_ptr<Result>>& sorted_results,
std::vector<RawTensorInfo>& tensors) {
RawTensors raw_tensors;
// The python tracer caches values, so it's only safe to use the first case.
ska::flat_hash_set<PyModuleSelf> seen_modules;
ska::flat_hash_set<PyOptimizerSelf> seen_optimizers;
for (auto& result : sorted_results) {
result->visit(c10::overloaded(
// Torch op events: visit each input value; the RawTensors functor
// records only the tensor-typed ones.
[&](ExtraFields<EventType::TorchOp>& torch_op) {
for (auto& i : torch_op.inputs_) {
c10::visit(raw_tensors, i);
}
},
[&](ExtraFields<EventType::PyCall>& py_call) {
// torch.nn.Module
if (py_call.module_.has_value() &&
seen_modules.insert(py_call.module_->self_).second) {
for (auto& p : py_call.module_->parameters_) {
raw_tensors(p.metadata_);
raw_tensors(p.grad_metadata_);
}
}

// torch.optim.Optimizer
if (py_call.optimizer_.has_value() &&
seen_optimizers.insert(py_call.optimizer_->self_).second) {
for (auto& p : py_call.optimizer_->parameters_) {
raw_tensors(p.metadata_);
raw_tensors(p.grad_metadata_);
// Optimizer state (e.g. momentum buffers) also holds tensors.
for (auto& state_i : p.state_) {
raw_tensors(state_i.second);
}
}
}
},
// Any other event type: let RawTensors decide (no-op for non-tensors).
[&](auto& i) { raw_tensors(i); }));
}
// Hand the accumulated list back to the caller.
tensors = std::move(raw_tensors.tensors_);
}
} // namespace

void calculateUniqueTensorIDs(
Expand All @@ -393,45 +443,7 @@ void calculateUniqueTensorIDs(

// Flatten results to a uniform representation.
// --------------------------------------------------------------------------
{
RawTensors raw_tensors;

// The python tracer caches values, so it's only safe to use the first case.
ska::flat_hash_set<PyModuleSelf> seen_modules;
ska::flat_hash_set<PyOptimizerSelf> seen_optimizers;
for (auto& result : sorted_results) {
result->visit(c10::overloaded(
[&](ExtraFields<EventType::TorchOp>& torch_op) {
for (auto& i : torch_op.inputs_) {
c10::visit(raw_tensors, i);
}
},
[&](ExtraFields<EventType::PyCall>& py_call) {
// torch.nn.Module
if (py_call.module_.has_value() &&
seen_modules.insert(py_call.module_->self_).second) {
for (auto& p : py_call.module_->parameters_) {
raw_tensors(p.metadata_);
raw_tensors(p.grad_metadata_);
}
}

// torch.optim.Optimizer
if (py_call.optimizer_.has_value() &&
seen_optimizers.insert(py_call.optimizer_->self_).second) {
for (auto& p : py_call.optimizer_->parameters_) {
raw_tensors(p.metadata_);
raw_tensors(p.grad_metadata_);
for (auto& state_i : p.state_) {
raw_tensors(state_i.second);
}
}
}
},
[&](auto& i) { raw_tensors(i); }));
}
tensors = std::move(raw_tensors.tensors_);
}
FlattenToUniformRepresentation(sorted_results, tensors);

// Assign IDs to solve ABA for Storage.
// --------------------------------------------------------------------------
Expand All @@ -441,7 +453,7 @@ void calculateUniqueTensorIDs(
ska::flat_hash_map<key_t, size_t, HashCombine> versions;
for (auto& t : tensors) {
auto inserted = versions.insert({{t.storage_, t.device_}, counter});
counter += inserted.second;
counter += static_cast<size_t>(inserted.second);
t.allocation_id_ref_.get().emplace(AllocationID(inserted.first->second));
if (t.is_free_) {
versions.erase(inserted.first);
Expand Down Expand Up @@ -503,7 +515,7 @@ void calculateUniqueTensorIDs(
size_t current_id{0};
for (const auto& i : unique_pairs) {
auto inserted = id_map.insert({i.first, current_id});
current_id += inserted.second;
current_id += static_cast<size_t>(inserted.second);
id_map.insert({i.second, inserted.first->second});
}
}
Expand Down
Loading

0 comments on commit 77585a6

Please sign in to comment.