Skip to content

Commit

Permalink
feat(system_monitor): add diagnostic feature for monitoring swap usage
Browse files Browse the repository at this point in the history
  • Loading branch information
ito-san committed Nov 24, 2024
1 parent efe7c62 commit 4ae1b85
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 0 deletions.
2 changes: 2 additions & 0 deletions system/system_monitor/config/mem_monitor.param.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@
available_size: 1024 # MB
usage_timeout: 5 # sec
ecc_timeout: 5 # sec
swap_usage_warn: 0.25 # %
swap_usage_error: 0.75 # %
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,15 @@ class MemMonitor : public rclcpp::Node
void checkUsage(
diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references)

/**
* @brief check Swap usage
* @param @param [out] stat diagnostic message passed directly to diagnostic publish calls
* @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference
* to pass diagnostic message updated in this function to diagnostic publish calls.
*/
void checkSwapUsage(
diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references)

/**
* @brief check Memory ECC
* @param [out] stat diagnostic message passed directly to diagnostic publish calls
Expand Down Expand Up @@ -97,6 +106,8 @@ class MemMonitor : public rclcpp::Node
size_t available_size_; //!< @brief Memory available size to generate error
int usage_timeout_; //!< @brief Timeout duration for executing readUsage
int ecc_timeout_; //!< @brief Timeout duration for executing edac-util command
float swap_usage_warn_; //!< @brief Swap usage(%) to generate warning
float swap_usage_error_; //!< @brief Swap usage(%) to generate error

rclcpp::TimerBase::SharedPtr
timer_; //!< @brief Timer to execute readUsage and edac-utils command
Expand Down
52 changes: 52 additions & 0 deletions system/system_monitor/src/mem_monitor/mem_monitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ MemMonitor::MemMonitor(const rclcpp::NodeOptions & options)
available_size_(declare_parameter<int>("available_size", 1024) * 1024 * 1024),
usage_timeout_(declare_parameter<int>("usage_timeout", 5)),
ecc_timeout_(declare_parameter<int>("ecc_timeout", 5)),
swap_usage_warn_(declare_parameter<float>("swap_usage_warn", 0.25)),
swap_usage_error_(declare_parameter<float>("swap_usage_error", 0.75)),
usage_elapsed_ms_(0.0),
ecc_elapsed_ms_(0.0),
use_edac_util_(false)
Expand All @@ -47,6 +49,7 @@ MemMonitor::MemMonitor(const rclcpp::NodeOptions & options)
gethostname(hostname_, sizeof(hostname_));
updater_.setHardwareID(hostname_);
updater_.add("Memory Usage", this, &MemMonitor::checkUsage);
updater_.add("Swap Usage", this, &MemMonitor::checkSwapUsage);

// Start timer to execute checkUsage and checkEcc
timer_callback_group_ = this->create_callback_group(rclcpp::CallbackGroupType::MutuallyExclusive);
Expand Down Expand Up @@ -116,6 +119,54 @@ void MemMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat)
stat.addf("execution time", "%f ms", elapsed_ms);
}

void MemMonitor::checkSwapUsage(diagnostic_updater::DiagnosticStatusWrapper & stat)
{
std::string error_str;
std::map<std::string, size_t> map;
double elapsed_ms;

// thread-safe copy
{
std::lock_guard<std::mutex> lock(usage_mutex_);
error_str = usage_error_str_;
map = usage_map_;
elapsed_ms = usage_elapsed_ms_;
}

if (!error_str.empty()) {
stat.summary(DiagStatus::ERROR, "readUsage error");
stat.add("readUsage", error_str);
return;
}

// Check if Swap Usage
const auto swap_usage = static_cast<double>(map["Swap: usage"]) / 1e+2;
int level = DiagStatus::OK;

if (swap_usage >= swap_usage_error_) {
level = std::max(level, static_cast<int>(DiagStatus::ERROR));
} else if (swap_usage >= swap_usage_warn_) {
level = std::max(level, static_cast<int>(DiagStatus::WARN));
}

stat.addf("Swap: usage", "%.2f%%", static_cast<double>(map["Swap: usage"]));
stat.add("Swap: total", toHumanReadable(std::to_string(map["Swap: total"])));
stat.add("Swap: used", toHumanReadable(std::to_string(map["Swap: used"])));
stat.add("Swap: free", toHumanReadable(std::to_string(map["Swap: free"])));

if (level == DiagStatus::ERROR) {
stat.summary(level, usage_dict_.at(level));
} else if (elapsed_ms == 0.0) {
stat.summary(DiagStatus::WARN, "do not execute readUsage yet");
} else if (elapsed_ms > usage_timeout_ * 1000) {
stat.summary(DiagStatus::WARN, "readUsage timeout expired");
} else {
stat.summary(level, usage_dict_.at(level));
}

stat.addf("execution time", "%f ms", elapsed_ms);
}

void MemMonitor::checkEcc(diagnostic_updater::DiagnosticStatusWrapper & stat)
{
std::string error_str;
Expand Down Expand Up @@ -252,6 +303,7 @@ std::string MemMonitor::readUsage(std::map<std::string, size_t> & map)
map["Swap: total"] = swap_total;
map["Swap: used"] = swap_used;
map["Swap: free"] = swap_free;
map["Swap: usage"] = (swap_total > 0) ? static_cast<double>(swap_used) / swap_total * 1e+2 : 0.0;

size_t total_total = mem_total + swap_total;
size_t total_used = mem_used + swap_used;
Expand Down

0 comments on commit 4ae1b85

Please sign in to comment.