diff --git a/system/system_monitor/config/mem_monitor.param.yaml b/system/system_monitor/config/mem_monitor.param.yaml index e352d8ba0da97..d6444e553b305 100644 --- a/system/system_monitor/config/mem_monitor.param.yaml +++ b/system/system_monitor/config/mem_monitor.param.yaml @@ -3,3 +3,5 @@ available_size: 1024 # MB usage_timeout: 5 # sec ecc_timeout: 5 # sec + swap_usage_warn: 0.25 # % + swap_usage_error: 0.75 # % diff --git a/system/system_monitor/include/system_monitor/mem_monitor/mem_monitor.hpp b/system/system_monitor/include/system_monitor/mem_monitor/mem_monitor.hpp index 2cb0da0430956..e3c82513643f3 100644 --- a/system/system_monitor/include/system_monitor/mem_monitor/mem_monitor.hpp +++ b/system/system_monitor/include/system_monitor/mem_monitor/mem_monitor.hpp @@ -57,6 +57,15 @@ class MemMonitor : public rclcpp::Node void checkUsage( diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + /** + * @brief check Swap usage + * @param @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkSwapUsage( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + /** * @brief check Memory ECC * @param [out] stat diagnostic message passed directly to diagnostic publish calls @@ -97,6 +106,8 @@ class MemMonitor : public rclcpp::Node size_t available_size_; //!< @brief Memory available size to generate error int usage_timeout_; //!< @brief Timeout duration for executing readUsage int ecc_timeout_; //!< @brief Timeout duration for executing edac-util command + float swap_usage_warn_; //!< @brief Swap usage(%) to generate warning + float swap_usage_error_; //!< @brief Swap usage(%) to generate error rclcpp::TimerBase::SharedPtr timer_; //!< @brief Timer to execute readUsage and edac-utils command diff --git a/system/system_monitor/src/mem_monitor/mem_monitor.cpp b/system/system_monitor/src/mem_monitor/mem_monitor.cpp index a5e07917fd04c..373415b7f79ba 100644 --- a/system/system_monitor/src/mem_monitor/mem_monitor.cpp +++ b/system/system_monitor/src/mem_monitor/mem_monitor.cpp @@ -38,6 +38,8 @@ MemMonitor::MemMonitor(const rclcpp::NodeOptions & options) available_size_(declare_parameter("available_size", 1024) * 1024 * 1024), usage_timeout_(declare_parameter("usage_timeout", 5)), ecc_timeout_(declare_parameter("ecc_timeout", 5)), + swap_usage_warn_(declare_parameter("swap_usage_warn", 0.25)), + swap_usage_error_(declare_parameter("swap_usage_error", 0.75)), usage_elapsed_ms_(0.0), ecc_elapsed_ms_(0.0), use_edac_util_(false) @@ -47,6 +49,7 @@ MemMonitor::MemMonitor(const rclcpp::NodeOptions & options) gethostname(hostname_, sizeof(hostname_)); updater_.setHardwareID(hostname_); updater_.add("Memory Usage", this, &MemMonitor::checkUsage); + updater_.add("Swap Usage", this, &MemMonitor::checkSwapUsage); // Start timer to execute checkUsage and checkEcc timer_callback_group_ = this->create_callback_group(rclcpp::CallbackGroupType::MutuallyExclusive); @@ -116,6 +119,54 @@ void MemMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) stat.addf("execution time", "%f ms", elapsed_ms); } +void MemMonitor::checkSwapUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + std::string error_str; + std::map map; + double elapsed_ms; + + // thread-safe copy + { + std::lock_guard lock(usage_mutex_); + error_str = usage_error_str_; + map = usage_map_; + elapsed_ms = usage_elapsed_ms_; + } + + if (!error_str.empty()) { + stat.summary(DiagStatus::ERROR, "readUsage error"); + stat.add("readUsage", error_str); + return; + } + + // Check if Swap Usage + const auto swap_usage = static_cast(map["Swap: usage"]) / 1e+2; + int level = DiagStatus::OK; + + if (swap_usage >= swap_usage_error_) { + level = std::max(level, static_cast(DiagStatus::ERROR)); + } else if (swap_usage >= swap_usage_warn_) { + level = std::max(level, static_cast(DiagStatus::WARN)); + } + + stat.addf("Swap: usage", "%.2f%%", static_cast(map["Swap: usage"])); + stat.add("Swap: total", toHumanReadable(std::to_string(map["Swap: total"]))); + stat.add("Swap: used", toHumanReadable(std::to_string(map["Swap: used"]))); + stat.add("Swap: free", toHumanReadable(std::to_string(map["Swap: free"]))); + + if (level == DiagStatus::ERROR) { + stat.summary(level, usage_dict_.at(level)); + } else if (elapsed_ms == 0.0) { + stat.summary(DiagStatus::WARN, "do not execute readUsage yet"); + } else if (elapsed_ms > usage_timeout_ * 1000) { + stat.summary(DiagStatus::WARN, "readUsage timeout expired"); + } else { + stat.summary(level, usage_dict_.at(level)); + } + + stat.addf("execution time", "%f ms", elapsed_ms); +} + void MemMonitor::checkEcc(diagnostic_updater::DiagnosticStatusWrapper & stat) { std::string error_str; @@ -252,6 +303,7 @@ std::string MemMonitor::readUsage(std::map & map) map["Swap: total"] = swap_total; map["Swap: used"] = swap_used; map["Swap: free"] = swap_free; + map["Swap: usage"] = (swap_total > 0) ? static_cast(swap_used) / swap_total * 1e+2 : 0.0; size_t total_total = mem_total + swap_total; size_t total_used = mem_used + swap_used;