diff --git a/system/system_monitor/config/mem_monitor.param.yaml b/system/system_monitor/config/mem_monitor.param.yaml index f2da3972be91a..e352d8ba0da97 100644 --- a/system/system_monitor/config/mem_monitor.param.yaml +++ b/system/system_monitor/config/mem_monitor.param.yaml @@ -1,3 +1,5 @@ /**: ros__parameters: available_size: 1024 # MB + usage_timeout: 5 # sec + ecc_timeout: 5 # sec diff --git a/system/system_monitor/include/system_monitor/mem_monitor/mem_monitor.hpp b/system/system_monitor/include/system_monitor/mem_monitor/mem_monitor.hpp index 11bf70e7fdc73..2cb0da0430956 100644 --- a/system/system_monitor/include/system_monitor/mem_monitor/mem_monitor.hpp +++ b/system/system_monitor/include/system_monitor/mem_monitor/mem_monitor.hpp @@ -22,9 +22,14 @@ #include +#include + #include #include #include +#include + +namespace bp = boost::process; class MemMonitor : public rclcpp::Node { @@ -52,6 +57,32 @@ class MemMonitor : public rclcpp::Node void checkUsage( diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + /** + * @brief check Memory ECC + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + */ + void checkEcc(diagnostic_updater::DiagnosticStatusWrapper & stat); + + /** + * @brief read /proc/meminfo into memInfo; throws std::runtime_error on open or parse failure + */ + void readMemInfo(std::unordered_map & memInfo); + + /** + * @brief call readMemInfo and calculate memory usage; returns an error string, empty on success + */ + std::string readUsage(std::map & map); + + /** + * @brief execute edac-util command; returns its stderr output when the exit code is non-zero + */ + std::string executeEdacUtil(std::string & output, std::string & pipe2_error_str); + + /** + * @brief Timer callback to read information about memory usage and ECC + */ + void onTimer(); + /** * @brief get human-readable output for memory size * @param [in] str size with bytes @@ -64,6 +95,27 @@ class MemMonitor : public rclcpp::Node char hostname_[HOST_NAME_MAX + 1]; //!< @brief host name size_t available_size_; //!< @brief Memory available size to generate error + int usage_timeout_; 
//!< @brief Timeout duration for executing readUsage + int ecc_timeout_; //!< @brief Timeout duration for executing edac-util command + + rclcpp::TimerBase::SharedPtr + timer_; //!< @brief Timer to execute readUsage and edac-util command + rclcpp::CallbackGroup::SharedPtr timer_callback_group_; //!< @brief Callback Group used by timer_ + + std::mutex usage_mutex_; //!< @brief Mutex guarding usage_error_str_, usage_map_ and usage_elapsed_ms_ + std::string usage_error_str_; //!< @brief Error string returned by readUsage + std::map usage_map_; //!< @brief Usage values calculated by readUsage from /proc/meminfo + double usage_elapsed_ms_; //!< @brief Execution time of readUsage + std::mutex usage_timeout_mutex_; //!< @brief Mutex regarding timeout for executing readUsage (NOTE(review): not referenced by the mem_monitor.cpp changes in this patch) + + std::mutex ecc_mutex_; //!< @brief Mutex guarding the ecc_* results of the edac-util command + std::string ecc_error_str_; //!< @brief Error string returned by executeEdacUtil + std::string ecc_pipe2_error_str_; //!< @brief Error string regarding pipe2 function call + std::string ecc_output_; //!< @brief Output of edac-util command + double ecc_elapsed_ms_; //!< @brief Execution time of edac-util command + std::mutex + ecc_timeout_mutex_; //!< @brief Mutex regarding timeout for executing edac-util command (NOTE(review): not referenced by the mem_monitor.cpp changes in this patch) + bool use_edac_util_; //!< @brief Available to use edac-util command or not /** * @brief Memory usage status messages diff --git a/system/system_monitor/src/mem_monitor/mem_monitor.cpp b/system/system_monitor/src/mem_monitor/mem_monitor.cpp index 489a4dc72bbe8..a5e07917fd04c 100644 --- a/system/system_monitor/src/mem_monitor/mem_monitor.cpp +++ b/system/system_monitor/src/mem_monitor/mem_monitor.cpp @@ -21,6 +21,8 @@ #include "system_monitor/system_monitor_utility.hpp" +#include + #include #include @@ -33,124 +35,315 @@ namespace bp = boost::process; MemMonitor::MemMonitor(const rclcpp::NodeOptions & options) : Node("mem_monitor", options), updater_(this), - available_size_(declare_parameter("available_size", 1024) * 1024 * 1024) + available_size_(declare_parameter("available_size", 1024) * 1024 * 1024), + 
usage_timeout_(declare_parameter("usage_timeout", 5)), + ecc_timeout_(declare_parameter("ecc_timeout", 5)), + usage_elapsed_ms_(0.0), + ecc_elapsed_ms_(0.0), + use_edac_util_(false) { + using namespace std::literals::chrono_literals; + gethostname(hostname_, sizeof(hostname_)); updater_.setHardwareID(hostname_); updater_.add("Memory Usage", this, &MemMonitor::checkUsage); + + // Start timer that runs onTimer (readUsage and edac-util); checkUsage and checkEcc only report the stored results + timer_callback_group_ = this->create_callback_group(rclcpp::CallbackGroupType::MutuallyExclusive); + timer_ = rclcpp::create_timer( + this, get_clock(), 1s, std::bind(&MemMonitor::onTimer, this), timer_callback_group_); + + // Enable ECC error detection if edac-utils package is installed + if (!bp::search_path("edac-util").empty()) { + updater_.add("Memory ECC", this, &MemMonitor::checkEcc); + use_edac_util_ = true; + } } -void MemMonitor::update() { updater_.force_update(); } +void MemMonitor::update() +{ + updater_.force_update(); +} void MemMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) { - // Remember start time to measure elapsed time - const auto t_start = SystemMonitorUtility::startMeasurement(); + std::string error_str; + std::map map; + double elapsed_ms; + + // thread-safe copy of the values last stored by onTimer + { + std::lock_guard lock(usage_mutex_); + error_str = usage_error_str_; + map = usage_map_; + elapsed_ms = usage_elapsed_ms_; + } + + if (!error_str.empty()) { + stat.summary(DiagStatus::ERROR, "readUsage error"); + stat.add("readUsage", error_str); + return; + } + + // Check if Memory Usage is sound state + int level; + if (map["Mem: total"] > map["Total: used+"]) { + level = DiagStatus::OK; + } else if (map["Mem: available"] >= available_size_) { + level = DiagStatus::WARN; + } else { + level = DiagStatus::ERROR; + } + + for (auto itr = map.begin(); itr != map.end(); ++itr) { + if (itr->first == "Mem: usage") { + stat.addf(itr->first, "%.2f%%", static_cast(itr->second)); + } else { + stat.add(itr->first, 
toHumanReadable(std::to_string(itr->second))); + } + } + + if (elapsed_ms == 0.0) { // readUsage has not produced a result yet, so `level` was computed from an empty map + stat.summary(DiagStatus::WARN, "do not execute readUsage yet"); + } else if (level == DiagStatus::ERROR) { // a measured shortage outranks the timeout warning below + stat.summary(level, usage_dict_.at(level)); + } else if (elapsed_ms > usage_timeout_ * 1000) { + stat.summary(DiagStatus::WARN, "readUsage timeout expired"); + } else { + stat.summary(level, usage_dict_.at(level)); + } + + stat.addf("execution time", "%f ms", elapsed_ms); +} + +void MemMonitor::checkEcc(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + std::string error_str; + std::string pipe2_error_str; + std::string output; + double elapsed_ms = 0.0; + // thread-safe copy of everything onTimer stored for ECC; only these locals are used below + { + std::lock_guard lock(ecc_mutex_); + error_str = ecc_error_str_; + pipe2_error_str = ecc_pipe2_error_str_; + output = ecc_output_; + elapsed_ms = ecc_elapsed_ms_; + } + + if (!pipe2_error_str.empty()) { + stat.summary(DiagStatus::ERROR, "pipe2 error"); + stat.add("pipe2", pipe2_error_str); + return; + } + if (!error_str.empty()) { + stat.summary(DiagStatus::ERROR, "edac_util error"); + stat.add("edac_util", error_str); + return; + } + + /* + Output example of `edac-util --quiet` + edac-util generates output if error occurred, otherwise no output + mc0: 3 Uncorrected Errors with no DIMM info + mc0: 3 Corrected Errors with no DIMM info + */ + std::istringstream iss(output); // parse the locked copy, never the shared ecc_output_ member + std::string line; + + while (std::getline(iss, line)) { // the first matching line decides the summary + if (line.find("Uncorrected") != std::string::npos) { + stat.summary(DiagStatus::ERROR, line); + return; + } else if (line.find("Corrected") != std::string::npos) { + stat.summary(DiagStatus::WARN, line); + return; + } + } + + if (elapsed_ms == 0.0) { + stat.summary(DiagStatus::WARN, "do not execute edac-util yet"); + } else if (elapsed_ms > ecc_timeout_ * 1000) { + stat.summary(DiagStatus::WARN, "edac-util timeout expired"); + } else { + stat.summary(DiagStatus::OK, "OK"); + } + + stat.addf("execution time", "%f ms", elapsed_ms); +} + +void MemMonitor::readMemInfo(std::unordered_map & memInfo) +{ + 
std::ifstream file("/proc/meminfo"); + if (!file.is_open()) { + throw std::runtime_error("Could not open /proc/meminfo"); + } + + std::string line; + while (std::getline(file, line)) { + std::size_t pos = line.find(':'); + if (pos != std::string::npos) { + std::string key = line.substr(0, pos); + try { + size_t value = std::stoll(line.substr(pos + 1)) * 1024; // scaled by 1024: values presumably in kB; NOTE(review): unit-less fields are scaled too + memInfo[key] = value; + } catch (const std::invalid_argument & e) { + throw std::runtime_error("Invalid value in /proc/meminfo: " + line); + } catch (const std::out_of_range & e) { + throw std::runtime_error("Value out of range in /proc/meminfo: " + line); + } + } else { + throw std::runtime_error("Invalid line in /proc/meminfo: " + line); + } + } +} + +std::string MemMonitor::readUsage(std::map & map) +{ // Get total amount of free and used memory + std::unordered_map memInfo; + size_t mem_total = 0; + size_t mem_free = 0; + size_t mem_shared = 0; + size_t mem_available = 0; + size_t slab_reclaimable = 0; + size_t buffers = 0; + size_t cached = 0; + size_t swap_total = 0; + size_t swap_free = 0; + + try { + readMemInfo(memInfo); + } catch (const std::exception & e) { // open/parse failures become the returned error string + return e.what(); + } + + try { + mem_total = memInfo.at("MemTotal"); + mem_free = memInfo.at("MemFree"); + mem_shared = memInfo.at("Shmem"); + mem_available = memInfo.at("MemAvailable"); + slab_reclaimable = memInfo.at("SReclaimable"); + buffers = memInfo.at("Buffers"); + cached = memInfo.at("Cached"); + swap_total = memInfo.at("SwapTotal"); + swap_free = memInfo.at("SwapFree"); + } catch (const std::out_of_range & e) { // at() alone yields an implementation-defined message, so add context + return std::string("Missing key in /proc/meminfo: ") + e.what(); + } + + if (mem_total == 0) { + return "Usage calculate error: mem_info is zero"; + } + + float usage = 1.0f - static_cast(mem_available) / mem_total; + size_t mem_buff_and_cache = buffers + cached + slab_reclaimable; + size_t mem_used = (mem_free + mem_buff_and_cache > mem_total) ? mem_total - mem_free : mem_total - mem_free - mem_buff_and_cache; // fall back to total - free instead of wrapping around in size_t + map["Mem: usage"] = usage * 1e+2; + map["Mem: total"] = mem_total; + map["Mem: used"] = mem_used; + map["Mem: free"] = mem_free; 
+ map["Mem: shared"] = mem_shared; + map["Mem: buff/cache"] = mem_buff_and_cache; + map["Mem: available"] = mem_available; + + size_t swap_used = swap_total - swap_free; + map["Swap: total"] = swap_total; + map["Swap: used"] = swap_used; + map["Swap: free"] = swap_free; + + size_t total_total = mem_total + swap_total; + size_t total_used = mem_used + swap_used; + size_t total_free = mem_free + swap_free; + size_t used_plus = total_used + mem_shared; + map["Total: total"] = total_total; + map["Total: used"] = total_used; + map["Total: free"] = total_free; + map["Total: used+"] = used_plus; + + return ""; // empty string means success + +} + +std::string MemMonitor::executeEdacUtil(std::string & output, std::string & pipe2_error_str) +{ + std::string result = ""; + std::ostringstream os; // the pipes below are created with pipe2(O_CLOEXEC) and passed to boost::process, which would otherwise create descriptors without O_CLOEXEC (required for multithreading) - // boost::process create file descriptor without O_CLOEXEC required for multithreading. - // So create file descriptor with O_CLOEXEC and pass it to boost::process. int out_fd[2]; if (pipe2(out_fd, O_CLOEXEC) != 0) { - stat.summary(DiagStatus::ERROR, "pipe2 error"); - stat.add("pipe2", strerror(errno)); - return; + pipe2_error_str = std::string(strerror(errno)); + return result; } bp::pipe out_pipe{out_fd[0], out_fd[1]}; bp::ipstream is_out{std::move(out_pipe)}; int err_fd[2]; if (pipe2(err_fd, O_CLOEXEC) != 0) { - stat.summary(DiagStatus::ERROR, "pipe2 error"); - stat.add("pipe2", strerror(errno)); - return; + pipe2_error_str = std::string(strerror(errno)); + return result; } bp::pipe err_pipe{err_fd[0], err_fd[1]}; bp::ipstream is_err{std::move(err_pipe)}; - bp::child c("free -tb", bp::std_out > is_out, bp::std_err > is_err); + bp::child c("edac-util --quiet", bp::std_out > is_out, bp::std_err > is_err); c.wait(); if (c.exit_code() != 0) { - std::ostringstream os; is_err >> os.rdbuf(); - stat.summary(DiagStatus::ERROR, "free error"); - stat.add("free", os.str().c_str()); - return; + result = os.str().c_str(); // stderr text becomes the error string; NOTE(review): with an empty stderr the failure looks like success to checkEcc + return result; } + is_out >> os.rdbuf(); + output = os.str().c_str(); + return result; +} - 
std::string line; - int index = 0; - std::vector list; - float usage; - size_t mem_total = 0; - size_t mem_shared = 0; - size_t mem_available = 0; - size_t used_plus = 0; +void MemMonitor::onTimer() +{ + tier4_autoware_utils::StopWatch stop_watch; - /* - Output example of `free -tb` - - list[0] list[1] list[2] list[3] list[4] list[5] list[6] - index 0 | total used free shared buff/cache available - index 1 | Mem: 32809744 12554780 13090376 292840 7164588 19622092 - index 2 | Swap: 33554428 1767680 31786748 - index 3 | Total: 66364172 14322460 44877124 - */ - while (std::getline(is_out, line) && !line.empty()) { - // Skip header - if (index <= 0) { - ++index; - continue; - } + // Check Memory Usage + { + // Start to measure elapsed time of readUsage + stop_watch.tic("usage_execution_time"); - boost::split(list, line, boost::is_space(), boost::token_compress_on); + std::string error_str; + std::map map; - // Physical memory - if (index == 1) { - mem_total = std::atoll(list[1].c_str()); - mem_shared = std::atoll(list[4].c_str()); - mem_available = std::atoll(list[6].c_str()); + error_str = readUsage(map); - // available divided by total is available memory including calculation for buff/cache, - // so the subtraction of this from 1 gives real usage. 
- usage = 1.0f - static_cast(mem_available) / mem_total; - stat.addf(fmt::format("{} usage", list[0]), "%.2f%%", usage * 1e+2); - } + const double elapsed_ms = stop_watch.toc("usage_execution_time"); - stat.add(fmt::format("{} total", list[0]), toHumanReadable(list[1])); - stat.add(fmt::format("{} used", list[0]), toHumanReadable(list[2])); - stat.add(fmt::format("{} free", list[0]), toHumanReadable(list[3])); - - // Add an additional information for physical memory - if (index == 1) { - stat.add(fmt::format("{} shared", list[0]), toHumanReadable(list[4])); - stat.add(fmt::format("{} buff/cache", list[0]), toHumanReadable(list[5])); - stat.add(fmt::format("{} available", list[0]), toHumanReadable(list[6])); - } else if (index == 3) { - // Total:used + Mem:shared - used_plus = std::atoll(list[2].c_str()) + mem_shared; - double giga = static_cast(used_plus) / (1024 * 1024 * 1024); - stat.add(fmt::format("{} used+", list[0]), fmt::format("{:.1f}{}", giga, "G")); - } else { - /* nothing */ + // thread-safe copy: store the result for checkUsage under usage_mutex_ + { + std::lock_guard lock(usage_mutex_); + usage_error_str_ = error_str; + usage_map_ = map; + usage_elapsed_ms_ = elapsed_ms; } - ++index; } - int level; - if (mem_total > used_plus) { - level = DiagStatus::OK; - } else if (mem_available >= available_size_) { - level = DiagStatus::WARN; - } else { - level = DiagStatus::ERROR; - } + // Check ECC Error (skipped unless the constructor found edac-util) + if (use_edac_util_) { + stop_watch.tic("ecc_execution_time"); + + std::string error_str; + std::string pipe2_error_str; + std::string output; - stat.summary(level, usage_dict_.at(level)); + error_str = executeEdacUtil(output, pipe2_error_str); - // Measure elapsed time since start time and report - SystemMonitorUtility::stopMeasurement(t_start, stat); + const double elapsed_ms = stop_watch.toc("ecc_execution_time"); + + // thread-safe copy: store the result for checkEcc under ecc_mutex_ + { + std::lock_guard lock(ecc_mutex_); + ecc_error_str_ = error_str; + ecc_pipe2_error_str_ = pipe2_error_str; + ecc_output_ = output; + ecc_elapsed_ms_ = elapsed_ms; + } + } 
} std::string MemMonitor::toHumanReadable(const std::string & str)