diff --git a/system/system_monitor/README.md b/system/system_monitor/README.md index 6203ad45d4b3a..d00c57c895279 100644 --- a/system/system_monitor/README.md +++ b/system/system_monitor/README.md @@ -77,6 +77,7 @@ Every topic is published in 1 minute interval. | | Network Usage | ✓ | ✓ | ✓ | Notification of usage only, normally error not generated. | | | Network CRC Error | ✓ | ✓ | ✓ | Warning occurs when the number of CRC errors in the period reaches the threshold value. The number of CRC errors that occur is the same as the value that can be confirmed with the ip command. | | | IP Packet Reassembles Failed | ✓ | ✓ | ✓ | | +| | UDP Buf Errors | ✓ | ✓ | ✓ | | | NTP Monitor | NTP Offset | ✓ | ✓ | ✓ | | | Process Monitor | Tasks Summary | ✓ | ✓ | ✓ | | | | High-load Proc[0-9] | ✓ | ✓ | ✓ | | diff --git a/system/system_monitor/config/net_monitor.param.yaml b/system/system_monitor/config/net_monitor.param.yaml index 7a1e5aeff2db1..b366e26395641 100644 --- a/system/system_monitor/config/net_monitor.param.yaml +++ b/system/system_monitor/config/net_monitor.param.yaml @@ -7,3 +7,5 @@ crc_error_count_threshold: 1 reassembles_failed_check_duration: 1 reassembles_failed_check_count: 1 + udp_buf_errors_check_duration: 1 + udp_buf_errors_check_count: 1 diff --git a/system/system_monitor/docs/ros_parameters.md b/system/system_monitor/docs/ros_parameters.md index 1d081d513c85c..8ec952e60e242 100644 --- a/system/system_monitor/docs/ros_parameters.md +++ b/system/system_monitor/docs/ros_parameters.md @@ -69,6 +69,8 @@ net_monitor: | crc_error_count_threshold | int | n/a | 1 | Generates warning when count of CRC errors during CRC error check duration reaches a specified value or higher. | | reassembles_failed_check_duration | int | sec | 1 | IP packet reassembles failed check duration. | | reassembles_failed_check_count | int | n/a | 1 | Generates warning when count of IP packet reassembles failed during IP packet reassembles failed check duration reaches a specified value or higher. | +| udp_buf_errors_check_duration | int | sec | 1 | UDP buf errors check duration. | +| udp_buf_errors_check_count | int | n/a | 1 | Generates warning when count of UDP buf errors during udp_buf_errors_check_duration reaches a specified value or higher. | ## NTP Monitor diff --git a/system/system_monitor/docs/topics_net_monitor.md b/system/system_monitor/docs/topics_net_monitor.md index edb067c0a5be7..84afb250929b8 100644 --- a/system/system_monitor/docs/topics_net_monitor.md +++ b/system/system_monitor/docs/topics_net_monitor.md @@ -106,3 +106,23 @@ | --------------------------------------- | --------------- | | total packet reassembles failed | 0 | | packet reassembles failed per unit time | 0 | + +## UDP Buf Errors + +/diagnostics/net_monitor: UDP Buf Errors + +[summary] + +| level | message | +| ----- | -------------- | +| OK | OK | +| WARN | UDP buf errors | + +[values] + +| key | value (example) | +| -------------------------------- | --------------- | +| total UDP rcv buf errors | 0 | +| UDP rcv buf errors per unit time | 0 | +| total UDP snd buf errors | 0 | +| UDP snd buf errors per unit time | 0 | diff --git a/system/system_monitor/include/system_monitor/net_monitor/net_monitor.hpp b/system/system_monitor/include/system_monitor/net_monitor/net_monitor.hpp index 5aa2cc9790143..77536c4a60615 100644 --- a/system/system_monitor/include/system_monitor/net_monitor/net_monitor.hpp +++ b/system/system_monitor/include/system_monitor/net_monitor/net_monitor.hpp @@ -81,6 +81,93 @@ struct CrcErrors unsigned int last_rx_crc_errors{0}; //!< @brief rx_crc_error at the time of the last monitoring }; +/** + * @brief /proc/net/snmp information + */ +class NetSnmp +{ +public: + enum class Result { + OK, + CHECK_WARNING, + READ_ERROR, + }; + + /** + * @brief Constructor + * @param [in] node node using this class. + */ + explicit NetSnmp(rclcpp::Node * node); + + /** + * @brief Constructor + */ + NetSnmp() = delete; + + /** + * @brief Copy constructor + */ + NetSnmp(const NetSnmp &) = delete; + + /** + * @brief Copy assignment operator + */ + NetSnmp & operator=(const NetSnmp &) = delete; + + /** + * @brief Move constructor + */ + NetSnmp(const NetSnmp &&) = delete; + + /** + * @brief Move assignment operator + */ + NetSnmp & operator=(const NetSnmp &&) = delete; + + /** + * @brief Set parameters for check + * @param [in] check_duration the value for check_duration + * @param [in] check_count the value for check_count + */ + void set_check_parameters(unsigned int check_duration, unsigned int check_count); + + /** + * @brief Find index in `/proc/net/snmp` + * @param [in] protocol Protocol name (the first column string). e.g. "Ip:" or "Udp:" + * @param [in] metrics Metrics name. e.g. "ReasmFails" + */ + void find_index(const std::string & protocol, const std::string & metrics); + + /** + * @brief Check metrics + * @param [out] current_value the value read from snmp + * @param [out] value_per_unit_time the increase of the value during the duration + * @return the result of check + */ + Result check_metrics(uint64_t & current_value, uint64_t & value_per_unit_time); + +private: + /** + * @brief Read value from `/proc/net/snmp` + * @param [in] index_row row in `/proc/net/snmp` + * @param [in] index_col col in `/proc/net/snmp` + * @param [out] output_value retrieved value + * @return execution result + */ + bool read_value_from_proc( + unsigned int index_row, unsigned int index_col, uint64_t & output_value); + + rclcpp::Logger logger_; //!< @brief logger gotten from user node + unsigned int check_duration_; //!< @brief check duration + unsigned int check_count_; //!< @brief check count threshold + unsigned int index_row_; //!< @brief index for the target metrics in /proc/net/snmp + unsigned int index_col_; //!< @brief index for the target metrics in /proc/net/snmp + uint64_t current_value_; //!< @brief the value read from snmp + uint64_t last_value_; //!< @brief the value read from snmp at the last monitoring + uint64_t value_per_unit_time_; //!< @brief the increase of the value during the duration + std::deque queue_; //!< @brief queue that holds the delta of the value +}; + namespace local = boost::asio::local; class NetMonitor : public rclcpp::Node @@ -150,6 +237,12 @@ class NetMonitor : public rclcpp::Node */ void check_reassembles_failed(diagnostic_updater::DiagnosticStatusWrapper & status); + /** + * @brief Check UDP buf errors + * @param [out] status diagnostic message passed directly to diagnostic publish calls + */ + void check_udp_buf_errors(diagnostic_updater::DiagnosticStatusWrapper & status); + /** * @brief Timer callback */ @@ -273,18 +366,6 @@ class NetMonitor : public rclcpp::Node */ void close_connection(); - /** - * @brief Get column index of IP packet reassembles failed from `/proc/net/snmp` - */ - void get_reassembles_failed_column_index(); - - /** - * @brief get IP packet reassembles failed - * @param [out] reassembles_failed IP packet reassembles failed - * @return execution result - */ - bool get_reassembles_failed(uint64_t & reassembles_failed); - diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics rclcpp::TimerBase::SharedPtr timer_; //!< @brief timer to get Network information @@ -307,16 +388,9 @@ class NetMonitor : public rclcpp::Node unsigned int crc_error_check_duration_; //!< @brief CRC error check duration unsigned int crc_error_count_threshold_; //!< @brief CRC error count threshold - std::deque - reassembles_failed_queue_; //!< @brief queue that holds count of IP packet reassembles failed - uint64_t last_reassembles_failed_; //!< @brief IP packet reassembles failed at the time of the - //!< last monitoring - unsigned int - reassembles_failed_check_duration_; //!< @brief IP packet reassembles failed check duration - unsigned int - reassembles_failed_check_count_; //!< @brief IP packet reassembles failed check count threshold - unsigned int reassembles_failed_column_index_; //!< @brief column index of IP Reassembles failed - //!< in /proc/net/snmp + NetSnmp reassembles_failed_info_; //!< @brief information of IP packet reassembles failed + NetSnmp udp_rcvbuf_errors_info_; //!< @brief information of UDP rcv buf errors + NetSnmp udp_sndbuf_errors_info_; //!< @brief information of UDP snd buf errors /** * @brief Network connection status messages diff --git a/system/system_monitor/src/net_monitor/net_monitor.cpp b/system/system_monitor/src/net_monitor/net_monitor.cpp index d99150f0b3037..fd00d7f7b895b 100644 --- a/system/system_monitor/src/net_monitor/net_monitor.cpp +++ b/system/system_monitor/src/net_monitor/net_monitor.cpp @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -51,11 +52,9 @@ NetMonitor::NetMonitor(const rclcpp::NodeOptions & options) socket_path_(declare_parameter("socket_path", traffic_reader_service::socket_path)), crc_error_check_duration_(declare_parameter("crc_error_check_duration", 1)), crc_error_count_threshold_(declare_parameter("crc_error_count_threshold", 1)), - last_reassembles_failed_(0), - reassembles_failed_check_duration_( - declare_parameter("reassembles_failed_check_duration", 1)), - reassembles_failed_check_count_(declare_parameter("reassembles_failed_check_count", 1)), - reassembles_failed_column_index_(0) + reassembles_failed_info_(this), + udp_rcvbuf_errors_info_(this), + udp_sndbuf_errors_info_(this) { if (monitor_program_.empty()) { monitor_program_ = "*"; @@ -68,6 +67,7 @@ NetMonitor::NetMonitor(const rclcpp::NodeOptions & options) updater_.add("Network Traffic", this, &NetMonitor::monitor_traffic); updater_.add("Network CRC Error", this, &NetMonitor::check_crc_error); updater_.add("IP Packet Reassembles Failed", this, &NetMonitor::check_reassembles_failed); + updater_.add("UDP Buf Errors", this, &NetMonitor::check_udp_buf_errors); nl80211_.init(); @@ -83,8 +83,21 @@ NetMonitor::NetMonitor(const rclcpp::NodeOptions & options) using namespace std::literals::chrono_literals; timer_ = rclcpp::create_timer(this, get_clock(), 1s, std::bind(&NetMonitor::on_timer, this)); - // Get column index of IP packet reassembles failed from `/proc/net/snmp` - get_reassembles_failed_column_index(); + // Initialize information for `/proc/net/snmp` + int reassembles_failed_check_duration = + declare_parameter("reassembles_failed_check_duration", 1); + int reassembles_failed_check_count = declare_parameter("reassembles_failed_check_count", 1); + int udp_buf_errors_check_duration = declare_parameter("udp_buf_errors_check_duration", 1); + int udp_buf_errors_check_count = declare_parameter("udp_buf_errors_check_count", 1); + reassembles_failed_info_.set_check_parameters( + reassembles_failed_check_duration, reassembles_failed_check_count); + udp_rcvbuf_errors_info_.set_check_parameters( + udp_buf_errors_check_duration, udp_buf_errors_check_count); + udp_sndbuf_errors_info_.set_check_parameters( + udp_buf_errors_check_duration, udp_buf_errors_check_count); + reassembles_failed_info_.find_index("Ip:", "ReasmFails"); + udp_rcvbuf_errors_info_.find_index("Udp:", "RcvbufErrors"); + udp_sndbuf_errors_info_.find_index("Udp:", "SndbufErrors"); // Send request to start nethogs if (enable_traffic_monitor_) { @@ -292,41 +305,66 @@ void NetMonitor::check_reassembles_failed(diagnostic_updater::DiagnosticStatusWr // Remember start time to measure elapsed time const auto t_start = SystemMonitorUtility::startMeasurement(); - int whole_level = DiagStatus::OK; - std::string error_message; uint64_t total_reassembles_failed = 0; + uint64_t unit_reassembles_failed = 0; + NetSnmp::Result ret = + reassembles_failed_info_.check_metrics(total_reassembles_failed, unit_reassembles_failed); + status.add("total packet reassembles failed", total_reassembles_failed); + status.add("packet reassembles failed per unit time", unit_reassembles_failed); - if (get_reassembles_failed(total_reassembles_failed)) { - reassembles_failed_queue_.push_back(total_reassembles_failed - last_reassembles_failed_); - while (reassembles_failed_queue_.size() > reassembles_failed_check_duration_) { - reassembles_failed_queue_.pop_front(); - } + int whole_level = DiagStatus::OK; + std::string error_message = "OK"; + switch (ret) { + case NetSnmp::Result::OK: + default: + break; + case NetSnmp::Result::CHECK_WARNING: + whole_level = DiagStatus::WARN; + error_message = "reassembles failed"; + break; + case NetSnmp::Result::READ_ERROR: + whole_level = DiagStatus::ERROR; + error_message = "failed to read /proc/net/snmp"; + break; + } - uint64_t unit_reassembles_failed = 0; - for (auto reassembles_failed : reassembles_failed_queue_) { - unit_reassembles_failed += reassembles_failed; - } + status.summary(whole_level, error_message); - status.add(fmt::format("total packet reassembles failed"), total_reassembles_failed); - status.add(fmt::format("packet reassembles failed per unit time"), unit_reassembles_failed); + // Measure elapsed time since start time and report + SystemMonitorUtility::stopMeasurement(t_start, status); +} - if (unit_reassembles_failed >= reassembles_failed_check_count_) { - whole_level = std::max(whole_level, static_cast(DiagStatus::WARN)); - error_message = "reassembles failed"; - } +void NetMonitor::check_udp_buf_errors(diagnostic_updater::DiagnosticStatusWrapper & status) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); - last_reassembles_failed_ = total_reassembles_failed; - } else { - reassembles_failed_queue_.push_back(0); - whole_level = std::max(whole_level, static_cast(DiagStatus::ERROR)); + uint64_t total_udp_rcvbuf_errors = 0; + uint64_t unit_udp_rcvbuf_errors = 0; + NetSnmp::Result ret_rcv = + udp_rcvbuf_errors_info_.check_metrics(total_udp_rcvbuf_errors, unit_udp_rcvbuf_errors); + status.add("total UDP rcv buf errors", total_udp_rcvbuf_errors); + status.add("UDP rcv buf errors per unit time", unit_udp_rcvbuf_errors); + + uint64_t total_udp_sndbuf_errors = 0; + uint64_t unit_udp_sndbuf_errors = 0; + NetSnmp::Result ret_snd = + udp_sndbuf_errors_info_.check_metrics(total_udp_sndbuf_errors, unit_udp_sndbuf_errors); + status.add("total UDP snd buf errors", total_udp_sndbuf_errors); + status.add("UDP snd buf errors per unit time", unit_udp_sndbuf_errors); + + int whole_level = DiagStatus::OK; + std::string error_message = "OK"; + if (ret_rcv == NetSnmp::Result::READ_ERROR || ret_snd == NetSnmp::Result::READ_ERROR) { + whole_level = DiagStatus::ERROR; error_message = "failed to read /proc/net/snmp"; + } else if ( + ret_rcv == NetSnmp::Result::CHECK_WARNING || ret_snd == NetSnmp::Result::CHECK_WARNING) { + whole_level = DiagStatus::WARN; + error_message = "UDP buf errors"; } - if (!error_message.empty()) { - status.summary(whole_level, error_message); - } else { - status.summary(whole_level, "OK"); - } + status.summary(whole_level, error_message); // Measure elapsed time since start time and report SystemMonitorUtility::stopMeasurement(t_start, status); @@ -544,90 +582,6 @@ void NetMonitor::update_crc_error(NetworkInfomation & network, const struct rtnl crc_errors.last_rx_crc_errors = stats->rx_crc_errors; } -void NetMonitor::get_reassembles_failed_column_index() -{ - std::ifstream ifs("/proc/net/snmp"); - if (!ifs) { - RCLCPP_WARN(get_logger(), "Failed to open /proc/net/snmp."); - return; - } - - // Find column index of 'ReasmFails' - std::string line; - if (!std::getline(ifs, line)) { - RCLCPP_WARN(get_logger(), "Failed to get header of /proc/net/snmp."); - return; - } - - // /proc/net/snmp - // Ip: Forwarding DefaultTTL InReceives ... ReasmTimeout ReasmReqds ReasmOKs ReasmFails ... - // Ip: 2 64 5636471397 ... 135 2303339 216166 270 .. - std::vector header_list; - boost::split(header_list, line, boost::is_space()); - - if (header_list.empty()) { - RCLCPP_WARN(get_logger(), "Failed to get header list of /proc/net/snmp."); - return; - } - if (header_list[0] != "Ip:") { - RCLCPP_WARN( - get_logger(), "Header column is invalid in /proc/net/snmp. %s", header_list[0].c_str()); - return; - } - - int index = 0; - for (const auto & header : header_list) { - if (header == "ReasmFails") { - reassembles_failed_column_index_ = index; - break; - } - ++index; - } -} - -bool NetMonitor::get_reassembles_failed(uint64_t & reassembles_failed) -{ - if (reassembles_failed_column_index_ == 0) { - RCLCPP_WARN(get_logger(), "Column index is invalid. : %d", reassembles_failed_column_index_); - return false; - } - - std::ifstream ifs("/proc/net/snmp"); - if (!ifs) { - RCLCPP_WARN(get_logger(), "Failed to open /proc/net/snmp."); - return false; - } - - std::string line; - - // Skip header row - if (!std::getline(ifs, line)) { - RCLCPP_WARN(get_logger(), "Failed to get header of /proc/net/snmp."); - return false; - } - - // Find a value of 'ReasmFails' - if (!std::getline(ifs, line)) { - RCLCPP_WARN(get_logger(), "Failed to get a line of /proc/net/snmp."); - return false; - } - - std::vector value_list; - boost::split(value_list, line, boost::is_space()); - - if (reassembles_failed_column_index_ >= value_list.size()) { - RCLCPP_WARN( - get_logger(), - "There are not enough columns for reassembles failed column index. : columns=%d index=%d", - static_cast(value_list.size()), reassembles_failed_column_index_); - return false; - } - - reassembles_failed = std::stoull(value_list[reassembles_failed_column_index_]); - - return true; -} - void NetMonitor::send_start_nethogs_request() { // Connect to boot/shutdown service @@ -701,7 +655,7 @@ bool NetMonitor::connect_service() socket_->connect(endpoint, error_code); if (error_code) { - RCLCPP_ERROR(get_logger(), "Failed to connect socket. %s", error_code.message().c_str()); + RCLCPP_ERROR_ONCE(get_logger(), "Failed to connect socket. %s", error_code.message().c_str()); return false; } @@ -782,5 +736,145 @@ void NetMonitor::close_connection() socket_->close(); } +NetSnmp::NetSnmp(rclcpp::Node * node) +: logger_(node->get_logger().get_child("net_snmp")), + check_duration_(1), + check_count_(1), + index_row_(0), + index_col_(0), + current_value_(0), + last_value_(0), + value_per_unit_time_(0), + queue_() +{ +} + +void NetSnmp::set_check_parameters(unsigned int check_duration, unsigned int check_count) +{ + check_duration_ = check_duration; + check_count_ = check_count; +} + +void NetSnmp::find_index(const std::string & protocol, const std::string & metrics) +{ + // /proc/net/snmp + // Ip: Forwarding DefaultTTL InReceives ... ReasmTimeout ReasmReqds ReasmOKs ReasmFails ... + // Ip: 2 64 5636471397 ... 135 2303339 216166 270 .. + std::ifstream ifs("/proc/net/snmp"); + if (!ifs) { + RCLCPP_WARN(logger_, "Failed to open /proc/net/snmp."); + index_row_ = index_col_ = 0; + return; + } + + std::vector target_header_list; + std::string line; + while (std::getline(ifs, line)) { + std::vector header_list; + boost::split(header_list, line, boost::is_space()); + if (header_list.empty()) continue; + if (header_list[0] == protocol) { + target_header_list = header_list; + break; + } + ++index_row_; + } + + ++index_row_; // The values are placed in the row following the header + + for (const auto & header : target_header_list) { + if (header == metrics) { + return; + } + ++index_col_; + } + + RCLCPP_WARN(logger_, "Failed to get header of /proc/net/snmp."); + index_row_ = index_col_ = 0; + return; +} + +NetSnmp::Result NetSnmp::check_metrics(uint64_t & current_value, uint64_t & value_per_unit_time) +{ + if (!read_value_from_proc(index_row_, index_col_, current_value_)) { + queue_.push_back(0); + current_value = value_per_unit_time = 0; + return Result::READ_ERROR; + } + + if (queue_.empty()) { + last_value_ = current_value_; + } + queue_.push_back(current_value_ - last_value_); + last_value_ = current_value_; + while (queue_.size() > check_duration_) { + queue_.pop_front(); + } + + value_per_unit_time_ = std::accumulate(queue_.begin(), queue_.end(), static_cast(0)); + + current_value = current_value_; + value_per_unit_time = value_per_unit_time_; + + if (value_per_unit_time_ >= check_count_) { + return Result::CHECK_WARNING; + } else { + return Result::OK; + } +} + +bool NetSnmp::read_value_from_proc( + unsigned int index_row, unsigned int index_col, uint64_t & output_value) +{ + if (index_row == 0 && index_col == 0) { + RCLCPP_WARN_ONCE(logger_, "index is invalid. : %u, %u", index_row, index_col); + return false; + } + + std::ifstream ifs("/proc/net/snmp"); + if (!ifs) { + RCLCPP_WARN_ONCE(logger_, "Failed to open /proc/net/snmp."); + return false; + } + + std::string target_line; + std::string line; + for (unsigned int row_index = 0; std::getline(ifs, line); ++row_index) { + if (row_index == index_row) { + target_line = line; + break; + } + } + + if (target_line.empty()) { + RCLCPP_WARN_ONCE(logger_, "Failed to get a line of /proc/net/snmp."); + return false; + } + + std::vector value_list; + boost::split(value_list, target_line, boost::is_space()); + if (index_col >= value_list.size()) { + RCLCPP_WARN_ONCE( + logger_, "There are not enough columns for the column index. : column size=%lu index=%u, %u", + value_list.size(), index_row, index_col); + return false; + } + + std::string value_str = value_list[index_col]; + if (value_str.empty()) { + RCLCPP_WARN_ONCE(logger_, "The value is empty. : index=%u, %u", index_row, index_col); + return false; + } + + if (value_str[0] == '-') { + RCLCPP_WARN_ONCE(logger_, "The value is minus. : %s", value_str.c_str()); + output_value = 0; + return false; + } else { + output_value = std::stoull(value_str); + return true; + } +} + #include RCLCPP_COMPONENTS_REGISTER_NODE(NetMonitor)