diff --git a/diagnostic_common_diagnostics/CMakeLists.txt b/diagnostic_common_diagnostics/CMakeLists.txt index 65720a08e..43202b9e0 100644 --- a/diagnostic_common_diagnostics/CMakeLists.txt +++ b/diagnostic_common_diagnostics/CMakeLists.txt @@ -8,6 +8,7 @@ find_package(ament_cmake_python REQUIRED) ament_python_install_package(${PROJECT_NAME}) install(PROGRAMS + ${PROJECT_NAME}/cpu_monitor.py ${PROJECT_NAME}/ntp_monitor.py DESTINATION lib/${PROJECT_NAME} ) @@ -21,6 +22,10 @@ if(BUILD_TESTING) test_ntp_monitor test/systemtest/test_ntp_monitor.py TIMEOUT 10) + ament_add_pytest_test( + test_cpu_monitor + test/systemtest/test_cpu_monitor.py + TIMEOUT 10) endif() -ament_package() \ No newline at end of file +ament_package() diff --git a/diagnostic_common_diagnostics/README.md b/diagnostic_common_diagnostics/README.md index 452d163ed..95e416277 100644 --- a/diagnostic_common_diagnostics/README.md +++ b/diagnostic_common_diagnostics/README.md @@ -8,19 +8,28 @@ Currently only the NTP monitor is ported to ROS2. # Nodes ## ntp_monitor.py -Runs 'ntpdate' to check if the system clock is synchronized with the NTP server. +Runs 'ntpdate' to check if the system clock is synchronized with the NTP server. * If the offset is smaller than `offset-tolerance`, an `OK` status will be published. * If the offset is larger than the configured `offset-tolerance`, a `WARN` status will be published, * if it is bigger than `error-offset-tolerance`, an `ERROR` status will be published. * If there was an error running `ntpdate`, an `ERROR` status will be published. +## cpu_monitor.py +The `cpu_monitor` module allows users to monitor the CPU usage of their system in real-time. +It publishes the usage percentage in a diagnostic message. + +* Name of the node is "cpu_monitor_" + hostname. +* Uses the following args: + * warning_percentage: If the CPU usage is > warning_percentage, a WARN status will be publised. + * window: the maximum length of the used collections.deque for queuing CPU readings. + ### Published Topics #### /diagnostics diagnostic_msgs/DiagnosticArray The diagnostics information. ### Parameters -#### ntp_hostname +#### ntp_hostname (default: "pool.ntp.org") Hostname of NTP server. @@ -46,9 +55,6 @@ Disable self test. ## hd_monitor.py **To be ported** -## cpu_monitor.py -**To be ported** - ## ram_monitor.py **To be ported** @@ -56,4 +62,4 @@ Disable self test. **To be ported** ## tf_monitor.py -**To be ported** \ No newline at end of file +**To be ported** diff --git a/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py new file mode 100755 index 000000000..bab6d52a3 --- /dev/null +++ b/diagnostic_common_diagnostics/diagnostic_common_diagnostics/cpu_monitor.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Software License Agreement (BSD License) +# +# Copyright (c) 2017, TNO IVS, Helmond, Netherlands +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the TNO IVS nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +# \author Rein Appeldoorn + +import collections +import socket +import traceback + +from diagnostic_msgs.msg import DiagnosticStatus + +from diagnostic_updater import DiagnosticTask, Updater + +import psutil + +import rclpy +from rclpy.node import Node + + +class CpuTask(DiagnosticTask): + + def __init__(self, warning_percentage=90, window=1): + DiagnosticTask.__init__(self, 'CPU Information') + + self._warning_percentage = int(warning_percentage) + self._readings = collections.deque(maxlen=window) + + def _get_average_reading(self): + def avg(lst): + return float(sum(lst)) / len(lst) if lst else float('nan') + + return [avg(cpu_percentages) + for cpu_percentages in zip(*self._readings)] + + def run(self, stat): + self._readings.append(psutil.cpu_percent(percpu=True)) + cpu_percentages = self._get_average_reading() + cpu_average = sum(cpu_percentages) / len(cpu_percentages) + + stat.add('CPU Load Average', '{:.2f}'.format(cpu_average)) + + warn = False + for idx, cpu_percentage in enumerate(cpu_percentages): + stat.add('CPU {} Load'.format(idx), '{:.2f}'.format(cpu_percentage)) + if cpu_percentage > self._warning_percentage: + warn = True + + if warn: + stat.summary(DiagnosticStatus.WARN, + 'At least one CPU exceeds {} percent'.format(self._warning_percentage)) + else: + stat.summary(DiagnosticStatus.OK, + 'CPU Average {:.2f} percent'.format(cpu_average)) + + return stat + + +def main(args=None): + rclpy.init(args=args) + + # Create the node + hostname = socket.gethostname() + node = Node('cpu_monitor_%s' % hostname.replace('-', '_')) + + # Declare and get parameters + node.declare_parameter('warning_percentage', 90) + node.declare_parameter('window', 1) + + warning_percentage = node.get_parameter( + 'warning_percentage').get_parameter_value().integer_value + window = node.get_parameter('window').get_parameter_value().integer_value + + # Create diagnostic updater with default updater rate of 1 hz + updater = Updater(node) + updater.setHardwareID(hostname) + updater.add(CpuTask(warning_percentage=warning_percentage, window=window)) + + rclpy.spin(node) + + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt: + pass + except Exception: + traceback.print_exc() diff --git a/diagnostic_common_diagnostics/mainpage.dox b/diagnostic_common_diagnostics/mainpage.dox index 25ee9c863..f16635ce2 100644 --- a/diagnostic_common_diagnostics/mainpage.dox +++ b/diagnostic_common_diagnostics/mainpage.dox @@ -5,6 +5,7 @@ \b diagnostic_common_diagnostics contains a few common diagnostic nodes - ntp_monitor publishes diagnostic messages for how well the NTP time sync is working. +- cpu_monitor publishes diagnostic messages with the CPU usage of the system. - tf_monitor used to publish diagnostic messages reporting on the health of the TF tree. It is based on tfwtf. It is not ported to ROS2. diff --git a/diagnostic_common_diagnostics/package.xml b/diagnostic_common_diagnostics/package.xml index 9d50bbe33..78d837930 100644 --- a/diagnostic_common_diagnostics/package.xml +++ b/diagnostic_common_diagnostics/package.xml @@ -18,21 +18,22 @@ ament_cmake ament_cmake_python - rclpy diagnostic_updater python3-ntplib + python3-psutil + rclpy + ament_cmake_lint_cmake + ament_cmake_pytest + ament_cmake_xmllint ament_lint_auto - ament_cmake_xmllint - ament_cmake_lint_cmake - - ament_cmake_pytest + ament_cmake diff --git a/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py b/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py new file mode 100644 index 000000000..6e733692c --- /dev/null +++ b/diagnostic_common_diagnostics/test/systemtest/test_cpu_monitor.py @@ -0,0 +1,123 @@ +# -*- coding: utf-8 -*- +# Software License Agreement (BSD License) +# +# Copyright (c) 2023, Robert Bosch GmbH +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +import time +import unittest + +from diagnostic_common_diagnostics.cpu_monitor import CpuTask + +from diagnostic_msgs.msg import DiagnosticStatus + +from diagnostic_updater import DiagnosticArray, Updater +from diagnostic_updater import DiagnosticStatusWrapper + +import rclpy +from rclpy.node import Node + + +class TestCPUMonitor(unittest.TestCase): + + @classmethod + def setUpClass(cls): + rclpy.init(args=None) + + @classmethod + def tearDownClass(cls): + if rclpy.ok(): + rclpy.shutdown() + + def diagnostics_callback(self, msg): + self.message_recieved = True + self.assertEqual(len(msg.status), 1) + + def test_ok(self): + # In this case is recommended for accuracy that psutil.cpu_percent() + # function be called with at least 0.1 seconds between calls. + time.sleep(0.1) + + warning_percentage = 100 + task = CpuTask(warning_percentage) + stat = DiagnosticStatusWrapper() + task.run(stat) + self.assertEqual(task.name, 'CPU Information') + self.assertEqual(stat.level, DiagnosticStatus.OK) + self.assertIn(str('CPU Average'), stat.message) + + # Check for at least 1 CPU Load Average and 1 CPU Load + self.assertGreaterEqual(len(stat.values), 2) + + def test_warn(self): + # In this case is recommended for accuracy that psutil.cpu_percent() + # function be called with at least 0.1 seconds between calls. + time.sleep(0.1) + + warning_percentage = -1 + task = CpuTask(warning_percentage) + stat = DiagnosticStatusWrapper() + task.run(stat) + print(f'Raw readings: {task._readings}') + self.assertEqual(task.name, 'CPU Information') + self.assertEqual(stat.level, DiagnosticStatus.WARN) + self.assertIn(str('At least one CPU exceeds'), stat.message) + + # Check for at least 1 CPU Load Average and 1 CPU Load + self.assertGreaterEqual(len(stat.values), 2) + + def test_updater(self): + # In this case is recommended for accuracy that psutil.cpu_percent() + # function be called with at least 0.1 seconds between calls. + time.sleep(0.1) + + self.message_recieved = False + + node = Node('cpu_monitor_test') + updater = Updater(node) + updater.setHardwareID('test_id') + updater.add(CpuTask()) + + node.create_subscription( + DiagnosticArray, '/diagnostics', self.diagnostics_callback, 10) + + start_time = time.time() + timeout = 5.0 # Timeout in seconds + + while not self.message_recieved: + rclpy.spin_once(node) + time.sleep(0.1) + elapsed_time = time.time() - start_time + if elapsed_time >= timeout: + self.fail('No diagnostics received') + + +if __name__ == '__main__': + unittest.main()