From e080940fd87456b32e1d1ffe5e33db83ae69d44d Mon Sep 17 00:00:00 2001 From: Cong Hou <97947969+congh-nvidia@users.noreply.github.com> Date: Sat, 24 Aug 2024 04:53:25 +0800 Subject: [PATCH] [BGP Memory Leak] Add FRR memory checker in the continuous link flap test (#13066) * Add bgp memory checker in the continuous link flap test --- .../link_flap/test_cont_link_flap.py | 54 +++++++++++++++++-- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/tests/platform_tests/link_flap/test_cont_link_flap.py b/tests/platform_tests/link_flap/test_cont_link_flap.py index ea7a2abfa20..af38c0ad81f 100644 --- a/tests/platform_tests/link_flap/test_cont_link_flap.py +++ b/tests/platform_tests/link_flap/test_cont_link_flap.py @@ -8,6 +8,8 @@ import logging import pytest +import time +import math from tests.common.helpers.assertions import pytest_assert, pytest_require from tests.common import port_toggle @@ -28,6 +30,13 @@ class TestContLinkFlap(object): TestContLinkFlap class for continuous link flap """ + def get_frr_daemon_memory_usage(self, duthost, daemon): + frr_daemon_memory_output = duthost.shell(f'vtysh -c "show memory {daemon}"')["stdout"] + logging.info(f"{daemon} memory status: \n%s", frr_daemon_memory_output) + frr_daemon_memory = duthost.shell( + f'vtysh -c "show memory {daemon}" | grep "Used ordinary blocks"')["stdout"].split()[-2] + return frr_daemon_memory + def test_cont_link_flap(self, request, duthosts, nbrhosts, enum_rand_one_per_hwsku_frontend_hostname, fanouthosts, bring_up_dut_interfaces, tbinfo): """ @@ -38,10 +47,10 @@ def test_cont_link_flap(self, request, duthosts, nbrhosts, enum_rand_one_per_hws to cause BGP Flaps. 2.) Flap all interfaces on peer (FanOutLeaf) one by one 1-3 iteration to cause BGP Flaps. - 3.) Watch for memory (show system-memory) ,orchagent CPU Utilization - and Redis_memory. + 3.) Watch for memory (show system-memory), FRR daemons memory(vtysh -c "show memory bgp/zebra"), + orchagent CPU Utilization and Redis_memory. 
- Pass Criteria: All routes must be re-learned with < 5% increase in Redis and + Pass Criteria: All routes must be re-learned with < 5% increase in Redis/FRR memory usage and ORCH agent CPU consumption below threshold after 3 mins after stopping flaps. """ duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname] @@ -69,6 +78,13 @@ def test_cont_link_flap(self, request, duthosts, nbrhosts, enum_rand_one_per_hws logging.info("IPv4 routes: start {}, summary {}".format(start_time_ipv4_route_counts, sumv4)) logging.info("IPv6 routes: start {}, summary {}".format(start_time_ipv6_route_counts, sumv6)) + # Record FRR daemons memory status at start + frr_demons_to_check = ['bgpd', 'zebra'] + start_time_frr_daemon_memory = {} + for daemon in frr_demons_to_check: + start_time_frr_daemon_memory[daemon] = self.get_frr_daemon_memory_usage(duthost, daemon) + logging.info(f"{daemon} memory usage at start: \n%s", start_time_frr_daemon_memory[daemon]) + # Make Sure Orch CPU < orch_cpu_threshold before starting test. 
logging.info("Make Sure orchagent CPU utilization is less that %d before link flap", orch_cpu_threshold) pytest_assert(wait_until(100, 2, 0, check_orch_cpu_utilization, duthost, orch_cpu_threshold), @@ -124,10 +140,42 @@ def test_cont_link_flap(self, request, duthosts, nbrhosts, enum_rand_one_per_hws pytest.fail(str(failmsg)) + # Wait 30s for the memory usage to be stable + time.sleep(30) + # Record memory status at end memory_output = duthost.shell("show system-memory")["stdout"] logging.info("Memory Status at end: %s", memory_output) + # Check the FRR daemons memory usage at end + end_time_frr_daemon_memory = {} + incr_frr_daemon_memory_threshold = {} + for daemon in frr_demons_to_check: + incr_frr_daemon_memory_threshold[daemon] = 10 if tbinfo["topo"]["type"] in ["m0", "mx"] else 5 + min_threshold_percent = 1 / float(start_time_frr_daemon_memory[daemon]) * 100 + if min_threshold_percent > incr_frr_daemon_memory_threshold[daemon]: + incr_frr_daemon_memory_threshold[daemon] = math.ceil(min_threshold_percent) + logging.info(f"The memory increment threshold for frr daemon {daemon} " + f"is {incr_frr_daemon_memory_threshold[daemon]}%") + for daemon in frr_demons_to_check: + # Record FRR daemon memory status at end + end_time_frr_daemon_memory[daemon] = self.get_frr_daemon_memory_usage(duthost, daemon) + logging.info(f"{daemon} memory usage at end: \n%s", end_time_frr_daemon_memory[daemon]) + + # Calculate diff in FRR daemon memory + incr_frr_daemon_memory = \ + float(end_time_frr_daemon_memory[daemon]) - float(start_time_frr_daemon_memory[daemon]) + logging.info(f"{daemon} absolute difference: %d", incr_frr_daemon_memory) + + # Check FRR daemon memory only if it is increased else default to pass + if incr_frr_daemon_memory > 0: + percent_incr_frr_daemon_memory = \ + (incr_frr_daemon_memory / float(start_time_frr_daemon_memory[daemon])) * 100 + logging.info(f"{daemon} memory percentage increase: %d", percent_incr_frr_daemon_memory) + 
pytest_assert(percent_incr_frr_daemon_memory < incr_frr_daemon_memory_threshold[daemon],
+                          f"{daemon} memory increased by {percent_incr_frr_daemon_memory}%, "
+                          f"exceeding the threshold of {incr_frr_daemon_memory_threshold[daemon]}%")
+
     # Record orchagent CPU utilization at end
     orch_cpu = duthost.shell(
         "COLUMNS=512 show processes cpu | grep orchagent | awk '{print $1, $9}'")["stdout_lines"]