Added multi-hop SONiC upgrade path test case (sonic-net#14563)

* Added multi-hop test case
* Added consistency checker to multi-hop test case
* Fixed a bug where some logs would be missing in multi-hop test. The following log files were missing:
  - capture.pcap
  - capture_filtered.pcap
  - warm-reboot-report.json
  - warm-reboot.log
  This didn't cause the test to fail; they simply weren't being captured. This change makes it so that they are captured.
* Renamed 'set_base_image_a' to be more descriptive
congh-nvidia · Dec 10, 2024 · 6bf773b · 6bf773b
1 parent 882fb10
commit 6bf773b
Show file tree

Hide file tree

Showing 9 changed files with 315 additions and 71 deletions.
diff --git a/tests/common/fixtures/advanced_reboot.py b/tests/common/fixtures/advanced_reboot.py
@@ -421,12 +421,14 @@ def __clearArpAndFdbTables(self):
         logger.info('Clearing all fdb entries on DUT  {}'.format(self.duthost.hostname))
         self.duthost.shell('sonic-clear fdb all')
 
-    def __fetchTestLogs(self, rebootOper=None):
+    def __fetchTestLogs(self, rebootOper=None, log_dst_suffix=None):
         """
-        Fetch test logs from duthost and ptfhost after individual test run
+        Fetch test logs from duthost and ptfhost.
+        @param rebootOper: if provided it will be added to each individual file name
+        @param log_dst_suffix: if provided it will be appended to the directory name
         """
-        if rebootOper:
-            dir_name = "{}_{}".format(self.request.node.name, rebootOper)
+        if log_dst_suffix:
+            dir_name = "{}_{}".format(self.request.node.name, log_dst_suffix)
         else:
             dir_name = self.request.node.name
         report_file_dir = os.path.realpath((os.path.join(os.path.dirname(__file__), "../../logs/platform_tests/")))
@@ -596,7 +598,7 @@ def runRebootTest(self):
                 if self.postboot_setup:
                     self.postboot_setup()
                 # capture the test logs, and print all of them in case of failure, or a summary in case of success
-                log_dir = self.__fetchTestLogs(rebootOper)
+                log_dir = self.__fetchTestLogs(rebootOper, log_dst_suffix=rebootOper)
                 self.print_test_logs_summary(log_dir)
                 if self.advanceboot_loganalyzer and post_reboot_analysis:
                     verification_errors = post_reboot_analysis(marker, event_counters=event_counters,
@@ -630,6 +632,88 @@ def runRebootTestcase(self, prebootList=None, inbootList=None, prebootFiles='pee
         self.imageInstall(prebootList, inbootList, prebootFiles)
         return self.runRebootTest()
 
+    def runMultiHopRebootTestcase(self, upgrade_path_urls, prebootFiles='peer_dev_info,neigh_port_info',
+                                  base_image_setup=None, pre_hop_setup=None,
+                                  post_hop_teardown=None, multihop_advanceboot_loganalyzer_factory=None):
+        """
+        Prepare the test bed and run the advanced-reboot test once per hop of a multi-hop upgrade path.
+
+        The base image is installed first; then one reboot test runs for every entry of upgrade_path_urls
+        after the first. Logs are fetched and clean-up runs after every hop, and the method asserts as soon
+        as a hop has recorded a failure, so later hops are not attempted.
+
+        @param upgrade_path_urls: ordered images of the upgrade path; index 0 is the base image and every
+                                  following entry is the target of one hop (used here for the hop count
+                                  and for the failure message)
+        @param prebootFiles: preboot files, passed on to imageInstall
+        @param base_image_setup: optional callable, called without arguments after the base image install
+        @param pre_hop_setup: optional callable, called with the 1-based hop index before each hop
+                              (presumably where the hop's target image is installed - confirm with caller)
+        @param post_hop_teardown: optional callable, called with the hop index when the hop ran without raising
+        @param multihop_advanceboot_loganalyzer_factory: optional callable taking the hop index and returning
+                                                         a (pre_reboot_analysis, post_reboot_analysis) pair
+        @return: True when no hop recorded a failure; otherwise pytest_assert reports the failing hop
+        """
+        # Install image A (base image)
+        self.imageInstall(None, None, prebootFiles)
+        if base_image_setup:
+            base_image_setup()
+
+        test_results = dict()
+        test_case_name = str(self.request.node.name)
+        test_results[test_case_name] = list()
+        for hop_index in range(1, len(upgrade_path_urls)):
+            # Reset the per-hop state before entering the try block, so the finally block never reads
+            # unbound names (failure on the first hop) or values left over from the previous hop
+            marker = None
+            event_counters = None
+            post_reboot_analysis = None
+            try:
+                if pre_hop_setup:
+                    pre_hop_setup(hop_index)
+                if multihop_advanceboot_loganalyzer_factory:
+                    pre_reboot_analysis, hop_post_analysis = multihop_advanceboot_loganalyzer_factory(hop_index)
+                    marker = pre_reboot_analysis()
+                    # Arm the post-reboot analysis only once its start marker exists
+                    post_reboot_analysis = hop_post_analysis
+                event_counters = self.__setupRebootOper(None)
+
+                # Run the upgrade
+                thread = InterruptableThread(
+                    target=self.__runPtfRunner,
+                    kwargs={"ptf_collect_dir": "./logs/ptf_collect/hop{}/".format(hop_index)})
+                thread.daemon = True
+                thread.start()
+                # give the test REBOOT_CASE_TIMEOUT (1800s) to complete the reboot with IO,
+                # and then additional 300s to examine the pcap, logs and generate reports
+                ptf_timeout = REBOOT_CASE_TIMEOUT + 300
+                thread.join(timeout=ptf_timeout, suppress_exception=True)
+                self.ptfhost.shell("pkill -f 'ptftests advanced-reboot.ReloadTest'", module_ignore_errors=True)
+                # the thread might still be running, and to catch any exceptions after pkill allow 10s to join
+                thread.join(timeout=10)
+
+                self.__verifyRebootOper(None)
+                if self.duthost.num_asics() == 1 and not check_bgp_router_id(self.duthost, self.mgFacts):
+                    test_results[test_case_name].append("Failed to verify BGP router identifier is Loopback0 on %s" %
+                                                        self.duthost.hostname)
+                if post_hop_teardown:
+                    post_hop_teardown(hop_index)
+            except Exception:
+                # Record the failure instead of propagating it, so that logs are still collected and the
+                # clean-up below still runs; the assertion after the finally block reports it
+                traceback_msg = traceback.format_exc()
+                err_msg = "Exception caught while running advanced-reboot test on ptf: \n{}".format(traceback_msg)
+                logger.error(err_msg)
+                test_results[test_case_name].append(err_msg)
+            finally:
+                # capture the test logs, and print all of them in case of failure, or a summary in case of success
+                log_dir = self.__fetchTestLogs(log_dst_suffix="hop{}".format(hop_index))
+                self.print_test_logs_summary(log_dir)
+                if post_reboot_analysis:
+                    verification_errors = post_reboot_analysis(marker, event_counters=event_counters, log_dir=log_dir)
+                    if verification_errors:
+                        logger.error("Post reboot verification failed. List of failures: {}"
+                                     .format('\n'.join(verification_errors)))
+                        test_results[test_case_name].extend(verification_errors)
+                self.acl_manager_checker(test_results[test_case_name])
+                self.__clearArpAndFdbTables()
+                self.__revertRebootOper(None)
+
+            # Stop at the first hop that recorded a failure
+            failed_list = [(testcase, failures) for testcase, failures in test_results.items() if failures]
+            pytest_assert(len(failed_list) == 0, "Advanced-reboot failure. Failed multi-hop test {testname} "
+                                                 "on update {hop_index} from {from_image} to {to_image}, "
+                                                 "failure summary:\n{fail_summary}".format(
+                                                    testname=self.request.node.name,
+                                                    hop_index=hop_index,
+                                                    from_image=upgrade_path_urls[hop_index-1],
+                                                    to_image=upgrade_path_urls[hop_index],
+                                                    fail_summary=failed_list
+                                                ))
+        return True  # Success
+
     def __setupRebootOper(self, rebootOper):
         if self.dual_tor_mode:
             for device in self.duthosts:
@@ -694,10 +778,11 @@ def __revertRebootOper(self, rebootOper):
             logger.info('Running revert handler for reboot operation {}'.format(rebootOper))
             rebootOper.revert()
 
-    def __runPtfRunner(self, rebootOper=None):
+    def __runPtfRunner(self, rebootOper=None, ptf_collect_dir="./logs/ptf_collect/"):
         """
         Run single PTF advanced-reboot.ReloadTest
         @param rebootOper:Reboot operation to conduct before/during reboot process
+        @param ptf_collect_dir: PTF log collection directory
         """
         logger.info("Running PTF runner on PTF host: {0}".format(self.ptfhost))
 
@@ -775,6 +860,7 @@ def __runPtfRunner(self, rebootOper=None):
             platform="remote",
             params=params,
             log_file='/tmp/advanced-reboot.ReloadTest.log',
+            ptf_collect_dir=ptf_collect_dir,
             module_ignore_errors=self.moduleIgnoreErrors,
             timeout=REBOOT_CASE_TIMEOUT,
             is_python3=True

diff --git a/tests/common/helpers/upgrade_helpers.py b/tests/common/helpers/upgrade_helpers.py
@@ -221,6 +221,28 @@ def upgrade_test_helper(duthost, localhost, ptfhost, from_image, to_image,
         ptfhost.shell('supervisorctl stop ferret')
 
 
+def multi_hop_warm_upgrade_test_helper(duthost, localhost, ptfhost, tbinfo, get_advanced_reboot, upgrade_type,
+                                       upgrade_path_urls, base_image_setup=None, pre_hop_setup=None,
+                                       post_hop_teardown=None, multihop_advanceboot_loganalyzer_factory=None,
+                                       enable_cpa=False):
+    """
+    Run the multi-hop advanced-reboot upgrade test, optionally with CPA (ferret) enabled.
+
+    The reboot command is derived from upgrade_type. When enable_cpa is set and that command is a
+    warm-reboot, ferret is set up on the PTF host, the PTF address is appended to the command with "-c",
+    and ferret is stopped again once the test is over, whether it passed or raised.
+
+    Args:
+        duthost: DUT host the reboot command is resolved for
+        localhost: not used by this helper
+        ptfhost: PTF host on which ferret is set up and stopped
+        tbinfo: test bed info, passed on to setup_ferret
+        get_advanced_reboot: callable returning the advanced-reboot object for a given rebootType
+        upgrade_type: upgrade type, passed on to get_reboot_command
+        upgrade_path_urls: images of the upgrade path, passed on to runMultiHopRebootTestcase
+        base_image_setup: optional callable, passed on to runMultiHopRebootTestcase
+        pre_hop_setup: optional callable, passed on to runMultiHopRebootTestcase
+        post_hop_teardown: optional callable, passed on to runMultiHopRebootTestcase
+        multihop_advanceboot_loganalyzer_factory: optional callable, passed on to runMultiHopRebootTestcase
+        enable_cpa: when True, a warm-reboot is run with CPA enabled
+    """
+    reboot_type = get_reboot_command(duthost, upgrade_type)
+    # Decide once: appending "-c <ip>" below does not change whether this is a warm-reboot
+    use_cpa = enable_cpa and "warm-reboot" in reboot_type
+    if use_cpa:
+        # always do warm-reboot with CPA enabled
+        setup_ferret(duthost, ptfhost, tbinfo)
+        ptf_ip = ptfhost.host.options['inventory_manager'].get_host(ptfhost.hostname).vars['ansible_host']
+        reboot_type = reboot_type + " -c {}".format(ptf_ip)
+
+    try:
+        advancedReboot = get_advanced_reboot(rebootType=reboot_type)
+        advancedReboot.runMultiHopRebootTestcase(
+            upgrade_path_urls, base_image_setup=base_image_setup, pre_hop_setup=pre_hop_setup,
+            post_hop_teardown=post_hop_teardown,
+            multihop_advanceboot_loganalyzer_factory=multihop_advanceboot_loganalyzer_factory)
+    finally:
+        if use_cpa:
+            # Stop ferret even when the test fails, so it is not left running on the PTF host
+            ptfhost.shell('supervisorctl stop ferret')
+
 def check_asic_and_db_consistency(pytest_config, duthost, consistency_checker_provider):
     if not pytest_config.getoption("enable_consistency_checker"):
         logger.info("Consistency checker is not enabled. Skipping check.")

diff --git a/tests/common/platform/args/advanced_reboot_args.py b/tests/common/platform/args/advanced_reboot_args.py
@@ -135,6 +135,12 @@ def add_advanced_reboot_args(parser):
         help="Specify the target image(s) for upgrade (comma seperated list is allowed)",
         )
 
+    parser.addoption(
+        "--multi_hop_upgrade_path",
+        default="",
+        help="Specify the multi-hop upgrade path as a comma separated list of image URLs to download",
+    )
+
     parser.addoption(
         "--restore_to_image",
         default="",

diff --git a/tests/common/platform/device_utils.py b/tests/common/platform/device_utils.py
@@ -738,18 +738,8 @@ def verify_required_events(duthost, event_counters, timing_data, verification_er
                                            format(observed_start_count, observed_end_count))
 
 
-@pytest.fixture()
-def advanceboot_loganalyzer(duthosts, enum_rand_one_per_hwsku_frontend_hostname, request):
-    """
-    Advance reboot log analysis.
-    This fixture starts log analysis at the beginning of the test. At the end,
-    the collected expect messages are verified and timing of start/stop is calculated.
-
-    Args:
-        duthosts : List of DUT hosts
-        enum_rand_one_per_hwsku_frontend_hostname: hostname of a randomly selected DUT
-    """
-    duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname]
+def advanceboot_loganalyzer_factory(duthost, request, marker_postfix=None):
+    """Create pre-reboot and post-reboot analysis functions via `LogAnalyzer` with optional marker postfix"""
     test_name = request.node.name
     if "upgrade_path" in test_name:
         reboot_type_source = request.config.getoption("--upgrade_type")
@@ -761,18 +751,13 @@ def advanceboot_loganalyzer(duthosts, enum_rand_one_per_hwsku_frontend_hostname,
         reboot_type = "fast"
     else:
         reboot_type = "unknown"
-    # Currently, advanced reboot test would skip for kvm platform if the test has no device_type marker for vs.
-    # Doing the same skip logic in this fixture to avoid running loganalyzer without the test executed
-    if duthost.facts['platform'] == 'x86_64-kvm_x86_64-r0':
-        device_marks = [arg for mark in request.node.iter_markers(
-            name='device_type') for arg in mark.args]
-        if 'vs' not in device_marks:
-            pytest.skip('Testcase not supported for kvm')
     platform = duthost.facts["platform"]
     logs_in_tmpfs = list()
 
+    marker_prefix = "test_advanced_reboot_{}".format(test_name) if not marker_postfix else\
+        "test_advanced_reboot_{}_{}".format(test_name, marker_postfix)
     loganalyzer = LogAnalyzer(
-        ansible_host=duthost, marker_prefix="test_advanced_reboot_{}".format(test_name))
+        ansible_host=duthost, marker_prefix=marker_prefix)
     base_os_version = list()
 
     def bgpd_log_handler(preboot=False):
@@ -926,9 +911,63 @@ def post_reboot_analysis(marker, event_counters=None, reboot_oper=None, log_dir=
                 duthost, event_counters, analyze_result, verification_errors)
         return verification_errors
 
+    return pre_reboot_analysis, post_reboot_analysis
+
+
+@pytest.fixture()
+def advanceboot_loganalyzer(duthosts, enum_rand_one_per_hwsku_frontend_hostname, request):
+    """
+    Advance reboot log analysis.
+    This fixture starts log analysis at the beginning of the test. At the end,
+    the collected expect messages are verified and timing of start/stop is calculated.
+
+    Args:
+        duthosts : List of DUT hosts
+        enum_rand_one_per_hwsku_frontend_hostname: hostname of a randomly selected DUT
+        request: pytest request fixture, used for the device_type markers and passed on to the factory
+
+    Yields:
+        The (pre_reboot_analysis, post_reboot_analysis) pair built by advanceboot_loganalyzer_factory
+        for the selected DUT.
+    """
+    duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname]
+    # Currently, advanced reboot test would skip for kvm platform if the test has no device_type marker for vs.
+    # Doing the same skip logic in this fixture to avoid running loganalyzer without the test executed
+    if duthost.facts['platform'] == 'x86_64-kvm_x86_64-r0':
+        device_marks = [arg for mark in request.node.iter_markers(
+            name='device_type') for arg in mark.args]
+        if 'vs' not in device_marks:
+            pytest.skip('Testcase not supported for kvm')
+
+    pre_reboot_analysis, post_reboot_analysis = advanceboot_loganalyzer_factory(duthost, request)
     yield pre_reboot_analysis, post_reboot_analysis
 
 
+@pytest.fixture()
+def multihop_advanceboot_loganalyzer_factory(duthosts, enum_rand_one_per_hwsku_frontend_hostname, request):
+    """
+    Advance reboot log analysis for tests that reboot over multiple hops.
+    This fixture yields a factory function that takes the hop_index and returns the
+    (pre_reboot_analysis, post_reboot_analysis) pair built by advanceboot_loganalyzer_factory
+    with the marker postfix "hop-<hop_index>".
+
+    Args:
+        duthosts : List of DUT hosts
+        enum_rand_one_per_hwsku_frontend_hostname: hostname of a randomly selected DUT
+        request: pytests request fixture
+    """
+    duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname]
+    # Advanced reboot tests are skipped on the kvm platform unless the test carries a device_type marker for vs.
+    # Apply the same skip here, so that loganalyzer is not run for a test that is not executed
+    if duthost.facts['platform'] == 'x86_64-kvm_x86_64-r0':
+        vs_marked = any('vs' in mark.args for mark in request.node.iter_markers(name='device_type'))
+        if not vs_marked:
+            pytest.skip('Testcase not supported for kvm')
+
+    def _loganalyzer_for_hop(hop_index):
+        hop_postfix = "hop-{}".format(hop_index)
+        return advanceboot_loganalyzer_factory(duthost, request, marker_postfix=hop_postfix)
+
+    yield _loganalyzer_for_hop
+
+
 @pytest.fixture()
 def advanceboot_neighbor_restore(duthosts, enum_rand_one_per_hwsku_frontend_hostname, nbrhosts, tbinfo):
     """

diff --git a/tests/ptf_runner.py b/tests/ptf_runner.py
@@ -12,14 +12,18 @@
 logger = logging.getLogger(__name__)
 
 
-def ptf_collect(host, log_file, skip_pcap=False):
+def ptf_collect(host, log_file, skip_pcap=False, dst_dir='./logs/ptf_collect/'):
+    """
+    Collect PTF log and pcap files from PTF container to sonic-mgmt container.
+    Optionally, save the files to a sub-directory in the destination.
+    """
     pos = log_file.rfind('.')
     filename_prefix = log_file[0:pos] if pos > -1 else log_file
 
     pos = filename_prefix.rfind('/') + 1
     rename_prefix = filename_prefix[pos:] if pos > 0 else filename_prefix
     suffix = str(datetime.utcnow()).replace(' ', '.')
-    filename_log = './logs/ptf_collect/' + rename_prefix + '.' + suffix + '.log'
+    filename_log = dst_dir + rename_prefix + '.' + suffix + '.log'
     host.fetch(src=log_file, dest=filename_log, flat=True, fail_on_missing=False)
     allure.attach.file(filename_log, 'ptf_log: ' + filename_log, allure.attachment_type.TEXT)
     if skip_pcap:
@@ -31,7 +35,7 @@ def ptf_collect(host, log_file, skip_pcap=False):
         compressed_pcap_file = pcap_file + '.tar.gz'
         host.archive(path=pcap_file, dest=compressed_pcap_file, format='gz')
         # Copy compressed file from ptf to sonic-mgmt
-        filename_pcap = './logs/ptf_collect/' + rename_prefix + '.' + suffix + '.pcap.tar.gz'
+        filename_pcap = dst_dir + rename_prefix + '.' + suffix + '.pcap.tar.gz'
         host.fetch(src=compressed_pcap_file, dest=filename_pcap, flat=True, fail_on_missing=False)
         allure.attach.file(filename_pcap, 'ptf_pcap: ' + filename_pcap, allure.attachment_type.PCAP)
 
@@ -101,9 +105,10 @@ def is_py3_compat(test_fpath):
 
 def ptf_runner(host, testdir, testname, platform_dir=None, params={},
                platform="remote", qlen=0, relax=True, debug_level="info",
-               socket_recv_size=None, log_file=None, device_sockets=[], timeout=0, custom_options="",
+               socket_recv_size=None, log_file=None,
+               ptf_collect_dir="./logs/ptf_collect/",
+               device_sockets=[], timeout=0, custom_options="",
                module_ignore_errors=False, is_python3=None, async_mode=False, pdb=False):
-
     dut_type = get_dut_type(host)
     kvm_support = params.get("kvm_support", False)
     if dut_type == "kvm" and kvm_support is False:
@@ -201,15 +206,15 @@ def ptf_runner(host, testdir, testname, platform_dir=None, params={},
         result = host.shell(cmd, chdir="/root", module_ignore_errors=module_ignore_errors, module_async=async_mode)
         if not async_mode:
             if log_file:
-                ptf_collect(host, log_file)
+                ptf_collect(host, log_file, dst_dir=ptf_collect_dir)
             if result:
                 allure.attach(json.dumps(result, indent=4), 'ptf_console_result', allure.attachment_type.TEXT)
         if module_ignore_errors:
             if result["rc"] != 0:
                 return result
     except Exception:
         if log_file:
-            ptf_collect(host, log_file)
+            ptf_collect(host, log_file, dst_dir=ptf_collect_dir)
         traceback_msg = traceback.format_exc()
         allure.attach(traceback_msg, 'ptf_runner_exception_traceback', allure.attachment_type.TEXT)
         logger.error("Exception caught while executing case: {}. Error message: {}".format(testname, traceback_msg))

diff --git a/tests/upgrade_path/conftest.py b/tests/upgrade_path/conftest.py
@@ -4,6 +4,9 @@
 def pytest_runtest_setup(item):
     from_list = item.config.getoption('base_image_list')
     to_list = item.config.getoption('target_image_list')
+    multi_hop_upgrade_path = item.config.getoption('multi_hop_upgrade_path')
+    if multi_hop_upgrade_path:
+        return
     if not from_list or not to_list:
         pytest.skip("base_image_list or target_image_list is empty")