Skip to content

Commit

Permalink
Added multi-hop SONiC upgrade path test case (sonic-net#14563)
Browse files Browse the repository at this point in the history
* Added multi-hop test case

* Added consistency checker to multi-hop test case

* Fixed a bug where some logs would be missing in multi-hop test

The following log files were missing:
 - capture.pcap
 - capture_filtered.pcap
 - warm-reboot-report.json
 - warm-reboot.log
This didn't cause the test to fail; the files simply weren't being captured.
This change makes it so that they are captured.

* Renamed 'set_base_image_a' to be more descriptive
  • Loading branch information
Ryangwaite authored Dec 10, 2024
1 parent 882fb10 commit 6bf773b
Show file tree
Hide file tree
Showing 9 changed files with 315 additions and 71 deletions.
98 changes: 92 additions & 6 deletions tests/common/fixtures/advanced_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,12 +421,14 @@ def __clearArpAndFdbTables(self):
logger.info('Clearing all fdb entries on DUT {}'.format(self.duthost.hostname))
self.duthost.shell('sonic-clear fdb all')

def __fetchTestLogs(self, rebootOper=None):
def __fetchTestLogs(self, rebootOper=None, log_dst_suffix=None):
"""
Fetch test logs from duthost and ptfhost after individual test run
Fetch test logs from duthost and ptfhost.
@param rebootOper: if provided it will be added to each individual file name
@param log_dst_suffix: if provided it will be appended to the directory name
"""
if rebootOper:
dir_name = "{}_{}".format(self.request.node.name, rebootOper)
if log_dst_suffix:
dir_name = "{}_{}".format(self.request.node.name, log_dst_suffix)
else:
dir_name = self.request.node.name
report_file_dir = os.path.realpath((os.path.join(os.path.dirname(__file__), "../../logs/platform_tests/")))
Expand Down Expand Up @@ -596,7 +598,7 @@ def runRebootTest(self):
if self.postboot_setup:
self.postboot_setup()
# capture the test logs, and print all of them in case of failure, or a summary in case of success
log_dir = self.__fetchTestLogs(rebootOper)
log_dir = self.__fetchTestLogs(rebootOper, log_dst_suffix=rebootOper)
self.print_test_logs_summary(log_dir)
if self.advanceboot_loganalyzer and post_reboot_analysis:
verification_errors = post_reboot_analysis(marker, event_counters=event_counters,
Expand Down Expand Up @@ -630,6 +632,88 @@ def runRebootTestcase(self, prebootList=None, inbootList=None, prebootFiles='pee
self.imageInstall(prebootList, inbootList, prebootFiles)
return self.runRebootTest()

def runMultiHopRebootTestcase(self, upgrade_path_urls, prebootFiles='peer_dev_info,neigh_port_info',
                              base_image_setup=None, pre_hop_setup=None,
                              post_hop_teardown=None, multihop_advanceboot_loganalyzer_factory=None):
    """
    Install the base image and then run the PTF advanced-reboot test once for every upgrade hop.

    A failure on any hop is reported through pytest_assert right after that hop's logs have been
    collected, so the failure message names the hop (and the image pair) that actually failed.

    @param upgrade_path_urls: ordered image URLs; index 0 is the base image and every following
                              entry is one upgrade hop (hop indexes start at 1)
    @param prebootFiles: preboot files
    @param base_image_setup: optional callable, invoked without arguments once the base image is installed
    @param pre_hop_setup: optional callable, invoked with the hop index before each hop
    @param post_hop_teardown: optional callable, invoked with the hop index after each hop was verified
    @param multihop_advanceboot_loganalyzer_factory: optional callable taking the hop index and returning
                              a (pre_reboot_analysis, post_reboot_analysis) pair
    @return: True when every hop passed
    """
    # Install image A (base image)
    self.imageInstall(None, None, prebootFiles)
    if base_image_setup:
        base_image_setup()

    test_results = dict()
    test_case_name = str(self.request.node.name)
    test_results[test_case_name] = list()
    for hop_index, _ in enumerate(upgrade_path_urls[1:], start=1):
        # Reset the per-hop analysis state up front. The finally clause below reads these names even
        # when the hop fails before they are assigned; without the reset the first hop would raise
        # UnboundLocalError there and later hops would silently reuse the previous hop's values.
        post_reboot_analysis = None
        marker = None
        event_counters = None
        try:
            if pre_hop_setup:
                pre_hop_setup(hop_index)
            if multihop_advanceboot_loganalyzer_factory:
                pre_reboot_analysis, hop_post_reboot_analysis = \
                    multihop_advanceboot_loganalyzer_factory(hop_index)
                marker = pre_reboot_analysis()
                # Only arm the post-reboot analysis once a marker for this hop exists
                post_reboot_analysis = hop_post_reboot_analysis
            # NOTE(review): kept unconditional because the matching verify/revert calls are
            # unconditional as well - confirm against the original indentation.
            event_counters = self.__setupRebootOper(None)

            # Run the upgrade
            thread = InterruptableThread(
                target=self.__runPtfRunner,
                kwargs={"ptf_collect_dir": "./logs/ptf_collect/hop{}/".format(hop_index)})
            thread.daemon = True
            thread.start()
            # give the test REBOOT_CASE_TIMEOUT (1800s) to complete the reboot with IO,
            # and then additional 300s to examine the pcap, logs and generate reports
            ptf_timeout = REBOOT_CASE_TIMEOUT + 300
            thread.join(timeout=ptf_timeout, suppress_exception=True)
            self.ptfhost.shell("pkill -f 'ptftests advanced-reboot.ReloadTest'", module_ignore_errors=True)
            # the thread might still be running, and to catch any exceptions after pkill allow 10s to join
            thread.join(timeout=10)

            self.__verifyRebootOper(None)
            if self.duthost.num_asics() == 1 and not check_bgp_router_id(self.duthost, self.mgFacts):
                test_results[test_case_name].append("Failed to verify BGP router identifier is Loopback0 on %s" %
                                                    self.duthost.hostname)
            if post_hop_teardown:
                post_hop_teardown(hop_index)
        except Exception:
            traceback_msg = traceback.format_exc()
            err_msg = "Exception caught while running advanced-reboot test on ptf: \n{}".format(traceback_msg)
            logger.error(err_msg)
            test_results[test_case_name].append(err_msg)
        finally:
            # capture the test logs, and print all of them in case of failure, or a summary in case of success
            log_dir = self.__fetchTestLogs(log_dst_suffix="hop{}".format(hop_index))
            self.print_test_logs_summary(log_dir)
            if post_reboot_analysis:
                verification_errors = post_reboot_analysis(marker, event_counters=event_counters, log_dir=log_dir)
                if verification_errors:
                    logger.error("Post reboot verification failed. List of failures: {}"
                                 .format('\n'.join(verification_errors)))
                    test_results[test_case_name].extend(verification_errors)
            self.acl_manager_checker(test_results[test_case_name])
            self.__clearArpAndFdbTables()
            self.__revertRebootOper(None)

        # Fail fast on the hop that broke so the message reports the right image pair
        failed_list = [(testcase, failures) for testcase, failures in test_results.items() if failures]
        pytest_assert(len(failed_list) == 0, "Advanced-reboot failure. Failed multi-hop test {testname} "
                      "on update {hop_index} from {from_image} to {to_image}, "
                      "failure summary:\n{fail_summary}".format(
                          testname=self.request.node.name,
                          hop_index=hop_index,
                          from_image=upgrade_path_urls[hop_index-1],
                          to_image=upgrade_path_urls[hop_index],
                          fail_summary=failed_list
                      ))
    return True  # Success

def __setupRebootOper(self, rebootOper):
if self.dual_tor_mode:
for device in self.duthosts:
Expand Down Expand Up @@ -694,10 +778,11 @@ def __revertRebootOper(self, rebootOper):
logger.info('Running revert handler for reboot operation {}'.format(rebootOper))
rebootOper.revert()

def __runPtfRunner(self, rebootOper=None):
def __runPtfRunner(self, rebootOper=None, ptf_collect_dir="./logs/ptf_collect/"):
"""
Run single PTF advanced-reboot.ReloadTest
@param rebootOper:Reboot operation to conduct before/during reboot process
@param ptf_collect_dir: PTF log collection directory
"""
logger.info("Running PTF runner on PTF host: {0}".format(self.ptfhost))

Expand Down Expand Up @@ -775,6 +860,7 @@ def __runPtfRunner(self, rebootOper=None):
platform="remote",
params=params,
log_file='/tmp/advanced-reboot.ReloadTest.log',
ptf_collect_dir=ptf_collect_dir,
module_ignore_errors=self.moduleIgnoreErrors,
timeout=REBOOT_CASE_TIMEOUT,
is_python3=True
Expand Down
22 changes: 22 additions & 0 deletions tests/common/helpers/upgrade_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,28 @@ def upgrade_test_helper(duthost, localhost, ptfhost, from_image, to_image,
ptfhost.shell('supervisorctl stop ferret')


def multi_hop_warm_upgrade_test_helper(duthost, localhost, ptfhost, tbinfo, get_advanced_reboot, upgrade_type,
                                       upgrade_path_urls, base_image_setup=None, pre_hop_setup=None,
                                       post_hop_teardown=None, multihop_advanceboot_loganalyzer_factory=None,
                                       enable_cpa=False):
    """
    Run a multi-hop upgrade test through the advanced-reboot fixture.

    The reboot command is derived from `upgrade_type`. When `enable_cpa` is set and that command is a
    warm-reboot, ferret is set up on the PTF host and the PTF IP is appended to the command with `-c`.
    Ferret is stopped again when the run ends, whether it passed or raised.

    Args:
        duthost: DUT host the upgrade is performed on
        localhost: not used by this helper
        ptfhost: PTF host used for ferret and for resolving the PTF IP address
        tbinfo: testbed info handed to setup_ferret
        get_advanced_reboot: factory called with `rebootType` that returns the advanced-reboot object
        upgrade_type: upgrade type used to look up the reboot command
        upgrade_path_urls: image URLs forwarded to runMultiHopRebootTestcase
        base_image_setup: optional callback forwarded to runMultiHopRebootTestcase
        pre_hop_setup: optional callback forwarded to runMultiHopRebootTestcase
        post_hop_teardown: optional callback forwarded to runMultiHopRebootTestcase
        multihop_advanceboot_loganalyzer_factory: optional factory forwarded to runMultiHopRebootTestcase
        enable_cpa: enable CPA for warm-reboot commands
    """
    reboot_type = get_reboot_command(duthost, upgrade_type)
    # Decide once; appending the "-c" option below does not change the outcome of this check
    use_cpa = enable_cpa and "warm-reboot" in reboot_type
    if use_cpa:
        # always do warm-reboot with CPA enabled
        setup_ferret(duthost, ptfhost, tbinfo)
        ptf_ip = ptfhost.host.options['inventory_manager'].get_host(ptfhost.hostname).vars['ansible_host']
        reboot_type = reboot_type + " -c {}".format(ptf_ip)

    try:
        advancedReboot = get_advanced_reboot(rebootType=reboot_type)
        advancedReboot.runMultiHopRebootTestcase(
            upgrade_path_urls, base_image_setup=base_image_setup, pre_hop_setup=pre_hop_setup,
            post_hop_teardown=post_hop_teardown,
            multihop_advanceboot_loganalyzer_factory=multihop_advanceboot_loganalyzer_factory)
    finally:
        # Stop ferret even when the test failed so it is not left running on the PTF host
        if use_cpa:
            ptfhost.shell('supervisorctl stop ferret')


def check_asic_and_db_consistency(pytest_config, duthost, consistency_checker_provider):
if not pytest_config.getoption("enable_consistency_checker"):
logger.info("Consistency checker is not enabled. Skipping check.")
Expand Down
6 changes: 6 additions & 0 deletions tests/common/platform/args/advanced_reboot_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,12 @@ def add_advanced_reboot_args(parser):
help="Specify the target image(s) for upgrade (comma seperated list is allowed)",
)

parser.addoption(
"--multi_hop_upgrade_path",
default="",
help="Specify the multi-hop upgrade path as a comma separated list of image URLs to download",
)

parser.addoption(
"--restore_to_image",
default="",
Expand Down
79 changes: 59 additions & 20 deletions tests/common/platform/device_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -738,18 +738,8 @@ def verify_required_events(duthost, event_counters, timing_data, verification_er
format(observed_start_count, observed_end_count))


@pytest.fixture()
def advanceboot_loganalyzer(duthosts, enum_rand_one_per_hwsku_frontend_hostname, request):
"""
Advance reboot log analysis.
This fixture starts log analysis at the beginning of the test. At the end,
the collected expect messages are verified and timing of start/stop is calculated.
Args:
duthosts : List of DUT hosts
enum_rand_one_per_hwsku_frontend_hostname: hostname of a randomly selected DUT
"""
duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname]
def advanceboot_loganalyzer_factory(duthost, request, marker_postfix=None):
"""Create pre-reboot and post-reboot analysis functions via `LogAnalyzer` with optional marker postfix"""
test_name = request.node.name
if "upgrade_path" in test_name:
reboot_type_source = request.config.getoption("--upgrade_type")
Expand All @@ -761,18 +751,13 @@ def advanceboot_loganalyzer(duthosts, enum_rand_one_per_hwsku_frontend_hostname,
reboot_type = "fast"
else:
reboot_type = "unknown"
# Currently, advanced reboot test would skip for kvm platform if the test has no device_type marker for vs.
# Doing the same skip logic in this fixture to avoid running loganalyzer without the test executed
if duthost.facts['platform'] == 'x86_64-kvm_x86_64-r0':
device_marks = [arg for mark in request.node.iter_markers(
name='device_type') for arg in mark.args]
if 'vs' not in device_marks:
pytest.skip('Testcase not supported for kvm')
platform = duthost.facts["platform"]
logs_in_tmpfs = list()

marker_prefix = "test_advanced_reboot_{}".format(test_name) if not marker_postfix else\
"test_advanced_reboot_{}_{}".format(test_name, marker_postfix)
loganalyzer = LogAnalyzer(
ansible_host=duthost, marker_prefix="test_advanced_reboot_{}".format(test_name))
ansible_host=duthost, marker_prefix=marker_prefix)
base_os_version = list()

def bgpd_log_handler(preboot=False):
Expand Down Expand Up @@ -926,9 +911,63 @@ def post_reboot_analysis(marker, event_counters=None, reboot_oper=None, log_dir=
duthost, event_counters, analyze_result, verification_errors)
return verification_errors

return pre_reboot_analysis, post_reboot_analysis


@pytest.fixture()
def advanceboot_loganalyzer(duthosts, enum_rand_one_per_hwsku_frontend_hostname, request):
    """
    Provide the pre-reboot and post-reboot log analysis callables for an advanced reboot test.

    The pair is built by `advanceboot_loganalyzer_factory` for the selected DUT and yielded to the test.

    Args:
        duthosts : List of DUT hosts
        enum_rand_one_per_hwsku_frontend_hostname: hostname of a randomly selected DUT
        request: pytest request fixture
    """
    duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname]
    # The advanced reboot test is skipped on the kvm platform unless it carries a device_type marker
    # listing 'vs'. Apply the same skip here so loganalyzer is not run for a test that never executes.
    on_kvm = duthost.facts['platform'] == 'x86_64-kvm_x86_64-r0'
    if on_kvm:
        device_type_markers = request.node.iter_markers(name='device_type')
        if not any('vs' in mark.args for mark in device_type_markers):
            pytest.skip('Testcase not supported for kvm')

    analyzers = advanceboot_loganalyzer_factory(duthost, request)
    before_reboot, after_reboot = analyzers
    yield before_reboot, after_reboot


@pytest.fixture()
def multihop_advanceboot_loganalyzer_factory(duthosts, enum_rand_one_per_hwsku_frontend_hostname, request):
    """
    Provide a per-hop factory of log analysis callables for multi-hop advanced reboot tests.

    The yielded function takes a hop_index and returns the (pre_reboot_analysis, post_reboot_analysis)
    pair built by `advanceboot_loganalyzer_factory` with the marker postfix "hop-<hop_index>".

    Args:
        duthosts : List of DUT hosts
        enum_rand_one_per_hwsku_frontend_hostname: hostname of a randomly selected DUT
        request: pytest request fixture
    """
    duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname]
    # The advanced reboot test is skipped on the kvm platform unless it carries a device_type marker
    # listing 'vs'. Apply the same skip here so loganalyzer is not run for a test that never executes.
    on_kvm = duthost.facts['platform'] == 'x86_64-kvm_x86_64-r0'
    if on_kvm:
        device_type_markers = request.node.iter_markers(name='device_type')
        if not any('vs' in mark.args for mark in device_type_markers):
            pytest.skip('Testcase not supported for kvm')

    def _build_hop_analyzers(hop_index):
        hop_postfix = "hop-{}".format(hop_index)
        before_reboot, after_reboot = advanceboot_loganalyzer_factory(
            duthost, request, marker_postfix=hop_postfix)
        return before_reboot, after_reboot

    yield _build_hop_analyzers


@pytest.fixture()
def advanceboot_neighbor_restore(duthosts, enum_rand_one_per_hwsku_frontend_hostname, nbrhosts, tbinfo):
"""
Expand Down
19 changes: 12 additions & 7 deletions tests/ptf_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,18 @@
logger = logging.getLogger(__name__)


def ptf_collect(host, log_file, skip_pcap=False):
def ptf_collect(host, log_file, skip_pcap=False, dst_dir='./logs/ptf_collect/'):
"""
Collect PTF log and pcap files from PTF container to sonic-mgmt container.
Optionally, save the files to a sub-directory in the destination.
"""
pos = log_file.rfind('.')
filename_prefix = log_file[0:pos] if pos > -1 else log_file

pos = filename_prefix.rfind('/') + 1
rename_prefix = filename_prefix[pos:] if pos > 0 else filename_prefix
suffix = str(datetime.utcnow()).replace(' ', '.')
filename_log = './logs/ptf_collect/' + rename_prefix + '.' + suffix + '.log'
filename_log = dst_dir + rename_prefix + '.' + suffix + '.log'
host.fetch(src=log_file, dest=filename_log, flat=True, fail_on_missing=False)
allure.attach.file(filename_log, 'ptf_log: ' + filename_log, allure.attachment_type.TEXT)
if skip_pcap:
Expand All @@ -31,7 +35,7 @@ def ptf_collect(host, log_file, skip_pcap=False):
compressed_pcap_file = pcap_file + '.tar.gz'
host.archive(path=pcap_file, dest=compressed_pcap_file, format='gz')
# Copy compressed file from ptf to sonic-mgmt
filename_pcap = './logs/ptf_collect/' + rename_prefix + '.' + suffix + '.pcap.tar.gz'
filename_pcap = dst_dir + rename_prefix + '.' + suffix + '.pcap.tar.gz'
host.fetch(src=compressed_pcap_file, dest=filename_pcap, flat=True, fail_on_missing=False)
allure.attach.file(filename_pcap, 'ptf_pcap: ' + filename_pcap, allure.attachment_type.PCAP)

Expand Down Expand Up @@ -101,9 +105,10 @@ def is_py3_compat(test_fpath):

def ptf_runner(host, testdir, testname, platform_dir=None, params={},
platform="remote", qlen=0, relax=True, debug_level="info",
socket_recv_size=None, log_file=None, device_sockets=[], timeout=0, custom_options="",
socket_recv_size=None, log_file=None,
ptf_collect_dir="./logs/ptf_collect/",
device_sockets=[], timeout=0, custom_options="",
module_ignore_errors=False, is_python3=None, async_mode=False, pdb=False):

dut_type = get_dut_type(host)
kvm_support = params.get("kvm_support", False)
if dut_type == "kvm" and kvm_support is False:
Expand Down Expand Up @@ -201,15 +206,15 @@ def ptf_runner(host, testdir, testname, platform_dir=None, params={},
result = host.shell(cmd, chdir="/root", module_ignore_errors=module_ignore_errors, module_async=async_mode)
if not async_mode:
if log_file:
ptf_collect(host, log_file)
ptf_collect(host, log_file, dst_dir=ptf_collect_dir)
if result:
allure.attach(json.dumps(result, indent=4), 'ptf_console_result', allure.attachment_type.TEXT)
if module_ignore_errors:
if result["rc"] != 0:
return result
except Exception:
if log_file:
ptf_collect(host, log_file)
ptf_collect(host, log_file, dst_dir=ptf_collect_dir)
traceback_msg = traceback.format_exc()
allure.attach(traceback_msg, 'ptf_runner_exception_traceback', allure.attachment_type.TEXT)
logger.error("Exception caught while executing case: {}. Error message: {}".format(testname, traceback_msg))
Expand Down
3 changes: 3 additions & 0 deletions tests/upgrade_path/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
def pytest_runtest_setup(item):
    """
    Skip the test when neither a multi-hop upgrade path nor both image lists were supplied.

    A non-empty `multi_hop_upgrade_path` option lets the test run regardless of the image lists;
    otherwise both `base_image_list` and `target_image_list` must be non-empty.
    """
    read_option = item.config.getoption
    base_images = read_option('base_image_list')
    target_images = read_option('target_image_list')
    if read_option('multi_hop_upgrade_path'):
        return
    if not (base_images and target_images):
        pytest.skip("base_image_list or target_image_list is empty")

Expand Down
Loading

0 comments on commit 6bf773b

Please sign in to comment.