From abf6d447d155daf5284401ced0e7084a568aaeb2 Mon Sep 17 00:00:00 2001 From: Tytus Bierwiaczonek Date: Tue, 21 Jun 2022 16:50:15 +0200 Subject: [PATCH] Titusen/slow fast path alternately (#2689) * WIP * WIP fast path recovery * Review comments application * Review comments application * Fail on linearizability * Final stable version * Final stable version Co-authored-by: Tytus Bierwiaczonek --- tests/apollo/test_skvbc_restart_recovery.py | 86 ++++++++++++++------- tests/apollo/util/bft.py | 6 +- 2 files changed, 62 insertions(+), 30 deletions(-) diff --git a/tests/apollo/test_skvbc_restart_recovery.py b/tests/apollo/test_skvbc_restart_recovery.py index fc94b2324d..a966a30536 100644 --- a/tests/apollo/test_skvbc_restart_recovery.py +++ b/tests/apollo/test_skvbc_restart_recovery.py @@ -44,6 +44,9 @@ def start_replica_cmd(builddir, replica_id): "-e", str(True) ] +class foo: + def log_message(self, var): + print(f"{var}") class SkvbcRestartRecoveryTest(ApolloTest): @@ -79,10 +82,6 @@ async def test_restarting_replica_with_client_load(self, bft_network): bft_network.all_replicas(without={primary_replica})) # uncomment for live tracking of log messages from the test - # class foo: - # def log_message(self, var): - # print(f"{var}") - # # log = foo() for v in range(300): @@ -120,13 +119,6 @@ async def _restarting_replica_during_system_is_in_view_change(self, bft_network, next_primary = 1 view = 0 - # uncomment for live tracking of log messages from the test - # class foo: - # def log_message(self, var): - # print(f"{var}") - # - # log = foo() - # Perform multiple view changes and restart 1 replica while the replicas are agreeing the new View while view < 100: # Pick one replica to restart while the others are agreeing the next View. @@ -245,10 +237,6 @@ async def test_restarting_f_replicas_for_view_change(self, bft_network): view = 0 # uncomment for live tracking of log messages from the test - # class foo: - # def log_message(self, var): - # print(f"{var}") - # # log = foo() # Perform multiple view changes by restarting F replicas where the Primary is included @@ -320,6 +308,62 @@ async def test_restarting_f_replicas_for_view_change(self, bft_network): await bft_network.wait_for_fast_path_to_be_prevalent( run_ops=lambda: skvbc.run_concurrent_ops(num_ops=20, write_weight=1), threshold=20) + @with_trio + @with_bft_network(start_replica_cmd, selected_configs=lambda n, f, c: c == 0, rotate_keys=True) + @with_constant_load + async def test_recovering_fast_path(self, bft_network, skvbc, constant_load): + """ + The Apollo test, which should be part of the test_skvbc_restart_recovery suite needs to implement the following steps: + + 1. Start all Replicas and Introduce constantly running Client requests in the background. + 2. Verify that the system is processing client requests on Fast Path. + 3. Restart the Primary to initiate View Change. + 4. Verify the View Change succeeded. + 5. Wait for the system to recover Fast commit path again (Here we will have to set a time limit for which we expect the system to recover to Fast path. We can start with 1 minute interval.) + 6. Stop 1 Non Primary replica to transition the system to Slow path. + 7. Verify the system is making progress on Slow path. + 8. Restart the Primary to initiate View Change. + 9. Start the Replica we stopped in step 6 and verify the View Change succeeded. + 10. Goto Step 2. + """ + # start replicas + [bft_network.start_replica(i) for i in bft_network.all_replicas()] + + # log = foo() + + loop_count = 0 + while (loop_count < 100): + loop_count = loop_count + 1 + + view = await bft_network.get_current_view() + + primary = await bft_network.get_current_primary() + bft_network.stop_replica(primary) + await trio.sleep(seconds=10) + bft_network.start_replica(primary) + + await bft_network.wait_for_replicas_to_reach_at_least_view(replicas_ids=bft_network.all_replicas(), expected_view=view + 1, timeout=60) + + await bft_network.wait_for_fast_path_to_be_prevalent( + run_ops=lambda: skvbc.run_concurrent_ops(num_ops=20, write_weight=1), threshold=20, timeout=180) + + view = await bft_network.get_current_view() + + non_primary = primary + bft_network.stop_replica(non_primary) + + primary = await bft_network.get_current_primary() + + await bft_network.wait_for_slow_path_to_be_prevalent( + run_ops=lambda: skvbc.run_concurrent_ops(num_ops=20, write_weight=1), threshold=20, replica_id=primary, timeout=180) + + bft_network.stop_replica(primary) + bft_network.start_replica(primary) + + await bft_network.wait_for_replicas_to_reach_at_least_view(replicas_ids=bft_network.all_replicas(), expected_view=view + 1, timeout=60) + + bft_network.start_replica(non_primary) + @with_trio @with_bft_network(start_replica_cmd, selected_configs=lambda n, f, c: c == 0, rotate_keys=True) @verify_linearizability() @@ -407,13 +451,6 @@ async def test_view_change_with_non_primary_replica_in_state_transfer(self, bft_ 8) Goto Step 2. """ - # uncomment for live tracking of log messages from the test - # class foo: - # def log_message(self, var): - # print(f"{var}") - # - # log = foo() - # start replicas [bft_network.start_replica(i) for i in bft_network.all_replicas()] @@ -488,12 +525,7 @@ async def test_recovering_view_after_restart_with_packet_loss(self, bft_network, 5. Loop to step 3. """ # uncomment for live tracking of log messages from the test - # class foo: - # def log_message(self, var): - # print(f"{var}") - # log = foo() - # start replicas [bft_network.start_replica(i) for i in bft_network.all_replicas()] loop_count_outer = 0 diff --git a/tests/apollo/util/bft.py b/tests/apollo/util/bft.py index f23d22d5fc..e85e326393 100644 --- a/tests/apollo/util/bft.py +++ b/tests/apollo/util/bft.py @@ -1391,13 +1391,13 @@ async def expected_checkpoint_to_be_reached(): return await self.wait_for(expected_checkpoint_to_be_reached, 30, .5) - async def wait_for_fast_path_to_be_prevalent(self, run_ops, threshold, replica_id=0): + async def wait_for_fast_path_to_be_prevalent(self, run_ops, threshold, replica_id=0, timeout=90): await self._wait_for_consensus_path_to_be_prevalent( fast=True, run_ops=run_ops, threshold=threshold, replica_id=replica_id) - async def wait_for_slow_path_to_be_prevalent(self, run_ops, threshold, replica_id=0): + async def wait_for_slow_path_to_be_prevalent(self, run_ops, threshold, replica_id=0, timeout=90): await self._wait_for_consensus_path_to_be_prevalent( - fast=False, run_ops=run_ops, threshold=threshold, replica_id=replica_id) + fast=False, run_ops=run_ops, threshold=threshold, replica_id=replica_id, timeout=timeout) async def _wait_for_consensus_path_to_be_prevalent(self, fast, run_ops, threshold, replica_id=0, timeout=90): """