From aa2552ec6eaeffb8a481b3a68b542636a8c2f078 Mon Sep 17 00:00:00 2001 From: Kush Agrawal Date: Tue, 26 Jul 2022 02:54:16 +0530 Subject: [PATCH] Fix and enable tests in chaotic startup 1. test_inactive_window_catchup_up_to_gap 2. test_missed_two_view_changes --- .../workflows/build_and_test_gcc_debug.yml | 6 +- .../workflows/build_and_test_gcc_release.yml | 4 +- Makefile | 21 +++-- tests/apollo/test_skvbc_chaotic_startup.py | 92 ++++++++++--------- 4 files changed, 67 insertions(+), 56 deletions(-) diff --git a/.github/workflows/build_and_test_gcc_debug.yml b/.github/workflows/build_and_test_gcc_debug.yml index 59dc77efb0..2f6d3f5bc4 100644 --- a/.github/workflows/build_and_test_gcc_debug.yml +++ b/.github/workflows/build_and_test_gcc_debug.yml @@ -17,7 +17,7 @@ jobs: compiler: - "CONCORD_BFT_CONTAINER_CC=gcc CONCORD_BFT_CONTAINER_CXX=g++" ci_build_type: - - "-DCMAKE_BUILD_TYPE=DEBUG -DBUILD_COMM_TCP_TLS=FALSE" + - "-DCMAKE_BUILD_TYPE=DEBUG -DBUILD_COMM_TCP_TLS=TRUE" use_s3_obj_store: - "-DUSE_S3_OBJECT_STORE=ON" steps: @@ -58,10 +58,10 @@ jobs: -DUSE_OPENTRACING=ON \ -DOMIT_TEST_OUTPUT=OFF\ -DKEEP_APOLLO_LOGS=TRUE\ - -DRUN_APOLLO_TESTS=FALSE\ + -DRUN_APOLLO_TESTS=TRUE\ -DUSE_FAKE_CLOCK_IN_TIME_SERVICE=TRUE\" "\ && script -q -e -c "make simple-test" \ - && script -q -e -c "make test" + && script -q -e -c "make test-single-suite TEST_NAME=skvbc_chaotic_startup NUM_REPEATS=100 BREAK_ON_FAILURE=FALSE" - name: Prepare artifacts if: failure() run: | diff --git a/.github/workflows/build_and_test_gcc_release.yml b/.github/workflows/build_and_test_gcc_release.yml index b7bfe63eb3..3b4021480c 100644 --- a/.github/workflows/build_and_test_gcc_release.yml +++ b/.github/workflows/build_and_test_gcc_release.yml @@ -58,9 +58,9 @@ jobs: -DUSE_OPENTRACING=ON \ -DOMIT_TEST_OUTPUT=OFF\ -DKEEP_APOLLO_LOGS=TRUE\ - -DRUN_APOLLO_TESTS=FALSE\ + -DRUN_APOLLO_TESTS=TRUE\ -DUSE_FAKE_CLOCK_IN_TIME_SERVICE=TRUE\" "\ - && script -q -e -c "make test" + && script -q -e -c "make test-single-suite TEST_NAME=skvbc_chaotic_startup NUM_REPEATS=100 BREAK_ON_FAILURE=FALSE" - name: Prepare artifacts if: failure() run: | diff --git a/Makefile b/Makefile index 1f985dbbde..afb1318b4a 100644 --- a/Makefile +++ b/Makefile @@ -248,12 +248,21 @@ test-range: ## Run all tests in the range [START,END], inclusive: `make test-ran ctest ${CONCORD_BFT_ADDITIONAL_CTEST_RUN_PARAMS} -I ${START},${END}" .PHONY: test-single-suite -test-single-suite: ## Run a single test `make test-single-suite TEST_NAME=` - docker run ${BASIC_RUN_PARAMS} \ - ${CONCORD_BFT_CONTAINER_SHELL} -c \ - "mkdir -p ${CONCORD_BFT_CORE_DIR} && \ - cd ${CONCORD_BFT_BUILD_DIR} && \ - ctest ${CONCORD_BFT_ADDITIONAL_CTEST_RUN_PARAMS} -V -R ${TEST_NAME} --timeout ${CONCORD_BFT_CTEST_TIMEOUT} --output-on-failure" +test-single-suite: SHELL:=/bin/bash +test-single-suite: ## Run a single test `make test-single-suite TEST_NAME= NUM_REPEATS= BREAK_ON_FAILURE=`. Example: `make test-single-suite TEST_NAME=timers_tests BREAK_ON_FAILURE=TRUE NUM_REPEATS=3` + num_failures=0; \ + for (( i=1; i<=${NUM_REPEATS__}; i++ )); do \ + echo "=== Starting iteration $${i}/${NUM_REPEATS__}"; \ + docker run ${BASIC_RUN_PARAMS} ${CONCORD_BFT_CONTAINER_SHELL} -c \ + "mkdir -p ${CONCORD_BFT_CORE_DIR} && cd ${CONCORD_BFT_BUILD_DIR} && \ + ctest ${CONCORD_BFT_ADDITIONAL_CTEST_RUN_PARAMS} -V -R ${TEST_NAME} --timeout ${CONCORD_BFT_CTEST_TIMEOUT} --output-on-failure"; \ + RESULT=$$?; \ + if [[ $${RESULT} -ne 0 ]];then \ + (( num_failures=num_failures+1 )); \ + if [[ '${BREAK_ON_FAILURE__}' = 'TRUE' ]];then echo "Breaking on first failure! (iteration $$i)"; exit $${RESULT}; fi; fi; \ + done; \ + echo "Test ${TEST_NAME} completed ${NUM_REPEATS__} iterations" \ + "($$((${NUM_REPEATS__}-num_failures)) succeed, $${num_failures} failed)"; .PHONY: test-single-apollo-case test-single-apollo-case: ## Run a single Apollo test case: `make test-single-apollo-case TEST_FILE_NAME= TEST_CASE_NAME= NUM_REPEATS= BREAK_ON_FAILURE=`. Test suite file name should come without *.py. Test case is expected without a class name, and must be unique. Example: `make test-single-apollo-case BREAK_ON_FAILURE=TRUE NUM_REPEATS=100 TEST_FILE_NAME=test_skvbc_reconfiguration TEST_CASE_NAME=test_tls_exchange_client_replica_with_st` diff --git a/tests/apollo/test_skvbc_chaotic_startup.py b/tests/apollo/test_skvbc_chaotic_startup.py index 9aa5c32eb7..0586b7e917 100644 --- a/tests/apollo/test_skvbc_chaotic_startup.py +++ b/tests/apollo/test_skvbc_chaotic_startup.py @@ -51,7 +51,6 @@ class SkvbcChaoticStartupTest(ApolloTest): __test__ = False # so that PyTest ignores this test scenario - @unittest.skip("After CheckpointMsg-s forwarding, in this situation the late Replica initiates State Transfer.") @with_trio @with_bft_network(start_replica_cmd, selected_configs=lambda n, f, c: n == 7) async def test_inactive_window_catchup_up_to_gap(self, bft_network): @@ -84,6 +83,8 @@ async def write_req(num_req=1): for _ in range(num_req): await skvbc.send_write_kv_set() + await trio.sleep(1) + with net.ReplicaOneWayTwoSubsetsIsolatingAdversary( bft_network, {late_replica}, bft_network.all_replicas(without={primary, late_replica})) as adversary: @@ -107,38 +108,37 @@ async def write_req(num_req=1): while True: last_exec = await bft_network.get_metric(late_replica, bft_network, 'Gauges', "lastExecutedSeqNum") log.log_message(message_type=f"replica = {late_replica}; lase_exec = {last_exec}") - if last_exec == seq_nums_per_checkpoint + num_reqs_after_first_checkpoint: + if last_exec == seq_nums_per_checkpoint + num_reqs_after_first_checkpoint + 1: break await trio.sleep(seconds=0.3) bft_network.stop_replica(late_replica) - # create 2 checkpoints and wait for checkpoint propagation - await skvbc.fill_and_wait_for_checkpoint( - initial_nodes=bft_network.all_replicas(without={late_replica}), - num_of_checkpoints_to_add=checkpoints_to_advance_after_first, - verify_checkpoint_persistency=False - ) - - await bft_network.wait_for_replicas_to_collect_stable_checkpoint( - bft_network.all_replicas(without={late_replica}), - first_stable_checkpoint_to_reach + checkpoints_to_advance_after_first) + # create 2 checkpoints and wait for checkpoint propagation + await skvbc.fill_and_wait_for_checkpoint( + initial_nodes=bft_network.all_replicas(without={late_replica}), + num_of_checkpoints_to_add=checkpoints_to_advance_after_first, + verify_checkpoint_persistency=False + ) - bft_network.start_replica(late_replica) - with trio.fail_after(seconds=30): - - late_replica_catch_up = False - while not late_replica_catch_up: - for replica_id in bft_network.get_live_replicas(): - last_stable = await bft_network.get_metric(replica_id, bft_network, 'Gauges', "lastStableSeqNum") - last_exec = await bft_network.get_metric(replica_id, bft_network, 'Gauges', "lastExecutedSeqNum") - log.log_message(message_type=f"replica = {replica_id}; last_stable = {last_stable}; lase_exec = {last_exec}") - if replica_id == late_replica and last_exec == 2*seq_nums_per_checkpoint: - late_replica_catch_up = True + await bft_network.wait_for_replicas_to_collect_stable_checkpoint( + bft_network.all_replicas(without={late_replica}), + first_stable_checkpoint_to_reach + checkpoints_to_advance_after_first) - await write_req() - await trio.sleep(seconds=3) + bft_network.start_replica(late_replica) + with trio.fail_after(seconds=30): + late_replica_catch_up = False + while not late_replica_catch_up: + for replica_id in bft_network.get_live_replicas(): + last_stable = await bft_network.get_metric(replica_id, bft_network, 'Gauges', "lastStableSeqNum") + last_exec = await bft_network.get_metric(replica_id, bft_network, 'Gauges', "lastExecutedSeqNum") + log.log_message(message_type=f"replica = {replica_id}; last_stable = {last_stable}; lase_exec = {last_exec}") + if replica_id == late_replica and last_exec >= 3*seq_nums_per_checkpoint: + late_replica_catch_up = True + await write_req() + await trio.sleep(seconds=3) + @unittest.skip("Testing in CI/CD") @skip_for_tls @with_trio @with_bft_network(start_replica_cmd, selected_configs=lambda n, f, c: n == 7) @@ -301,7 +301,6 @@ async def write_req(num_req=1): # @unittest.skipIf(environ.get('BUILD_COMM_TCP_TLS', "").lower() == "true", "Unstable on CI (TCP/TLS only)") - @unittest.skip("Disabled due to BC-6816") @with_trio @with_bft_network(start_replica_cmd, selected_configs=lambda n, f, c: n == 7) @with_constant_load @@ -337,25 +336,26 @@ async def write_req(): bft_network.stop_replica(late_replica) for isolated_replica, views_to_advance in [(0, 1), (1, 2)]: - with net.ReplicaSubsetTwoWayIsolatingAdversary(bft_network, {isolated_replica}) as adversary: - adversary.interfere() - try: - client = bft_network.random_client() - client.primary = None - for _ in range(5): - msg = skvbc.write_req( - [], [(skvbc.random_key(), skvbc.random_value())], 0) - await client.write(msg) - except: - pass - - # Wait for View Change initiation to happen - with trio.fail_after(60): - while True: - view_of_connected_replica = await self._get_gauge(connected_replica, bft_network, "currentActiveView") - if view_of_connected_replica == current_view + views_to_advance: - break - await trio.sleep(0.2) + bft_network.stop_replica(isolated_replica) + try: + client = bft_network.random_client() + client.primary = None + for _ in range(5): + msg = skvbc.write_req( + [], [(skvbc.random_key(), skvbc.random_value())], 0) + await client.write(msg) + except: + pass + + # Wait for View Change initiation to happen + with trio.fail_after(60): + while True: + view_of_connected_replica = await self._get_gauge(connected_replica, bft_network, "currentActiveView") + if view_of_connected_replica == current_view + views_to_advance: + break + await trio.sleep(0.2) + + bft_network.start_replica(isolated_replica) view = await bft_network.wait_for_view( replica_id=connected_replica, @@ -377,6 +377,7 @@ async def write_req(): await bft_network.wait_for_fast_path_to_be_prevalent( run_ops=lambda: write_req(), threshold=num_req) + @unittest.skip("Testing in CI/CD") @with_trio @with_bft_network(start_replica_cmd, selected_configs=lambda n, f, c: n == 7) @with_constant_load @@ -625,6 +626,7 @@ async def test_f_minus_one_staggered_replicas_requesting_vc(self, bft_network, s await self._wait_for_replicas_to_generate_checkpoint(bft_network, skvbc, expected_next_primary, bft_network.all_replicas(without={initial_primary})) + @unittest.skip("Testing in CI/CD") @skip_for_tls @with_trio @with_bft_network(start_replica_cmd_with_vc_timeout("20000"),