From cc2aee7f3bb557b2fb0683b83c6d11a50014e614 Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 12:27:36 +0100 Subject: [PATCH 01/30] Add cets_test_node/cets_test_rpc helpers --- test/cets_SUITE.erl | 87 +++++++++-------------------------------- test/cets_test_node.erl | 37 ++++++++++++++++++ test/cets_test_rpc.erl | 52 ++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 68 deletions(-) create mode 100644 test/cets_test_node.erl create mode 100644 test/cets_test_rpc.erl diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index 9235526..edd4187 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -12,6 +12,25 @@ -compile([export_all, nowarn_export_all]). +-import(cets_test_node, [ + block_node/2, + reconnect_node/2, + disconnect_node/2, + disconnect_node_by_name/2 +]). + +-import(cets_test_rpc, [ + rpc/4, + insert/3, + insert_many/3, + delete/3, + delete_request/3, + delete_many/3, + dump/2, + other_nodes/2, + join/4 +]). + all() -> [ {group, cets}, @@ -3009,46 +3028,6 @@ wait_for_name_to_be_free(Node, Name) -> %% Cleaner is fast, but not instant. cets_test_wait:wait_until(fun() -> rpc(Node, erlang, whereis, [Name]) end, undefined). -insert(Node, Tab, Rec) -> - rpc(Node, cets, insert, [Tab, Rec]). - -insert_many(Node, Tab, Records) -> - rpc(Node, cets, insert_many, [Tab, Records]). - -delete(Node, Tab, Key) -> - rpc(Node, cets, delete, [Tab, Key]). - -delete_request(Node, Tab, Key) -> - rpc(Node, cets, delete_request, [Tab, Key]). - -delete_many(Node, Tab, Keys) -> - rpc(Node, cets, delete_many, [Tab, Keys]). - -dump(Node, Tab) -> - rpc(Node, cets, dump, [Tab]). - -other_nodes(Node, Tab) -> - rpc(Node, cets, other_nodes, [Tab]). - -join(Node1, Tab, Pid1, Pid2) -> - rpc(Node1, cets_join, join, [lock1, #{table => Tab}, Pid1, Pid2]). 
- -%% Apply function using rpc or peer module -rpc(Peer, M, F, Args) when is_pid(Peer) -> - case peer:call(Peer, M, F, Args) of - {badrpc, Error} -> - ct:fail({badrpc, Error}); - Other -> - Other - end; -rpc(Node, M, F, Args) when is_atom(Node) -> - case rpc:call(Node, M, F, Args) of - {badrpc, Error} -> - ct:fail({badrpc, Error}); - Other -> - Other - end. - receive_message(M) -> receive M -> ok @@ -3241,34 +3220,6 @@ wait_for_down(Pid) -> after 5000 -> ct:fail({wait_for_down_timeout, Pid}) end. -%% Disconnect node until manually connected -block_node(Node, Peer) when is_atom(Node), is_pid(Peer) -> - rpc(Peer, erlang, set_cookie, [node(), invalid_cookie]), - disconnect_node(Peer, node()), - %% Wait till node() is notified about the disconnect - cets_test_wait:wait_until(fun() -> rpc(Peer, net_adm, ping, [node()]) end, pang), - cets_test_wait:wait_until(fun() -> rpc(node(), net_adm, ping, [Node]) end, pang). - -reconnect_node(Node, Peer) when is_atom(Node), is_pid(Peer) -> - rpc(Peer, erlang, set_cookie, [node(), erlang:get_cookie()]), - %% Very rarely it could return pang - cets_test_wait:wait_until(fun() -> rpc(Peer, net_adm, ping, [node()]) end, pong), - cets_test_wait:wait_until(fun() -> rpc(node(), net_adm, ping, [Node]) end, pong). - -disconnect_node(RPCNode, DisconnectNode) -> - rpc(RPCNode, erlang, disconnect_node, [DisconnectNode]). - -disconnect_node_by_name(Config, Id) -> - Peer = maps:get(Id, proplists:get_value(peers, Config)), - Node = maps:get(Id, proplists:get_value(nodes, Config)), - %% We could need to retry to disconnect, if the local node is currently trying to establish a connection - %% with Node2 (could be triggered by the previous tests) - F = fun() -> - disconnect_node(Peer, node()), - lists:member(Node, nodes()) - end, - cets_test_wait:wait_until(F, false). 
- not_leader(Leader, Other, Leader) -> Other; not_leader(Other, Leader, Leader) -> diff --git a/test/cets_test_node.erl b/test/cets_test_node.erl new file mode 100644 index 0000000..1f7421b --- /dev/null +++ b/test/cets_test_node.erl @@ -0,0 +1,37 @@ +-module(cets_test_node). +-export([ + block_node/2, + reconnect_node/2, + disconnect_node/2, + disconnect_node_by_name/2 +]). + +-import(cets_test_rpc, [rpc/4]). + +%% Disconnect node until manually connected +block_node(Node, Peer) when is_atom(Node), is_pid(Peer) -> + rpc(Peer, erlang, set_cookie, [node(), invalid_cookie]), + disconnect_node(Peer, node()), + %% Wait till node() is notified about the disconnect + cets_test_wait:wait_until(fun() -> rpc(Peer, net_adm, ping, [node()]) end, pang), + cets_test_wait:wait_until(fun() -> rpc(node(), net_adm, ping, [Node]) end, pang). + +reconnect_node(Node, Peer) when is_atom(Node), is_pid(Peer) -> + rpc(Peer, erlang, set_cookie, [node(), erlang:get_cookie()]), + %% Very rarely it could return pang + cets_test_wait:wait_until(fun() -> rpc(Peer, net_adm, ping, [node()]) end, pong), + cets_test_wait:wait_until(fun() -> rpc(node(), net_adm, ping, [Node]) end, pong). + +disconnect_node(RPCNode, DisconnectNode) -> + rpc(RPCNode, erlang, disconnect_node, [DisconnectNode]). + +disconnect_node_by_name(Config, Id) -> + Peer = maps:get(Id, proplists:get_value(peers, Config)), + Node = maps:get(Id, proplists:get_value(nodes, Config)), + %% We could need to retry to disconnect, if the local node is currently trying to establish a connection + %% with Node2 (could be triggered by the previous tests) + F = fun() -> + disconnect_node(Peer, node()), + lists:member(Node, nodes()) + end, + cets_test_wait:wait_until(F, false). diff --git a/test/cets_test_rpc.erl b/test/cets_test_rpc.erl new file mode 100644 index 0000000..971b350 --- /dev/null +++ b/test/cets_test_rpc.erl @@ -0,0 +1,52 @@ +-module(cets_test_rpc). 
+-export([ + rpc/4, + insert/3, + insert_many/3, + delete/3, + delete_request/3, + delete_many/3, + dump/2, + other_nodes/2, + join/4 +]). + +%% Apply function using rpc or peer module +rpc(Peer, M, F, Args) when is_pid(Peer) -> + case peer:call(Peer, M, F, Args) of + {badrpc, Error} -> + ct:fail({badrpc, Error}); + Other -> + Other + end; +rpc(Node, M, F, Args) when is_atom(Node) -> + case rpc:call(Node, M, F, Args) of + {badrpc, Error} -> + ct:fail({badrpc, Error}); + Other -> + Other + end. + +insert(Node, Tab, Rec) -> + rpc(Node, cets, insert, [Tab, Rec]). + +insert_many(Node, Tab, Records) -> + rpc(Node, cets, insert_many, [Tab, Records]). + +delete(Node, Tab, Key) -> + rpc(Node, cets, delete, [Tab, Key]). + +delete_request(Node, Tab, Key) -> + rpc(Node, cets, delete_request, [Tab, Key]). + +delete_many(Node, Tab, Keys) -> + rpc(Node, cets, delete_many, [Tab, Keys]). + +dump(Node, Tab) -> + rpc(Node, cets, dump, [Tab]). + +other_nodes(Node, Tab) -> + rpc(Node, cets, other_nodes, [Tab]). + +join(Node1, Tab, Pid1, Pid2) -> + rpc(Node1, cets_join, join, [lock1, #{table => Tab}, Pid1, Pid2]). From 72cf27b787bd4966f4deed5893d32e7079e85dab Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 12:51:18 +0100 Subject: [PATCH 02/30] Add cets_test_setup helper --- test/cets_SUITE.erl | 113 ++++++--------------------------------- test/cets_test_setup.erl | 99 ++++++++++++++++++++++++++++++++++ test/cets_test_wait.erl | 22 ++++++++ 3 files changed, 138 insertions(+), 96 deletions(-) create mode 100644 test/cets_test_setup.erl diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index edd4187..0bc7870 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -31,6 +31,18 @@ join/4 ]). +-import(cets_test_setup, [ + start/2, + start_local/1, + start_local/2, + start_disco/2, + start_simple_disco/0 +]). + +-import(cets_test_wait, [ + wait_for_down/1 +]). + all() -> [ {group, cets}, @@ -254,7 +266,7 @@ cets_seq_no_log_cases() -> ]. 
init_per_suite(Config) -> - init_cleanup_table(), + cets_test_setup:init_cleanup_table(), Names = [ct2, ct3, ct4, ct5, ct6, ct7], {Nodes, Peers} = lists:unzip([cets_test_peer:start_node(N) || N <- Names]), [ @@ -288,7 +300,7 @@ init_per_testcase_generic(Name, Config) -> [{testcase, Name} | Config]. end_per_testcase(_, _Config) -> - wait_for_cleanup(), + cets_test_setup:wait_for_cleanup(), ok. %% Modules that use a multiline LOG_ macro @@ -1317,7 +1329,7 @@ pause_on_remote_node_crashes(Config) -> Tab = make_name(Config), {ok, Pid1} = start(Node1, Tab), {ok, Pid2} = start(Node2, Tab), - ok = rpc(Node2, ?MODULE, mock_pause_on_remote_node_failing, []), + ok = rpc(Node2, cets_test_setup, mock_pause_on_remote_node_failing, []), try {error, {task_failed, @@ -2876,14 +2888,14 @@ cets_ping_non_existing_node(_Config) -> pang = cets_ping:ping('mongooseim@non_existing_host'). pre_connect_fails_on_our_node(_Config) -> - mock_epmd(), + cets_test_setup:mock_epmd(), %% We would fail to connect to the remote EPMD but we would get an IP pang = cets_ping:ping('mongooseim@resolvabletobadip'), meck:unload(). pre_connect_fails_on_one_of_the_nodes(Config) -> #{ct2 := Node2} = proplists:get_value(nodes, Config), - mock_epmd(), + cets_test_setup:mock_epmd(), %% We would get pong on Node2, but would fail an RPC to our hode pang = rpc(Node2, cets_ping, ping, ['cetsnode1@localhost']), History = meck:history(erl_epmd), @@ -2958,76 +2970,6 @@ start_link_local(Name, Opts) -> schedule_cleanup(Pid), {ok, Pid}. -start_local(Name) -> - start_local(Name, #{}). - -start_local(Name, Opts) -> - catch cets:stop(Name), - wait_for_name_to_be_free(node(), Name), - {ok, Pid} = cets:start(Name, Opts), - schedule_cleanup(Pid), - {ok, Pid}. 
- -schedule_cleanup(Pid) -> - Me = self(), - Cleaner = proc_lib:spawn(fun() -> - Ref = erlang:monitor(process, Me), - receive - {'DOWN', Ref, process, Me, _} -> - %% We do an RPC call, because erlang distribution - %% could not be always reliable (because we test netsplits) - rpc(cets_test_peer:node_to_peer(node(Pid)), cets, stop, [Pid]), - ets:delete_object(cleanup_table, {Me, self()}) - end - end), - ets:insert(cleanup_table, {Me, Cleaner}). - -init_cleanup_table() -> - spawn(fun() -> - ets:new(cleanup_table, [named_table, public, bag]), - timer:sleep(infinity) - end). - -%% schedule_cleanup is async, so this function is waiting for it to finish -wait_for_cleanup() -> - [ - wait_for_down(Cleaner) - || {Owner, Cleaner} <- ets:tab2list(cleanup_table), not is_process_alive(Owner) - ]. - -start(Node, Tab) -> - catch rpc(Node, cets, stop, [Tab]), - wait_for_name_to_be_free(Node, Tab), - {ok, Pid} = rpc(Node, cets, start, [Tab, #{}]), - schedule_cleanup(Pid), - {ok, Pid}. - -start_disco(Node, Opts) -> - case Opts of - #{name := Name} -> - catch rpc(Node, cets, stop, [Name]), - wait_for_name_to_be_free(Node, Name); - _ -> - ok - end, - {ok, Pid} = rpc(Node, cets_discovery, start, [Opts]), - schedule_cleanup(Pid), - Pid. - -start_simple_disco() -> - F = fun(State) -> - {{ok, []}, State} - end, - {ok, Pid} = cets_discovery:start_link(#{ - backend_module => cets_discovery_fun, get_nodes_fn => F - }), - Pid. - -wait_for_name_to_be_free(Node, Name) -> - %% Wait for the old process to be killed by the cleaner in schedule_cleanup. - %% Cleaner is fast, but not instant. - cets_test_wait:wait_until(fun() -> rpc(Node, erlang, whereis, [Name]) end, undefined). - receive_message(M) -> receive M -> ok @@ -3213,13 +3155,6 @@ get_message_queue_length(Pid) -> {message_queue_len, Len} = erlang:process_info(Pid, message_queue_len), Len. 
-wait_for_down(Pid) -> - Mon = erlang:monitor(process, Pid), - receive - {'DOWN', Mon, process, Pid, Reason} -> Reason - after 5000 -> ct:fail({wait_for_down_timeout, Pid}) - end. - not_leader(Leader, Other, Leader) -> Other; not_leader(Other, Leader, Leader) -> @@ -3312,20 +3247,6 @@ make_signalling_process() -> end end). -mock_epmd() -> - meck:new(erl_epmd, [passthrough, unstick]), - meck:expect(erl_epmd, address_please, fun - ("cetsnode1", "localhost", inet) -> {ok, {192, 168, 100, 134}}; - (Name, Host, Family) -> meck:passthrough([Name, Host, Family]) - end). - -mock_pause_on_remote_node_failing() -> - meck:new(cets_join, [passthrough, no_link]), - meck:expect(cets_join, pause_on_remote_node, fun(_JoinerPid, _AllPids) -> - error(mock_pause_on_remote_node_failing) - end), - ok. - %% Fails if List has duplicates assert_unique(List) -> ?assertEqual([], List -- lists:usort(List)), diff --git a/test/cets_test_setup.erl b/test/cets_test_setup.erl new file mode 100644 index 0000000..45afae7 --- /dev/null +++ b/test/cets_test_setup.erl @@ -0,0 +1,99 @@ +-module(cets_test_setup). +-export([ + mock_epmd/0, + mock_pause_on_remote_node_failing/0 +]). + +-export([ + start_local/1, + start_local/2, + start/2, + start_disco/2, + start_simple_disco/0 +]). + +-export([ + init_cleanup_table/0, + wait_for_cleanup/0 +]). + +-import(cets_test_rpc, [rpc/4]). + +mock_epmd() -> + meck:new(erl_epmd, [passthrough, unstick]), + meck:expect(erl_epmd, address_please, fun + ("cetsnode1", "localhost", inet) -> {ok, {192, 168, 100, 134}}; + (Name, Host, Family) -> meck:passthrough([Name, Host, Family]) + end). + +mock_pause_on_remote_node_failing() -> + meck:new(cets_join, [passthrough, no_link]), + meck:expect(cets_join, pause_on_remote_node, fun(_JoinerPid, _AllPids) -> + error(mock_pause_on_remote_node_failing) + end), + ok. + +start_local(Name) -> + start_local(Name, #{}). 
+ +start_local(Name, Opts) -> + catch cets:stop(Name), + cets_test_wait:wait_for_name_to_be_free(node(), Name), + {ok, Pid} = cets:start(Name, Opts), + schedule_cleanup(Pid), + {ok, Pid}. + +start(Node, Tab) -> + catch rpc(Node, cets, stop, [Tab]), + cets_test_wait:wait_for_name_to_be_free(Node, Tab), + {ok, Pid} = rpc(Node, cets, start, [Tab, #{}]), + schedule_cleanup(Pid), + {ok, Pid}. + +start_disco(Node, Opts) -> + case Opts of + #{name := Name} -> + catch rpc(Node, cets, stop, [Name]), + cets_test_wait:wait_for_name_to_be_free(Node, Name); + _ -> + ok + end, + {ok, Pid} = rpc(Node, cets_discovery, start, [Opts]), + schedule_cleanup(Pid), + Pid. + +start_simple_disco() -> + F = fun(State) -> + {{ok, []}, State} + end, + {ok, Pid} = cets_discovery:start_link(#{ + backend_module => cets_discovery_fun, get_nodes_fn => F + }), + Pid. + +schedule_cleanup(Pid) -> + Me = self(), + Cleaner = proc_lib:spawn(fun() -> + Ref = erlang:monitor(process, Me), + receive + {'DOWN', Ref, process, Me, _} -> + %% We do an RPC call, because erlang distribution + %% could not be always reliable (because we test netsplits) + rpc(cets_test_peer:node_to_peer(node(Pid)), cets, stop, [Pid]), + ets:delete_object(cleanup_table, {Me, self()}) + end + end), + ets:insert(cleanup_table, {Me, Cleaner}). + +init_cleanup_table() -> + spawn(fun() -> + ets:new(cleanup_table, [named_table, public, bag]), + timer:sleep(infinity) + end). + +%% schedule_cleanup is async, so this function is waiting for it to finish +wait_for_cleanup() -> + [ + cets_test_wait:wait_for_down(Cleaner) + || {Owner, Cleaner} <- ets:tab2list(cleanup_table), not is_process_alive(Owner) + ]. diff --git a/test/cets_test_wait.erl b/test/cets_test_wait.erl index 7cb5b7b..b34e82c 100644 --- a/test/cets_test_wait.erl +++ b/test/cets_test_wait.erl @@ -1,6 +1,12 @@ -module(cets_test_wait). -export([wait_until/2]). +%% Helpers +-export([ + wait_for_name_to_be_free/2, + wait_for_down/1 +]). 
+ %% From mongoose_helper %% @doc Waits `TimeLeft` for `Fun` to return `ExpectedValue` @@ -72,3 +78,19 @@ wait_and_continue( time_left => TimeLeft - SleepTime, history => [FunResult | History] }). + +%% Helpers + +wait_for_name_to_be_free(Node, Name) -> + %% Wait for the old process to be killed by the cleaner in schedule_cleanup. + %% Cleaner is fast, but not instant. + cets_test_wait:wait_until( + fun() -> cets_test_rpc:rpc(Node, erlang, whereis, [Name]) end, undefined + ). + +wait_for_down(Pid) -> + Mon = erlang:monitor(process, Pid), + receive + {'DOWN', Mon, process, Pid, Reason} -> Reason + after 5000 -> ct:fail({wait_for_down_timeout, Pid}) + end. From d994f1a45311042dc7633847b3af9a6003778ed2 Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 13:08:33 +0100 Subject: [PATCH 03/30] Move setup functions into cets_test_setup --- test/cets_SUITE.erl | 163 ++++----------------------------------- test/cets_test_setup.erl | 158 ++++++++++++++++++++++++++++++++++++- test/cets_test_wait.erl | 21 ++++- 3 files changed, 192 insertions(+), 150 deletions(-) diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index 0bc7870..5e82b16 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -36,11 +36,24 @@ start_local/1, start_local/2, start_disco/2, - start_simple_disco/0 + start_simple_disco/0, + make_name/1, + make_name/2, + lock_name/1, + disco_name/1, + given_two_joined_tables/1, + given_two_joined_tables/2, + given_3_servers/1, + given_3_servers/2, + given_n_servers/3, + setup_two_nodes_and_discovery/1, + setup_two_nodes_and_discovery/2, + simulate_disco_restart/1 ]). -import(cets_test_wait, [ - wait_for_down/1 + wait_for_down/1, + wait_for_ready/2 ]). 
all() -> @@ -893,7 +906,7 @@ join_fails_before_send_dump_and_there_are_pending_remote_ops(Config) -> receive_message(before_send_dump_called_for_pid1), cets:insert_request(Pid1, {1}), %% Check that the remote_op has reached Pid2 message box - cets_test_wait:wait_until(fun() -> count_remote_ops_in_the_message_box(Pid2) end, 1), + cets_test_wait:wait_for_remote_ops_in_the_message_box(Pid2, 1), sys:resume(Pid2), %% Wait till remote_op is processed cets:ping(Pid2), @@ -2990,29 +3003,6 @@ flush_message(M) -> ok end. -make_name(Config) -> - make_name(Config, 1). - -make_name(Config, Num) when is_integer(Num) -> - Testcase = proplists:get_value(testcase, Config), - list_to_atom(atom_to_list(Testcase) ++ "_" ++ integer_to_list(Num)); -make_name(Config, Atom) when is_atom(Atom) -> - Testcase = proplists:get_value(testcase, Config), - list_to_atom(atom_to_list(Testcase) ++ "_" ++ atom_to_list(Atom)). - -lock_name(Config) -> - Testcase = proplists:get_value(testcase, Config), - list_to_atom(atom_to_list(Testcase) ++ "_lock"). - -disco_name(Config) -> - Testcase = proplists:get_value(testcase, Config), - list_to_atom(atom_to_list(Testcase) ++ "_disco"). - -count_remote_ops_in_the_message_box(Pid) -> - {messages, Messages} = erlang:process_info(Pid, messages), - Ops = [M || M <- Messages, element(1, M) =:= remote_op], - length(Ops). - set_join_ref(Pid, JoinRef) -> sys:replace_state(Pid, fun(#{join_ref := _} = State) -> State#{join_ref := JoinRef} end). @@ -3021,118 +3011,6 @@ set_other_servers(Pid, Servers) -> State#{other_servers := Servers} end). -given_two_joined_tables(Config) -> - given_two_joined_tables(Config, #{}). 
- -given_two_joined_tables(Config, Opts) -> - Tab1 = make_name(Config, 1), - Tab2 = make_name(Config, 2), - {ok, Pid1} = start_local(Tab1, Opts), - {ok, Pid2} = start_local(Tab2, Opts), - ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid2), - #{ - tab1 => Tab1, - tab2 => Tab2, - pid1 => Pid1, - pid2 => Pid2, - tabs => [Tab1, Tab2], - pids => [Pid1, Pid2] - }. - -given_3_servers(Config) -> - given_3_servers(Config, #{}). - -given_3_servers(Config, Opts) -> - given_n_servers(Config, 3, Opts). - -given_n_servers(Config, N, Opts) -> - Tabs = [make_name(Config, X) || X <- lists:seq(1, N)], - Pids = [ - begin - {ok, Pid} = start_local(Tab, Opts), - Pid - end - || Tab <- Tabs - ], - #{pids => Pids, tabs => Tabs}. - -setup_two_nodes_and_discovery(Config) -> - setup_two_nodes_and_discovery(Config, []). - -%% Flags: -%% - disco2 - start discovery on Node2 -%% - wait - call wait_for_ready/2 -setup_two_nodes_and_discovery(Config, Flags) -> - Me = self(), - Node1 = node(), - #{ct2 := Peer2} = proplists:get_value(peers, Config), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - disconnect_node_by_name(Config, ct2), - Tab = make_name(Config), - {ok, _Pid1} = start(Node1, Tab), - {ok, _Pid2} = start(Peer2, Tab), - F = fun(State) -> - case lists:member(notify_get_nodes, Flags) of - true -> - Me ! 
get_nodes; - false -> - ok - end, - {{ok, [Node1, Node2]}, State} - end, - DiscoName = disco_name(Config), - DiscoOpts = #{ - name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F - }, - Disco = start_disco(Node1, DiscoOpts), - %% Start Disco on second node (it is not always needed) - Res = - case lists:member(disco2, Flags) of - true -> - Disco2 = start_disco(Node2, DiscoOpts), - cets_discovery:add_table(Disco2, Tab), - #{disco2 => Disco2}; - false -> - #{} - end, - cets_discovery:add_table(Disco, Tab), - case lists:member(wait, Flags) of - true -> - wait_for_ready(Disco, 5000); - false -> - ok - end, - case lists:member(netsplit, Flags) of - true -> - %% Simulate a loss of connection between nodes - disconnect_node_by_name(Config, ct2); - false -> - ok - end, - Res#{ - disco_name => DiscoName, - disco_opts => DiscoOpts, - disco => Disco, - node1 => Node1, - node2 => Node2, - peer2 => Peer2 - }. - -simulate_disco_restart(#{ - disco_opts := DiscoOpts, - disco2 := Disco2, - node1 := Node1, - node2 := Node2, - peer2 := Peer2 -}) -> - %% Instead of restart the node, restart the process. It is enough to get - %% a new start_time. - disconnect_node(Peer2, Node1), - rpc(Peer2, cets, stop, [Disco2]), - %% We actually would not detect the case of us just stopping the remote disco - %% server. Because we use nodeup/nodedown to detect downs, not monitors. - _RestartedDisco2 = start_disco(Node2, DiscoOpts). - stopped_pid() -> %% Get a pid for a stopped process {Pid, Mon} = spawn_monitor(fun() -> ok end), @@ -3259,15 +3137,6 @@ make_process() -> end end). -wait_for_ready(Disco, Timeout) -> - try - ok = cets_discovery:wait_for_ready(Disco, Timeout) - catch - Class:Reason:Stacktrace -> - ct:pal("system_info: ~p", [cets_discovery:system_info(Disco)]), - erlang:raise(Class, Reason, Stacktrace) - end. 
- %% Overwrites nodedown timestamp for the Node in the discovery server state set_nodedown_timestamp(Disco, Node, NewTimestamp) -> sys:replace_state(Disco, fun(#{nodedown_timestamps := Map} = State) -> diff --git a/test/cets_test_setup.erl b/test/cets_test_setup.erl index 45afae7..c92cdee 100644 --- a/test/cets_test_setup.erl +++ b/test/cets_test_setup.erl @@ -4,6 +4,11 @@ mock_pause_on_remote_node_failing/0 ]). +-export([ + init_cleanup_table/0, + wait_for_cleanup/0 +]). + -export([ start_local/1, start_local/2, @@ -13,8 +18,27 @@ ]). -export([ - init_cleanup_table/0, - wait_for_cleanup/0 + make_name/1, + make_name/2, + lock_name/1, + disco_name/1 +]). + +-export([ + given_two_joined_tables/1, + given_two_joined_tables/2, + given_3_servers/1, + given_3_servers/2, + given_n_servers/3, + setup_two_nodes_and_discovery/1, + setup_two_nodes_and_discovery/2 +]). + +-export([simulate_disco_restart/1]). + +-import(cets_test_node, [ + disconnect_node/2, + disconnect_node_by_name/2 ]). -import(cets_test_rpc, [rpc/4]). @@ -97,3 +121,133 @@ wait_for_cleanup() -> cets_test_wait:wait_for_down(Cleaner) || {Owner, Cleaner} <- ets:tab2list(cleanup_table), not is_process_alive(Owner) ]. + +make_name(Config) -> + make_name(Config, 1). + +make_name(Config, Num) when is_integer(Num) -> + Testcase = proplists:get_value(testcase, Config), + list_to_atom(atom_to_list(Testcase) ++ "_" ++ integer_to_list(Num)); +make_name(Config, Atom) when is_atom(Atom) -> + Testcase = proplists:get_value(testcase, Config), + list_to_atom(atom_to_list(Testcase) ++ "_" ++ atom_to_list(Atom)). + +lock_name(Config) -> + Testcase = proplists:get_value(testcase, Config), + list_to_atom(atom_to_list(Testcase) ++ "_lock"). + +disco_name(Config) -> + Testcase = proplists:get_value(testcase, Config), + list_to_atom(atom_to_list(Testcase) ++ "_disco"). + +given_two_joined_tables(Config) -> + given_two_joined_tables(Config, #{}). 
+ +given_two_joined_tables(Config, Opts) -> + Tab1 = make_name(Config, 1), + Tab2 = make_name(Config, 2), + {ok, Pid1} = start_local(Tab1, Opts), + {ok, Pid2} = start_local(Tab2, Opts), + ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid2), + #{ + tab1 => Tab1, + tab2 => Tab2, + pid1 => Pid1, + pid2 => Pid2, + tabs => [Tab1, Tab2], + pids => [Pid1, Pid2] + }. + +given_3_servers(Config) -> + given_3_servers(Config, #{}). + +given_3_servers(Config, Opts) -> + given_n_servers(Config, 3, Opts). + +given_n_servers(Config, N, Opts) -> + Tabs = [make_name(Config, X) || X <- lists:seq(1, N)], + Pids = [ + begin + {ok, Pid} = start_local(Tab, Opts), + Pid + end + || Tab <- Tabs + ], + #{pids => Pids, tabs => Tabs}. + +setup_two_nodes_and_discovery(Config) -> + setup_two_nodes_and_discovery(Config, []). + +%% Flags: +%% - disco2 - start discovery on Node2 +%% - wait - call wait_for_ready/2 +setup_two_nodes_and_discovery(Config, Flags) -> + Me = self(), + Node1 = node(), + #{ct2 := Peer2} = proplists:get_value(peers, Config), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + disconnect_node_by_name(Config, ct2), + Tab = make_name(Config), + {ok, _Pid1} = start(Node1, Tab), + {ok, _Pid2} = start(Peer2, Tab), + F = fun(State) -> + case lists:member(notify_get_nodes, Flags) of + true -> + Me ! 
get_nodes; + false -> + ok + end, + {{ok, [Node1, Node2]}, State} + end, + DiscoName = disco_name(Config), + DiscoOpts = #{ + name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F + }, + Disco = start_disco(Node1, DiscoOpts), + %% Start Disco on second node (it is not always needed) + Res = + case lists:member(disco2, Flags) of + true -> + Disco2 = start_disco(Node2, DiscoOpts), + cets_discovery:add_table(Disco2, Tab), + #{disco2 => Disco2}; + false -> + #{} + end, + cets_discovery:add_table(Disco, Tab), + case lists:member(wait, Flags) of + true -> + cets_test_wait:wait_for_ready(Disco, 5000); + false -> + ok + end, + case lists:member(netsplit, Flags) of + true -> + %% Simulate a loss of connection between nodes + disconnect_node_by_name(Config, ct2); + false -> + ok + end, + Res#{ + disco_name => DiscoName, + disco_opts => DiscoOpts, + disco => Disco, + node1 => Node1, + node2 => Node2, + peer2 => Peer2 + }. + +simulate_disco_restart(#{ + disco_opts := DiscoOpts, + disco2 := Disco2, + node1 := Node1, + node2 := Node2, + peer2 := Peer2 +}) -> + %% Instead of restart the node, restart the process. It is enough to get + %% a new start_time. + disconnect_node(Peer2, Node1), + rpc(Peer2, cets, stop, [Disco2]), + %% We actually would not detect the case of us just stopping the remote disco + %% server. Because we use nodeup/nodedown to detect downs, not monitors. + _RestartedDisco2 = start_disco(Node2, DiscoOpts). diff --git a/test/cets_test_wait.erl b/test/cets_test_wait.erl index b34e82c..42e03b4 100644 --- a/test/cets_test_wait.erl +++ b/test/cets_test_wait.erl @@ -4,7 +4,9 @@ %% Helpers -export([ wait_for_name_to_be_free/2, - wait_for_down/1 + wait_for_down/1, + wait_for_remote_ops_in_the_message_box/2, + wait_for_ready/2 ]). %% From mongoose_helper @@ -94,3 +96,20 @@ wait_for_down(Pid) -> {'DOWN', Mon, process, Pid, Reason} -> Reason after 5000 -> ct:fail({wait_for_down_timeout, Pid}) end. 
+ +wait_for_remote_ops_in_the_message_box(Pid, Count) -> + cets_test_wait:wait_until(fun() -> count_remote_ops_in_the_message_box(Pid) end, Count). + +count_remote_ops_in_the_message_box(Pid) -> + {messages, Messages} = erlang:process_info(Pid, messages), + Ops = [M || M <- Messages, element(1, M) =:= remote_op], + length(Ops). + +wait_for_ready(Disco, Timeout) -> + try + ok = cets_discovery:wait_for_ready(Disco, Timeout) + catch + Class:Reason:Stacktrace -> + ct:pal("system_info: ~p", [cets_discovery:system_info(Disco)]), + erlang:raise(Class, Reason, Stacktrace) + end. From ff5777d4febe0690339e252f8a1b9a4e51583fe5 Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 13:18:10 +0100 Subject: [PATCH 04/30] Move wait helpers to cets_test_wait --- test/cets_SUITE.erl | 42 +++++++-------------------------------- test/cets_test_helper.erl | 7 +++++++ test/cets_test_wait.erl | 33 +++++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 36 deletions(-) create mode 100644 test/cets_test_helper.erl diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index 5e82b16..a272a1f 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -53,7 +53,11 @@ -import(cets_test_wait, [ wait_for_down/1, - wait_for_ready/2 + wait_for_ready/2, + wait_for_disco_timestamp_to_appear/3, + wait_for_disco_timestamp_to_be_updated/4, + wait_for_unpaused/3, + wait_for_join_ref_to_match/2 ]). all() -> @@ -2842,7 +2846,7 @@ disco_nodeup_timestamp_is_updated_after_node_reconnects(Config) -> logger_debug_h:start(#{id => ?FUNCTION_NAME}), Setup = setup_two_nodes_and_discovery(Config, [wait, disco2]), #{disco := Disco, node2 := Node2} = Setup, - OldTimestamp = get_disco_timestamp(Disco, nodeup_timestamps, Node2), + OldTimestamp = cets_test_helper:get_disco_timestamp(Disco, nodeup_timestamps, Node2), disconnect_node_by_name(Config, ct2), wait_for_disco_timestamp_to_be_updated(Disco, nodeup_timestamps, Node2, OldTimestamp). 
@@ -2850,7 +2854,7 @@ disco_node_start_timestamp_is_updated_after_node_restarts(Config) -> logger_debug_h:start(#{id => ?FUNCTION_NAME}), Setup = setup_two_nodes_and_discovery(Config, [wait, disco2]), #{disco := Disco, node2 := Node2} = Setup, - OldTimestamp = get_disco_timestamp(Disco, node_start_timestamps, Node2), + OldTimestamp = cets_test_helper:get_disco_timestamp(Disco, node_start_timestamps, Node2), simulate_disco_restart(Setup), wait_for_disco_timestamp_to_be_updated(Disco, node_start_timestamps, Node2, OldTimestamp). @@ -3086,38 +3090,6 @@ test_data_for_duplicate_missing_table_in_status(Config) -> return_same(X) -> X. -wait_for_disco_timestamp_to_appear(Disco, MapName, NodeKey) -> - F = fun() -> - #{MapName := Map} = cets_discovery:system_info(Disco), - maps:is_key(NodeKey, Map) - end, - cets_test_wait:wait_until(F, true). - -wait_for_disco_timestamp_to_be_updated(Disco, MapName, NodeKey, OldTimestamp) -> - Cond = fun() -> - NewTimestamp = get_disco_timestamp(Disco, MapName, NodeKey), - NewTimestamp =/= OldTimestamp - end, - cets_test_wait:wait_until(Cond, true). - -wait_for_unpaused(Peer, Pid, PausedByPid) -> - Cond = fun() -> - {monitors, Info} = rpc(Peer, erlang, process_info, [Pid, monitors]), - lists:member({process, PausedByPid}, Info) - end, - cets_test_wait:wait_until(Cond, false). - -wait_for_join_ref_to_match(Pid, JoinRef) -> - Cond = fun() -> - maps:get(join_ref, cets:info(Pid)) - end, - cets_test_wait:wait_until(Cond, JoinRef). - -get_disco_timestamp(Disco, MapName, NodeKey) -> - Info = cets_discovery:system_info(Disco), - #{MapName := #{NodeKey := Timestamp}} = Info, - Timestamp. - make_signalling_process() -> proc_lib:spawn_link(fun() -> receive diff --git a/test/cets_test_helper.erl b/test/cets_test_helper.erl new file mode 100644 index 0000000..fb524ec --- /dev/null +++ b/test/cets_test_helper.erl @@ -0,0 +1,7 @@ +-module(cets_test_helper). +-export([get_disco_timestamp/3]). 
+ +get_disco_timestamp(Disco, MapName, NodeKey) -> + Info = cets_discovery:system_info(Disco), + #{MapName := #{NodeKey := Timestamp}} = Info, + Timestamp. diff --git a/test/cets_test_wait.erl b/test/cets_test_wait.erl index 42e03b4..c26ef9d 100644 --- a/test/cets_test_wait.erl +++ b/test/cets_test_wait.erl @@ -6,7 +6,11 @@ wait_for_name_to_be_free/2, wait_for_down/1, wait_for_remote_ops_in_the_message_box/2, - wait_for_ready/2 + wait_for_ready/2, + wait_for_disco_timestamp_to_appear/3, + wait_for_disco_timestamp_to_be_updated/4, + wait_for_unpaused/3, + wait_for_join_ref_to_match/2 ]). %% From mongoose_helper @@ -113,3 +117,30 @@ wait_for_ready(Disco, Timeout) -> ct:pal("system_info: ~p", [cets_discovery:system_info(Disco)]), erlang:raise(Class, Reason, Stacktrace) end. + +wait_for_disco_timestamp_to_appear(Disco, MapName, NodeKey) -> + F = fun() -> + #{MapName := Map} = cets_discovery:system_info(Disco), + maps:is_key(NodeKey, Map) + end, + cets_test_wait:wait_until(F, true). + +wait_for_disco_timestamp_to_be_updated(Disco, MapName, NodeKey, OldTimestamp) -> + Cond = fun() -> + NewTimestamp = cets_test_helper:get_disco_timestamp(Disco, MapName, NodeKey), + NewTimestamp =/= OldTimestamp + end, + cets_test_wait:wait_until(Cond, true). + +wait_for_unpaused(Peer, Pid, PausedByPid) -> + Cond = fun() -> + {monitors, Info} = cets_test_rpc:rpc(Peer, erlang, process_info, [Pid, monitors]), + lists:member({process, PausedByPid}, Info) + end, + cets_test_wait:wait_until(Cond, false). + +wait_for_join_ref_to_match(Pid, JoinRef) -> + Cond = fun() -> + maps:get(join_ref, cets:info(Pid)) + end, + cets_test_wait:wait_until(Cond, JoinRef). 
From 4445987ec1a5549223aaee5522fa7e31d81079a4 Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 13:22:04 +0100 Subject: [PATCH 05/30] Move wait_till_test_stage/wait_till_message_queue_length into cets_test_wait --- test/cets_SUITE.erl | 14 +++----------- test/cets_test_wait.erl | 18 +++++++++++++++++- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index a272a1f..3828c8e 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -57,7 +57,9 @@ wait_for_disco_timestamp_to_appear/3, wait_for_disco_timestamp_to_be_updated/4, wait_for_unpaused/3, - wait_for_join_ref_to_match/2 + wait_for_join_ref_to_match/2, + wait_till_test_stage/2, + wait_till_message_queue_length/2 ]). all() -> @@ -3027,16 +3029,6 @@ get_pd(Pid, Key) -> {dictionary, Dict} = erlang:process_info(Pid, dictionary), proplists:get_value(Key, Dict). -wait_till_test_stage(Pid, Stage) -> - cets_test_wait:wait_until(fun() -> get_pd(Pid, test_stage) end, Stage). - -wait_till_message_queue_length(Pid, Len) -> - cets_test_wait:wait_until(fun() -> get_message_queue_length(Pid) end, Len). - -get_message_queue_length(Pid) -> - {message_queue_len, Len} = erlang:process_info(Pid, message_queue_len), - Len. - not_leader(Leader, Other, Leader) -> Other; not_leader(Other, Leader, Leader) -> diff --git a/test/cets_test_wait.erl b/test/cets_test_wait.erl index c26ef9d..4e29344 100644 --- a/test/cets_test_wait.erl +++ b/test/cets_test_wait.erl @@ -10,7 +10,9 @@ wait_for_disco_timestamp_to_appear/3, wait_for_disco_timestamp_to_be_updated/4, wait_for_unpaused/3, - wait_for_join_ref_to_match/2 + wait_for_join_ref_to_match/2, + wait_till_test_stage/2, + wait_till_message_queue_length/2 ]). %% From mongoose_helper @@ -144,3 +146,17 @@ wait_for_join_ref_to_match(Pid, JoinRef) -> maps:get(join_ref, cets:info(Pid)) end, cets_test_wait:wait_until(Cond, JoinRef). 
+ +get_pd(Pid, Key) -> + {dictionary, Dict} = erlang:process_info(Pid, dictionary), + proplists:get_value(Key, Dict). + +wait_till_test_stage(Pid, Stage) -> + cets_test_wait:wait_until(fun() -> get_pd(Pid, test_stage) end, Stage). + +wait_till_message_queue_length(Pid, Len) -> + cets_test_wait:wait_until(fun() -> get_message_queue_length(Pid) end, Len). + +get_message_queue_length(Pid) -> + {message_queue_len, Len} = erlang:process_info(Pid, message_queue_len), + Len. From c4f5a004fb0e9844b84d85848c2ee63ea5fd7dee Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 13:27:33 +0100 Subject: [PATCH 06/30] Create cets_test_receive helper --- test/cets_SUITE.erl | 79 +++++++++++--------------------------- test/cets_test_receive.erl | 47 +++++++++++++++++++++++ 2 files changed, 69 insertions(+), 57 deletions(-) create mode 100644 test/cets_test_receive.erl diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index 3828c8e..189212d 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -62,6 +62,14 @@ wait_till_message_queue_length/2 ]). +-import(cets_test_receive, [ + receive_message/1, + receive_message_with_arg/1, + flush_message/1, + receive_all_logs/1, + assert_nothing_is_logged/2 +]). + all() -> [ {group, cets}, @@ -2965,14 +2973,6 @@ ping_pairs_returns_earlier(Config) -> %% Helper functions -receive_all_logs(Id) -> - receive - {log, Id, Log} -> - [Log | receive_all_logs(Id)] - after 100 -> - [] - end. - still_works(Pid) -> pong = cets:ping(Pid), %% The server works fine @@ -2989,26 +2989,6 @@ start_link_local(Name, Opts) -> schedule_cleanup(Pid), {ok, Pid}. -receive_message(M) -> - receive - M -> ok - after 5000 -> error({receive_message_timeout, M}) - end. - -receive_message_with_arg(Tag) -> - receive - {Tag, Arg} -> Arg - after 5000 -> error({receive_message_with_arg_timeout, Tag}) - end. - -flush_message(M) -> - receive - M -> - flush_message(M) - after 0 -> - ok - end. 
- set_join_ref(Pid, JoinRef) -> sys:replace_state(Pid, fun(#{join_ref := _} = State) -> State#{join_ref := JoinRef} end). @@ -3017,6 +2997,12 @@ set_other_servers(Pid, Servers) -> State#{other_servers := Servers} end). +%% Overwrites nodedown timestamp for the Node in the discovery server state +set_nodedown_timestamp(Disco, Node, NewTimestamp) -> + sys:replace_state(Disco, fun(#{nodedown_timestamps := Map} = State) -> + State#{nodedown_timestamps := maps:put(Node, NewTimestamp, Map)} + end). + stopped_pid() -> %% Get a pid for a stopped process {Pid, Mon} = spawn_monitor(fun() -> ok end), @@ -3025,15 +3011,6 @@ stopped_pid() -> end, Pid. -get_pd(Pid, Key) -> - {dictionary, Dict} = erlang:process_info(Pid, dictionary), - proplists:get_value(Key, Dict). - -not_leader(Leader, Other, Leader) -> - Other; -not_leader(Other, Leader, Leader) -> - Other. - bad_node_pid() -> binary_to_term(bad_node_pid_binary()). @@ -3042,16 +3019,10 @@ bad_node_pid_binary() -> <<131, 88, 100, 0, 17, 98, 97, 100, 110, 111, 100, 101, 64, 108, 111, 99, 97, 108, 104, 111, 115, 116, 0, 0, 0, 90, 0, 0, 0, 0, 100, 206, 70, 92>>. -assert_nothing_is_logged(LogHandlerId, LogRef) -> - receive - {log, LogHandlerId, #{ - level := Level, - msg := {report, #{log_ref := LogRef}} - }} when Level =:= warning; Level =:= error -> - ct:fail(got_logging_but_should_not) - after 0 -> - ok - end. +%% Fails if List has duplicates +assert_unique(List) -> + ?assertEqual([], List -- lists:usort(List)), + List. send_join_start_back_and_wait_for_continue_joining() -> Me = self(), @@ -3089,11 +3060,6 @@ make_signalling_process() -> end end). -%% Fails if List has duplicates -assert_unique(List) -> - ?assertEqual([], List -- lists:usort(List)), - List. - make_process() -> proc_lib:spawn(fun() -> receive @@ -3101,8 +3067,7 @@ make_process() -> end end). 
-%% Overwrites nodedown timestamp for the Node in the discovery server state -set_nodedown_timestamp(Disco, Node, NewTimestamp) -> - sys:replace_state(Disco, fun(#{nodedown_timestamps := Map} = State) -> - State#{nodedown_timestamps := maps:put(Node, NewTimestamp, Map)} - end). +not_leader(Leader, Other, Leader) -> + Other; +not_leader(Other, Leader, Leader) -> + Other. diff --git a/test/cets_test_receive.erl b/test/cets_test_receive.erl new file mode 100644 index 0000000..6fd2116 --- /dev/null +++ b/test/cets_test_receive.erl @@ -0,0 +1,47 @@ +-module(cets_test_receive). +-export([ + receive_message/1, + receive_message_with_arg/1, + flush_message/1, + receive_all_logs/1, + assert_nothing_is_logged/2 +]). + +receive_message(M) -> + receive + M -> ok + after 5000 -> error({receive_message_timeout, M}) + end. + +receive_message_with_arg(Tag) -> + receive + {Tag, Arg} -> Arg + after 5000 -> error({receive_message_with_arg_timeout, Tag}) + end. + +flush_message(M) -> + receive + M -> + flush_message(M) + after 0 -> + ok + end. + +receive_all_logs(Id) -> + receive + {log, Id, Log} -> + [Log | receive_all_logs(Id)] + after 100 -> + [] + end. + +assert_nothing_is_logged(LogHandlerId, LogRef) -> + receive + {log, LogHandlerId, #{ + level := Level, + msg := {report, #{log_ref := LogRef}} + }} when Level =:= warning; Level =:= error -> + ct:fail(got_logging_but_should_not) + after 0 -> + ok + end. From 616243c85b3cb2eaab2c66fbc8a7d5dc21388e0d Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 13:32:48 +0100 Subject: [PATCH 07/30] Move make_process() into cets_test_setup --- test/cets_SUITE.erl | 21 ++++++--------------- test/cets_test_setup.erl | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index 189212d..c46cbee 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -51,6 +51,11 @@ simulate_disco_restart/1 ]). 
+-import(cets_test_setup, [ + make_signalling_process/0, + make_process/0 +]). + -import(cets_test_wait, [ wait_for_down/1, wait_for_ready/2, @@ -1332,7 +1337,7 @@ ignore_send_dump_received_when_unpaused(Config) -> pause_on_remote_node_returns_if_monitor_process_dies(Config) -> JoinPid = make_process(), #{ct2 := Node2} = proplists:get_value(nodes, Config), - AllPids = [rpc(Node2, ?MODULE, make_process, [])], + AllPids = [rpc(Node2, cets_test_setup, make_process, [])], TestPid = proc_lib:spawn(fun() -> %% Would block cets_join:pause_on_remote_node(JoinPid, AllPids) @@ -3053,20 +3058,6 @@ test_data_for_duplicate_missing_table_in_status(Config) -> return_same(X) -> X. -make_signalling_process() -> - proc_lib:spawn_link(fun() -> - receive - stop -> ok - end - end). - -make_process() -> - proc_lib:spawn(fun() -> - receive - stop -> stop - end - end). - not_leader(Leader, Other, Leader) -> Other; not_leader(Other, Leader, Leader) -> diff --git a/test/cets_test_setup.erl b/test/cets_test_setup.erl index c92cdee..c0b8a9f 100644 --- a/test/cets_test_setup.erl +++ b/test/cets_test_setup.erl @@ -36,6 +36,11 @@ -export([simulate_disco_restart/1]). +-export([ + make_signalling_process/0, + make_process/0 +]). + -import(cets_test_node, [ disconnect_node/2, disconnect_node_by_name/2 @@ -251,3 +256,17 @@ simulate_disco_restart(#{ %% We actually would not detect the case of us just stopping the remote disco %% server. Because we use nodeup/nodedown to detect downs, not monitors. _RestartedDisco2 = start_disco(Node2, DiscoOpts). + +make_signalling_process() -> + proc_lib:spawn_link(fun() -> + receive + stop -> ok + end + end). + +make_process() -> + proc_lib:spawn(fun() -> + receive + stop -> stop + end + end). 
From 2c9130ec7c2d0bc369c5d060c37406e267af0976 Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 14:14:39 +0100 Subject: [PATCH 08/30] Move test cases into cets_disco_SUITE --- test/cets_SUITE.erl | 188 +--------------------------- test/cets_disco_SUITE.erl | 257 ++++++++++++++++++++++++++++++++++++++ test/cets_test_helper.erl | 12 +- test/cets_test_node.erl | 37 ------ test/cets_test_peer.erl | 56 +++++++++ test/cets_test_setup.erl | 10 +- 6 files changed, 338 insertions(+), 222 deletions(-) create mode 100644 test/cets_disco_SUITE.erl delete mode 100644 test/cets_test_node.erl diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index c46cbee..86286e8 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -12,11 +12,10 @@ -compile([export_all, nowarn_export_all]). --import(cets_test_node, [ +-import(cets_test_peer, [ block_node/2, reconnect_node/2, - disconnect_node/2, - disconnect_node_by_name/2 + disconnect_node/2 ]). -import(cets_test_rpc, [ @@ -45,10 +44,7 @@ given_two_joined_tables/2, given_3_servers/1, given_3_servers/2, - given_n_servers/3, - setup_two_nodes_and_discovery/1, - setup_two_nodes_and_discovery/2, - simulate_disco_restart/1 + given_n_servers/3 ]). -import(cets_test_setup, [ @@ -59,8 +55,6 @@ -import(cets_test_wait, [ wait_for_down/1, wait_for_ready/2, - wait_for_disco_timestamp_to_appear/3, - wait_for_disco_timestamp_to_be_updated/4, wait_for_unpaused/3, wait_for_join_ref_to_match/2, wait_till_test_stage/2, @@ -70,11 +64,12 @@ -import(cets_test_receive, [ receive_message/1, receive_message_with_arg/1, - flush_message/1, receive_all_logs/1, assert_nothing_is_logged/2 ]). +-import(cets_test_helper, [assert_unique/1]). 
+ all() -> [ {group, cets}, @@ -265,16 +260,6 @@ seq_cases() -> logging_when_failing_join_with_disco, cets_ping_all_returns_when_ping_crashes, join_interrupted_when_ping_crashes, - disco_logs_nodeup, - disco_logs_nodedown, - disco_logs_nodeup_after_downtime, - disco_logs_node_reconnects_after_downtime, - disco_node_up_timestamp_is_remembered, - disco_node_down_timestamp_is_remembered, - disco_nodeup_timestamp_is_updated_after_node_reconnects, - disco_node_start_timestamp_is_updated_after_node_restarts, - disco_late_pang_result_arrives_after_node_went_up, - disco_nodeup_triggers_check_and_get_nodes, ping_pairs_returns_pongs, ping_pairs_returns_earlier, pre_connect_fails_on_our_node, @@ -288,11 +273,6 @@ cets_seq_no_log_cases() -> [ join_interrupted_when_ping_crashes, node_down_history_is_updated_when_netsplit_happens, - disco_node_up_timestamp_is_remembered, - disco_node_down_timestamp_is_remembered, - disco_nodeup_timestamp_is_updated_after_node_reconnects, - disco_node_start_timestamp_is_updated_after_node_restarts, - disco_late_pang_result_arrives_after_node_went_up, send_check_servers_is_called_before_last_server_got_dump, remote_ops_are_not_sent_before_last_server_got_dump ]. @@ -308,6 +288,7 @@ init_per_suite(Config) -> ]. end_per_suite(Config) -> + cets_test_setup:remove_cleanup_table(), Config. init_per_group(Group, Config) when Group == cets_seq_no_log; Group == cets_no_log -> @@ -2760,158 +2741,6 @@ node_down_history_is_updated_when_netsplit_happens(Config) -> cets:stop(Pid5) end. -disco_logs_nodeup(Config) -> - logger_debug_h:start(#{id => ?FUNCTION_NAME}), - #{disco := Disco, node2 := Node2} = setup_two_nodes_and_discovery(Config), - %% There could be several disco processes still running from the previous tests, - %% filter out logs by pid. 
- receive - {log, ?FUNCTION_NAME, #{ - level := warning, - meta := #{pid := Disco}, - msg := {report, #{what := nodeup, remote_node := Node2} = R} - }} = M -> - ?assert(is_integer(maps:get(connected_nodes, R)), M), - ?assert(is_integer(maps:get(time_since_startup_in_milliseconds, R)), M) - after 5000 -> - ct:fail(timeout) - end. - -disco_node_up_timestamp_is_remembered(Config) -> - #{disco := Disco, node2 := Node2} = setup_two_nodes_and_discovery(Config), - %% Check that nodeup is remembered - wait_for_disco_timestamp_to_appear(Disco, nodeup_timestamps, Node2). - -disco_logs_nodedown(Config) -> - logger_debug_h:start(#{id => ?FUNCTION_NAME}), - ok = net_kernel:monitor_nodes(true), - #{disco := Disco, node2 := Node2} = setup_two_nodes_and_discovery(Config, [wait, netsplit]), - receive_message({nodedown, Node2}), - receive - {log, ?FUNCTION_NAME, #{ - level := warning, - meta := #{pid := Disco}, - msg := {report, #{what := nodedown, remote_node := Node2} = R} - }} = M -> - ?assert(is_integer(maps:get(connected_nodes, R)), M), - ?assert(is_integer(maps:get(time_since_startup_in_milliseconds, R)), M), - ?assert(is_integer(maps:get(connected_millisecond_duration, R)), M) - after 5000 -> - ct:fail(timeout) - end. - -disco_node_down_timestamp_is_remembered(Config) -> - #{disco := Disco, node2 := Node2} = setup_two_nodes_and_discovery(Config, [wait, netsplit]), - %% Check that nodedown is remembered - wait_for_disco_timestamp_to_appear(Disco, nodedown_timestamps, Node2). - -disco_logs_nodeup_after_downtime(Config) -> - logger_debug_h:start(#{id => ?FUNCTION_NAME}), - #{disco := Disco, node2 := Node2} = setup_two_nodes_and_discovery(Config, [wait, netsplit]), - %% At this point cets_disco should reconnect nodes back automatically - %% after retry_type_to_timeout(after_nodedown) time. - %% We want to speed this up for tests though. - Disco ! check, - %% Receive a nodeup after the disconnect. 
- %% This nodeup should contain the downtime_millisecond_duration field - %% (initial nodeup should not contain this field). - receive - {log, ?FUNCTION_NAME, #{ - level := warning, - meta := #{pid := Disco}, - msg := - {report, - #{ - what := nodeup, - remote_node := Node2, - downtime_millisecond_duration := Downtime - } = R} - }} = M -> - ?assert(is_integer(maps:get(connected_nodes, R)), M), - ?assert(is_integer(Downtime), M) - after 5000 -> - ct:fail(timeout) - end. - -disco_logs_node_reconnects_after_downtime(Config) -> - logger_debug_h:start(#{id => ?FUNCTION_NAME}), - Setup = setup_two_nodes_and_discovery(Config, [wait, disco2]), - #{disco := Disco, node1 := Node1, node2 := Node2, peer2 := Peer2} = Setup, - %% Check that a start timestamp from a remote node is stored - Info = cets_discovery:system_info(Disco), - ?assertMatch(#{node_start_timestamps := #{Node2 := _}}, Info), - disconnect_node(Peer2, Node1), - receive - {log, ?FUNCTION_NAME, #{ - level := warning, - meta := #{pid := Disco}, - msg := - {report, #{ - what := node_reconnects, - start_time := StartTime, - remote_node := Node2 - }} - }} = M -> - ?assert(is_integer(StartTime), M) - after 5000 -> - ct:fail(timeout) - end. - -disco_nodeup_timestamp_is_updated_after_node_reconnects(Config) -> - logger_debug_h:start(#{id => ?FUNCTION_NAME}), - Setup = setup_two_nodes_and_discovery(Config, [wait, disco2]), - #{disco := Disco, node2 := Node2} = Setup, - OldTimestamp = cets_test_helper:get_disco_timestamp(Disco, nodeup_timestamps, Node2), - disconnect_node_by_name(Config, ct2), - wait_for_disco_timestamp_to_be_updated(Disco, nodeup_timestamps, Node2, OldTimestamp). 
- -disco_node_start_timestamp_is_updated_after_node_restarts(Config) -> - logger_debug_h:start(#{id => ?FUNCTION_NAME}), - Setup = setup_two_nodes_and_discovery(Config, [wait, disco2]), - #{disco := Disco, node2 := Node2} = Setup, - OldTimestamp = cets_test_helper:get_disco_timestamp(Disco, node_start_timestamps, Node2), - simulate_disco_restart(Setup), - wait_for_disco_timestamp_to_be_updated(Disco, node_start_timestamps, Node2, OldTimestamp). - -disco_late_pang_result_arrives_after_node_went_up(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - %% unavailable_nodes list contains nodes which have not responded to pings. - %% Ping is async though. - %% So, there could be the situation when the result of ping would be processed - %% after the node actually got connected. - meck:new(cets_ping, [passthrough]), - Me = self(), - meck:expect(cets_ping, send_ping_result, fun(Pid, Node, _PingResult) -> - %% Wait until Node is up - Cond = fun() -> lists:member(Node, nodes()) end, - cets_test_wait:wait_until(Cond, true), - Me ! send_ping_result_called, - %% Return pang to cets_discovery. - %% cets_join does not use send_ping_result function - %% and would receive pong and join correctly. - meck:passthrough([Pid, Node, pang]) - end), - try - %% setup_two_nodes_and_discovery would call disconnect_node/2 function - Setup = setup_two_nodes_and_discovery(Config, [wait, disco2]), - receive_message(send_ping_result_called), - #{disco_name := DiscoName} = Setup, - Status = cets_status:status(DiscoName), - %% Check that pang is ignored and unavailable_nodes list is empty. - ?assertMatch([], maps:get(unavailable_nodes, Status)), - ?assertMatch([Node1, Node2], maps:get(joined_nodes, Status)) - after - meck:unload() - end. - -disco_nodeup_triggers_check_and_get_nodes(Config) -> - Setup = setup_two_nodes_and_discovery(Config, [wait, notify_get_nodes]), - #{disco := Disco, node2 := Node2} = Setup, - flush_message(get_nodes), - Disco ! 
{nodeup, Node2}, - receive_message(get_nodes). - format_data_does_not_return_table_duplicates(Config) -> Res = cets_status:format_data(test_data_for_duplicate_missing_table_in_status(Config)), ?assertMatch(#{remote_unknown_tables := [], remote_nodes_with_missing_tables := []}, Res). @@ -3024,11 +2853,6 @@ bad_node_pid_binary() -> <<131, 88, 100, 0, 17, 98, 97, 100, 110, 111, 100, 101, 64, 108, 111, 99, 97, 108, 104, 111, 115, 116, 0, 0, 0, 90, 0, 0, 0, 0, 100, 206, 70, 92>>. -%% Fails if List has duplicates -assert_unique(List) -> - ?assertEqual([], List -- lists:usort(List)), - List. - send_join_start_back_and_wait_for_continue_joining() -> Me = self(), fun diff --git a/test/cets_disco_SUITE.erl b/test/cets_disco_SUITE.erl new file mode 100644 index 0000000..82023f3 --- /dev/null +++ b/test/cets_disco_SUITE.erl @@ -0,0 +1,257 @@ +-module(cets_disco_SUITE). +-include_lib("common_test/include/ct.hrl"). +-include_lib("eunit/include/eunit.hrl"). +-include_lib("kernel/include/logger.hrl"). + +-compile([export_all, nowarn_export_all]). + +-import(cets_test_setup, [ + setup_two_nodes_and_discovery/1, + setup_two_nodes_and_discovery/2, + simulate_disco_restart/1 +]). + +-import(cets_test_wait, [ + wait_for_disco_timestamp_to_appear/3, + wait_for_disco_timestamp_to_be_updated/4 +]). + +-import(cets_test_receive, [ + receive_message/1, + flush_message/1 +]). + +-import(cets_test_peer, [ + disconnect_node/2, + disconnect_node_by_name/2 +]). + +-import(cets_test_helper, [assert_unique/1]). + +all() -> + [ + {group, cets_seq}, + {group, cets_seq_no_log} + ]. + +groups() -> + %% Cases should have unique names, because we name CETS servers based on case names + [ + %% These tests actually simulate a netsplit on the distribution level. + %% Though, global's prevent_overlapping_partitions option starts kicking + %% all nodes from the cluster, so we have to be careful not to break other cases. + %% Setting prevent_overlapping_partitions=false on ct5 helps. 
+ {cets_seq, [sequence, {repeat_until_any_fail, 2}], assert_unique(seq_cases())}, + {cets_seq_no_log, [sequence, {repeat_until_any_fail, 2}], + assert_unique(cets_seq_no_log_cases())} + ]. + +seq_cases() -> + [ + disco_logs_nodeup, + disco_logs_nodedown, + disco_logs_nodeup_after_downtime, + disco_logs_node_reconnects_after_downtime, + disco_node_up_timestamp_is_remembered, + disco_node_down_timestamp_is_remembered, + disco_nodeup_timestamp_is_updated_after_node_reconnects, + disco_node_start_timestamp_is_updated_after_node_restarts, + disco_late_pang_result_arrives_after_node_went_up, + disco_nodeup_triggers_check_and_get_nodes + ]. + +cets_seq_no_log_cases() -> + [ + disco_node_up_timestamp_is_remembered, + disco_node_down_timestamp_is_remembered, + disco_nodeup_timestamp_is_updated_after_node_reconnects, + disco_node_start_timestamp_is_updated_after_node_restarts, + disco_late_pang_result_arrives_after_node_went_up + ]. + +init_per_suite(Config) -> + cets_test_setup:init_cleanup_table(), + cets_test_peer:start([ct2], Config). + +end_per_suite(Config) -> + cets_test_setup:remove_cleanup_table(), + cets_test_peer:stop(Config), + Config. + +init_per_group(Group, Config) when Group == cets_seq_no_log; Group == cets_no_log -> + [ok = logger:set_module_level(M, none) || M <- log_modules()], + Config; +init_per_group(_Group, Config) -> + Config. + +end_per_group(Group, Config) when Group == cets_seq_no_log; Group == cets_no_log -> + [ok = logger:unset_module_level(M) || M <- log_modules()], + Config; +end_per_group(_Group, Config) -> + Config. + +init_per_testcase(Name, Config) -> + init_per_testcase_generic(Name, Config). + +init_per_testcase_generic(Name, Config) -> + [{testcase, Name} | Config]. + +end_per_testcase(_, _Config) -> + cets_test_setup:wait_for_cleanup(), + ok. + +%% Modules that use a multiline LOG_ macro +log_modules() -> + [cets, cets_call, cets_long, cets_join, cets_discovery]. 
+ +disco_logs_nodeup(Config) -> + logger_debug_h:start(#{id => ?FUNCTION_NAME}), + #{disco := Disco, node2 := Node2} = setup_two_nodes_and_discovery(Config), + %% There could be several disco processes still running from the previous tests, + %% filter out logs by pid. + receive + {log, ?FUNCTION_NAME, #{ + level := warning, + meta := #{pid := Disco}, + msg := {report, #{what := nodeup, remote_node := Node2} = R} + }} = M -> + ?assert(is_integer(maps:get(connected_nodes, R)), M), + ?assert(is_integer(maps:get(time_since_startup_in_milliseconds, R)), M) + after 5000 -> + ct:fail(timeout) + end. + +disco_node_up_timestamp_is_remembered(Config) -> + #{disco := Disco, node2 := Node2} = setup_two_nodes_and_discovery(Config), + %% Check that nodeup is remembered + wait_for_disco_timestamp_to_appear(Disco, nodeup_timestamps, Node2). + +disco_logs_nodedown(Config) -> + logger_debug_h:start(#{id => ?FUNCTION_NAME}), + ok = net_kernel:monitor_nodes(true), + #{disco := Disco, node2 := Node2} = setup_two_nodes_and_discovery(Config, [wait, netsplit]), + receive_message({nodedown, Node2}), + receive + {log, ?FUNCTION_NAME, #{ + level := warning, + meta := #{pid := Disco}, + msg := {report, #{what := nodedown, remote_node := Node2} = R} + }} = M -> + ?assert(is_integer(maps:get(connected_nodes, R)), M), + ?assert(is_integer(maps:get(time_since_startup_in_milliseconds, R)), M), + ?assert(is_integer(maps:get(connected_millisecond_duration, R)), M) + after 5000 -> + ct:fail(timeout) + end. + +disco_node_down_timestamp_is_remembered(Config) -> + #{disco := Disco, node2 := Node2} = setup_two_nodes_and_discovery(Config, [wait, netsplit]), + %% Check that nodedown is remembered + wait_for_disco_timestamp_to_appear(Disco, nodedown_timestamps, Node2). 
+ +disco_logs_nodeup_after_downtime(Config) -> + logger_debug_h:start(#{id => ?FUNCTION_NAME}), + #{disco := Disco, node2 := Node2} = setup_two_nodes_and_discovery(Config, [wait, netsplit]), + %% At this point cets_disco should reconnect nodes back automatically + %% after retry_type_to_timeout(after_nodedown) time. + %% We want to speed this up for tests though. + Disco ! check, + %% Receive a nodeup after the disconnect. + %% This nodeup should contain the downtime_millisecond_duration field + %% (initial nodeup should not contain this field). + receive + {log, ?FUNCTION_NAME, #{ + level := warning, + meta := #{pid := Disco}, + msg := + {report, + #{ + what := nodeup, + remote_node := Node2, + downtime_millisecond_duration := Downtime + } = R} + }} = M -> + ?assert(is_integer(maps:get(connected_nodes, R)), M), + ?assert(is_integer(Downtime), M) + after 5000 -> + ct:fail(timeout) + end. + +disco_logs_node_reconnects_after_downtime(Config) -> + logger_debug_h:start(#{id => ?FUNCTION_NAME}), + Setup = setup_two_nodes_and_discovery(Config, [wait, disco2]), + #{disco := Disco, node1 := Node1, node2 := Node2, peer2 := Peer2} = Setup, + %% Check that a start timestamp from a remote node is stored + Info = cets_discovery:system_info(Disco), + ?assertMatch(#{node_start_timestamps := #{Node2 := _}}, Info), + disconnect_node(Peer2, Node1), + receive + {log, ?FUNCTION_NAME, #{ + level := warning, + meta := #{pid := Disco}, + msg := + {report, #{ + what := node_reconnects, + start_time := StartTime, + remote_node := Node2 + }} + }} = M -> + ?assert(is_integer(StartTime), M) + after 5000 -> + ct:fail(timeout) + end. 
+ +disco_nodeup_timestamp_is_updated_after_node_reconnects(Config) -> + logger_debug_h:start(#{id => ?FUNCTION_NAME}), + Setup = setup_two_nodes_and_discovery(Config, [wait, disco2]), + #{disco := Disco, node2 := Node2} = Setup, + OldTimestamp = cets_test_helper:get_disco_timestamp(Disco, nodeup_timestamps, Node2), + disconnect_node_by_name(Config, ct2), + wait_for_disco_timestamp_to_be_updated(Disco, nodeup_timestamps, Node2, OldTimestamp). + +disco_node_start_timestamp_is_updated_after_node_restarts(Config) -> + logger_debug_h:start(#{id => ?FUNCTION_NAME}), + Setup = setup_two_nodes_and_discovery(Config, [wait, disco2]), + #{disco := Disco, node2 := Node2} = Setup, + OldTimestamp = cets_test_helper:get_disco_timestamp(Disco, node_start_timestamps, Node2), + simulate_disco_restart(Setup), + wait_for_disco_timestamp_to_be_updated(Disco, node_start_timestamps, Node2, OldTimestamp). + +disco_late_pang_result_arrives_after_node_went_up(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + %% unavailable_nodes list contains nodes which have not responded to pings. + %% Ping is async though. + %% So, there could be the situation when the result of ping would be processed + %% after the node actually got connected. + meck:new(cets_ping, [passthrough]), + Me = self(), + meck:expect(cets_ping, send_ping_result, fun(Pid, Node, _PingResult) -> + %% Wait until Node is up + Cond = fun() -> lists:member(Node, nodes()) end, + cets_test_wait:wait_until(Cond, true), + Me ! send_ping_result_called, + %% Return pang to cets_discovery. + %% cets_join does not use send_ping_result function + %% and would receive pong and join correctly. 
+ meck:passthrough([Pid, Node, pang]) + end), + try + %% setup_two_nodes_and_discovery would call disconnect_node/2 function + Setup = setup_two_nodes_and_discovery(Config, [wait, disco2]), + receive_message(send_ping_result_called), + #{disco_name := DiscoName} = Setup, + Status = cets_status:status(DiscoName), + %% Check that pang is ignored and unavailable_nodes list is empty. + ?assertMatch([], maps:get(unavailable_nodes, Status)), + ?assertMatch([Node1, Node2], maps:get(joined_nodes, Status)) + after + meck:unload() + end. + +disco_nodeup_triggers_check_and_get_nodes(Config) -> + Setup = setup_two_nodes_and_discovery(Config, [wait, notify_get_nodes]), + #{disco := Disco, node2 := Node2} = Setup, + flush_message(get_nodes), + Disco ! {nodeup, Node2}, + receive_message(get_nodes). diff --git a/test/cets_test_helper.erl b/test/cets_test_helper.erl index fb524ec..144bf54 100644 --- a/test/cets_test_helper.erl +++ b/test/cets_test_helper.erl @@ -1,7 +1,17 @@ -module(cets_test_helper). --export([get_disco_timestamp/3]). +-include_lib("eunit/include/eunit.hrl"). + +-export([ + get_disco_timestamp/3, + assert_unique/1 +]). get_disco_timestamp(Disco, MapName, NodeKey) -> Info = cets_discovery:system_info(Disco), #{MapName := #{NodeKey := Timestamp}} = Info, Timestamp. + +%% Fails if List has duplicates +assert_unique(List) -> + ?assertEqual([], List -- lists:usort(List)), + List. diff --git a/test/cets_test_node.erl b/test/cets_test_node.erl deleted file mode 100644 index 1f7421b..0000000 --- a/test/cets_test_node.erl +++ /dev/null @@ -1,37 +0,0 @@ --module(cets_test_node). --export([ - block_node/2, - reconnect_node/2, - disconnect_node/2, - disconnect_node_by_name/2 -]). - --import(cets_test_rpc, [rpc/4]). 
- -%% Disconnect node until manually connected -block_node(Node, Peer) when is_atom(Node), is_pid(Peer) -> - rpc(Peer, erlang, set_cookie, [node(), invalid_cookie]), - disconnect_node(Peer, node()), - %% Wait till node() is notified about the disconnect - cets_test_wait:wait_until(fun() -> rpc(Peer, net_adm, ping, [node()]) end, pang), - cets_test_wait:wait_until(fun() -> rpc(node(), net_adm, ping, [Node]) end, pang). - -reconnect_node(Node, Peer) when is_atom(Node), is_pid(Peer) -> - rpc(Peer, erlang, set_cookie, [node(), erlang:get_cookie()]), - %% Very rarely it could return pang - cets_test_wait:wait_until(fun() -> rpc(Peer, net_adm, ping, [node()]) end, pong), - cets_test_wait:wait_until(fun() -> rpc(node(), net_adm, ping, [Node]) end, pong). - -disconnect_node(RPCNode, DisconnectNode) -> - rpc(RPCNode, erlang, disconnect_node, [DisconnectNode]). - -disconnect_node_by_name(Config, Id) -> - Peer = maps:get(Id, proplists:get_value(peers, Config)), - Node = maps:get(Id, proplists:get_value(nodes, Config)), - %% We could need to retry to disconnect, if the local node is currently trying to establish a connection - %% with Node2 (could be triggered by the previous tests) - F = fun() -> - disconnect_node(Peer, node()), - lists:member(Node, nodes()) - end, - cets_test_wait:wait_until(F, false). diff --git a/test/cets_test_peer.erl b/test/cets_test_peer.erl index 81f86f0..d821a67 100644 --- a/test/cets_test_peer.erl +++ b/test/cets_test_peer.erl @@ -1,10 +1,38 @@ -module(cets_test_peer). -export([ + start/2, + stop/1, start_node/1, node_to_peer/1 ]). + +-export([ + block_node/2, + reconnect_node/2, + disconnect_node/2, + disconnect_node_by_name/2 +]). + +-import(cets_test_rpc, [rpc/4]). + -include_lib("common_test/include/ct.hrl"). +start(Names, Config) -> + {Nodes, Peers} = lists:unzip([cets_test_peer:start_node(name(N)) || N <- Names]), + [ + {nodes, maps:from_list(lists:zip(Names, Nodes))}, + {peers, maps:from_list(lists:zip(Names, Peers))} + | Config + ]. 
+ +stop(Config) -> + Peers = proplists:get_value(peers, Config), + [peer:stop(Peer) || Peer <- maps:values(Peers)], + ok. + +name(Node) -> + list_to_atom(peer:random_name(atom_to_list(Node))). + start_node(Sname) -> {ok, Peer, Node} = ?CT_PEER(#{ name => Sname, connection => standard_io, args => extra_args(Sname) @@ -40,3 +68,31 @@ extra_args(X) when X == ct5; X == ct6; X == ct7 -> ["-kernel", "prevent_overlapping_partitions", "false"]; extra_args(_) -> "". + +%% Disconnect node until manually connected +block_node(Node, Peer) when is_atom(Node), is_pid(Peer) -> + rpc(Peer, erlang, set_cookie, [node(), invalid_cookie]), + disconnect_node(Peer, node()), + %% Wait till node() is notified about the disconnect + cets_test_wait:wait_until(fun() -> rpc(Peer, net_adm, ping, [node()]) end, pang), + cets_test_wait:wait_until(fun() -> rpc(node(), net_adm, ping, [Node]) end, pang). + +reconnect_node(Node, Peer) when is_atom(Node), is_pid(Peer) -> + rpc(Peer, erlang, set_cookie, [node(), erlang:get_cookie()]), + %% Very rarely it could return pang + cets_test_wait:wait_until(fun() -> rpc(Peer, net_adm, ping, [node()]) end, pong), + cets_test_wait:wait_until(fun() -> rpc(node(), net_adm, ping, [Node]) end, pong). + +disconnect_node(RPCNode, DisconnectNode) -> + rpc(RPCNode, erlang, disconnect_node, [DisconnectNode]). + +disconnect_node_by_name(Config, Id) -> + Peer = maps:get(Id, proplists:get_value(peers, Config)), + Node = maps:get(Id, proplists:get_value(nodes, Config)), + %% We could need to retry to disconnect, if the local node is currently trying to establish a connection + %% with Node2 (could be triggered by the previous tests) + F = fun() -> + disconnect_node(Peer, node()), + lists:member(Node, nodes()) + end, + cets_test_wait:wait_until(F, false). 
diff --git a/test/cets_test_setup.erl b/test/cets_test_setup.erl index c0b8a9f..3decf1d 100644 --- a/test/cets_test_setup.erl +++ b/test/cets_test_setup.erl @@ -6,6 +6,7 @@ -export([ init_cleanup_table/0, + remove_cleanup_table/0, wait_for_cleanup/0 ]). @@ -41,7 +42,7 @@ make_process/0 ]). --import(cets_test_node, [ +-import(cets_test_peer, [ disconnect_node/2, disconnect_node_by_name/2 ]). @@ -117,9 +118,14 @@ schedule_cleanup(Pid) -> init_cleanup_table() -> spawn(fun() -> ets:new(cleanup_table, [named_table, public, bag]), - timer:sleep(infinity) + receive + stop -> ok + end end). +remove_cleanup_table() -> + ets:info(cleanup_table, owner) ! stop. + %% schedule_cleanup is async, so this function is waiting for it to finish wait_for_cleanup() -> [ From 28b67f87751148a997f77344c3a2af28843cde07 Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 14:27:00 +0100 Subject: [PATCH 09/30] Move small disco cases into disco suite --- test/cets_SUITE.erl | 301 ----------------------------------- test/cets_disco_SUITE.erl | 319 ++++++++++++++++++++++++++++++++++++++ test/cets_test_helper.erl | 10 ++ 3 files changed, 329 insertions(+), 301 deletions(-) diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index 86286e8..b1a8092 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -149,21 +149,6 @@ cases() -> test_multinode, test_multinode_remote_insert, node_list_is_correct, - test_multinode_auto_discovery, - test_disco_add_table, - test_disco_delete_table, - test_disco_delete_unknown_table, - test_disco_delete_table_twice, - test_disco_file_appears, - test_disco_handles_bad_node, - cets_discovery_fun_backend_works, - test_disco_add_table_twice, - test_disco_add_two_tables, - disco_retried_if_get_nodes_fail, - disco_uses_regular_retry_interval_in_the_regular_phase, - disco_uses_regular_retry_interval_in_the_regular_phase_after_node_down, - disco_uses_regular_retry_interval_in_the_regular_phase_after_expired_node_down, - 
disco_handles_node_up_and_down, status_available_nodes, status_available_nodes_do_not_contain_nodes_with_stopped_disco, status_unavailable_nodes, @@ -303,9 +288,6 @@ end_per_group(Group, Config) when Group == cets_seq_no_log; Group == cets_no_log end_per_group(_Group, Config) -> Config. -init_per_testcase(test_multinode_auto_discovery = Name, Config) -> - ct:make_priv_dir(), - init_per_testcase_generic(Name, Config); init_per_testcase(Name, Config) -> init_per_testcase_generic(Name, Config). @@ -1501,283 +1483,6 @@ node_list_is_correct(Config) -> [Node1, Node2, Node3] = other_nodes(Node4, Tab), ok. -test_multinode_auto_discovery(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - Tab = make_name(Config), - {ok, _Pid1} = start(Node1, Tab), - {ok, _Pid2} = start(Node2, Tab), - Dir = proplists:get_value(priv_dir, Config), - ct:pal("Dir ~p", [Dir]), - FileName = filename:join(Dir, "disco.txt"), - ok = file:write_file(FileName, io_lib:format("~s~n~s~n", [Node1, Node2])), - {ok, Disco} = cets_discovery:start_link(#{tables => [Tab], disco_file => FileName}), - %% Disco is async, so we have to wait for the final state - ok = wait_for_ready(Disco, 5000), - [Node2] = other_nodes(Node1, Tab), - [#{memory := _, nodes := [Node1, Node2], size := 0, table := Tab}] = - cets_discovery:info(Disco), - #{verify_ready := []} = - cets_discovery:system_info(Disco), - ok. 
- -test_disco_add_table(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - Tab = make_name(Config), - {ok, _Pid1} = start(Node1, Tab), - {ok, _Pid2} = start(Node2, Tab), - Dir = proplists:get_value(priv_dir, Config), - ct:pal("Dir ~p", [Dir]), - FileName = filename:join(Dir, "disco.txt"), - ok = file:write_file(FileName, io_lib:format("~s~n~s~n", [Node1, Node2])), - {ok, Disco} = cets_discovery:start_link(#{tables => [], disco_file => FileName}), - cets_discovery:add_table(Disco, Tab), - %% Disco is async, so we have to wait for the final state - ok = wait_for_ready(Disco, 5000), - [Node2] = other_nodes(Node1, Tab), - [#{memory := _, nodes := [Node1, Node2], size := 0, table := Tab}] = - cets_discovery:info(Disco), - ok. - -test_disco_delete_table(Config) -> - F = fun(State) -> {{ok, []}, State} end, - {ok, Disco} = cets_discovery:start_link(#{ - backend_module => cets_discovery_fun, get_nodes_fn => F - }), - Tab = make_name(Config), - cets_discovery:add_table(Disco, Tab), - #{tables := [Tab]} = cets_discovery:system_info(Disco), - cets_discovery:delete_table(Disco, Tab), - #{tables := []} = cets_discovery:system_info(Disco). - -test_disco_delete_unknown_table(Config) -> - F = fun(State) -> {{ok, []}, State} end, - {ok, Disco} = cets_discovery:start_link(#{ - backend_module => cets_discovery_fun, get_nodes_fn => F - }), - Tab = make_name(Config), - cets_discovery:delete_table(Disco, Tab), - #{tables := []} = cets_discovery:system_info(Disco). - -test_disco_delete_table_twice(Config) -> - F = fun(State) -> {{ok, []}, State} end, - {ok, Disco} = cets_discovery:start_link(#{ - backend_module => cets_discovery_fun, get_nodes_fn => F - }), - Tab = make_name(Config), - cets_discovery:add_table(Disco, Tab), - #{tables := [Tab]} = cets_discovery:system_info(Disco), - cets_discovery:delete_table(Disco, Tab), - cets_discovery:delete_table(Disco, Tab), - #{tables := []} = cets_discovery:system_info(Disco). 
- -test_disco_file_appears(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - Tab = make_name(Config), - {ok, _Pid1} = start(Node1, Tab), - {ok, _Pid2} = start(Node2, Tab), - Dir = proplists:get_value(priv_dir, Config), - ct:pal("Dir ~p", [Dir]), - FileName = filename:join(Dir, "disco3.txt"), - file:delete(FileName), - {ok, Disco} = cets_discovery:start_link(#{tables => [], disco_file => FileName}), - cets_discovery:add_table(Disco, Tab), - cets_test_wait:wait_until( - fun() -> maps:get(last_get_nodes_retry_type, cets_discovery:system_info(Disco)) end, - after_error - ), - ok = file:write_file(FileName, io_lib:format("~s~n~s~n", [Node1, Node2])), - %% Disco is async, so we have to wait for the final state - ok = wait_for_ready(Disco, 5000), - [Node2] = other_nodes(Node1, Tab), - [#{memory := _, nodes := [Node1, Node2], size := 0, table := Tab}] = - cets_discovery:info(Disco), - ok. - -test_disco_handles_bad_node(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - Tab = make_name(Config), - {ok, _Pid1} = start(Node1, Tab), - {ok, _Pid2} = start(Node2, Tab), - Dir = proplists:get_value(priv_dir, Config), - ct:pal("Dir ~p", [Dir]), - FileName = filename:join(Dir, "disco_badnode.txt"), - ok = file:write_file(FileName, io_lib:format("badnode@localhost~n~s~n~s~n", [Node1, Node2])), - {ok, Disco} = cets_discovery:start_link(#{tables => [], disco_file => FileName}), - cets_discovery:add_table(Disco, Tab), - %% Check that wait_for_ready would not block forever: - ok = wait_for_ready(Disco, 5000), - %% Check if the node sent pang: - #{unavailable_nodes := ['badnode@localhost']} = cets_discovery:system_info(Disco), - %% Check that other nodes are discovered fine - [#{memory := _, nodes := [Node1, Node2], size := 0, table := Tab}] = - cets_discovery:info(Disco). 
- -cets_discovery_fun_backend_works(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - Tab = make_name(Config), - {ok, _Pid1} = start(Node1, Tab), - {ok, _Pid2} = start(Node2, Tab), - F = fun(State) -> {{ok, [Node1, Node2]}, State} end, - {ok, Disco} = cets_discovery:start_link(#{ - backend_module => cets_discovery_fun, get_nodes_fn => F - }), - cets_discovery:add_table(Disco, Tab), - ok = wait_for_ready(Disco, 5000), - [#{memory := _, nodes := [Node1, Node2], size := 0, table := Tab}] = - cets_discovery:info(Disco). - -test_disco_add_table_twice(Config) -> - Dir = proplists:get_value(priv_dir, Config), - FileName = filename:join(Dir, "disco.txt"), - {ok, Disco} = cets_discovery:start_link(#{tables => [], disco_file => FileName}), - Tab = make_name(Config), - {ok, _Pid} = start_local(Tab), - cets_discovery:add_table(Disco, Tab), - cets_discovery:add_table(Disco, Tab), - %% Check that everything is fine - #{tables := [Tab]} = cets_discovery:system_info(Disco). - -test_disco_add_two_tables(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - Tab1 = make_name(Config, 1), - Tab2 = make_name(Config, 2), - {ok, _} = start(Node1, Tab1), - {ok, _} = start(Node2, Tab1), - {ok, _} = start(Node1, Tab2), - {ok, _} = start(Node2, Tab2), - Me = self(), - F = fun - (State = #{waited := true}) -> - Me ! called_after_waited, - {{ok, [Node1, Node2]}, State}; - (State) -> - wait_till_test_stage(Me, sent_both), - Me ! 
waited_for_sent_both, - {{ok, [Node1, Node2]}, State#{waited => true}} - end, - {ok, Disco} = cets_discovery:start_link(#{ - backend_module => cets_discovery_fun, get_nodes_fn => F - }), - %% Add two tables async - cets_discovery:add_table(Disco, Tab1), - %% After the first table, Disco would get blocked in get_nodes function (see wait_till_test_stage in F above) - cets_discovery:add_table(Disco, Tab2), - put(test_stage, sent_both), - %% Just ensure wait_till_test_stage function works: - wait_till_test_stage(Me, sent_both), - %% First check is done, the second check should be triggered asap - %% (i.e. because of should_retry_get_nodes=true set in state) - receive_message(waited_for_sent_both), - %% try_joining would be called after set_nodes, - %% but it is async, so wait until it is done: - cets_test_wait:wait_until( - fun() -> - maps:with( - [get_nodes_status, should_retry_get_nodes, join_status, should_retry_join], - cets_discovery:system_info(Disco) - ) - end, - #{ - get_nodes_status => not_running, - should_retry_get_nodes => false, - join_status => not_running, - should_retry_join => false - } - ), - [ - #{memory := _, nodes := [Node1, Node2], size := 0, table := Tab1}, - #{memory := _, nodes := [Node1, Node2], size := 0, table := Tab2} - ] = - cets_discovery:info(Disco), - ok. - -disco_retried_if_get_nodes_fail(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - Tab = make_name(Config), - {ok, _} = start(Node1, Tab), - {ok, _} = start(Node2, Tab), - F = fun(State) -> - {{error, simulate_error}, State} - end, - {ok, Disco} = cets_discovery:start_link(#{ - backend_module => cets_discovery_fun, get_nodes_fn => F - }), - cets_discovery:add_table(Disco, Tab), - cets_test_wait:wait_until( - fun() -> maps:get(last_get_nodes_retry_type, cets_discovery:system_info(Disco)) end, - after_error - ), - ok. 
- -disco_uses_regular_retry_interval_in_the_regular_phase(Config) -> - #{disco := Disco} = generic_disco_uses_regular_retry_interval_in_the_regular_phase(Config), - #{phase := regular, retry_type := regular} = cets_discovery:system_info(Disco). - -%% Similar to disco_uses_regular_retry_interval_in_the_regular_phase, but has nodedown -disco_uses_regular_retry_interval_in_the_regular_phase_after_node_down(Config) -> - SysInfo = generic_disco_uses_regular_retry_interval_in_the_regular_phase(Config), - #{disco := Disco, node2 := Node2} = SysInfo, - Disco ! {nodedown, Node2}, - #{phase := regular, retry_type := after_nodedown} = cets_discovery:system_info(Disco). - -%% Similar to disco_uses_regular_retry_interval_in_the_regular_phase_after_node_down, but we simulate long downtime -disco_uses_regular_retry_interval_in_the_regular_phase_after_expired_node_down(Config) -> - #{disco := Disco, node2 := Node2} = generic_disco_uses_regular_retry_interval_in_the_regular_phase( - Config - ), - Disco ! {nodedown, Node2}, - TestTimestamp = erlang:system_time(millisecond) - timer:seconds(1000), - set_nodedown_timestamp(Disco, Node2, TestTimestamp), - #{phase := regular, retry_type := regular} = cets_discovery:system_info(Disco). - -generic_disco_uses_regular_retry_interval_in_the_regular_phase(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - Tab = make_name(Config), - {ok, _} = start(Node1, Tab), - {ok, _} = start(Node2, Tab), - F = fun(State) -> {{ok, [Node1, Node2]}, State} end, - {ok, Disco} = cets_discovery:start_link(#{ - backend_module => cets_discovery_fun, get_nodes_fn => F - }), - Disco ! enter_regular_phase, - cets_discovery:add_table(Disco, Tab), - cets_test_wait:wait_until( - fun() -> maps:get(last_get_nodes_retry_type, cets_discovery:system_info(Disco)) end, regular - ), - #{disco => Disco, node2 => Node2}. 
- -disco_handles_node_up_and_down(Config) -> - BadNode = 'badnode@localhost', - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - Tab = make_name(Config), - {ok, _} = start(Node1, Tab), - {ok, _} = start(Node2, Tab), - F = fun(State) -> - {{ok, [Node1, Node2, BadNode]}, State} - end, - {ok, Disco} = cets_discovery:start_link(#{ - backend_module => cets_discovery_fun, get_nodes_fn => F - }), - cets_discovery:add_table(Disco, Tab), - %% get_nodes call is async, so wait for it - cets_test_wait:wait_until( - fun() -> length(maps:get(nodes, cets_discovery:system_info(Disco))) end, - 3 - ), - Disco ! {nodeup, BadNode}, - Disco ! {nodedown, BadNode}, - %% Check that wait_for_ready still works - ok = wait_for_ready(Disco, 5000). - status_available_nodes(Config) -> Node1 = node(), #{ct2 := Node2} = proplists:get_value(nodes, Config), @@ -2831,12 +2536,6 @@ set_other_servers(Pid, Servers) -> State#{other_servers := Servers} end). -%% Overwrites nodedown timestamp for the Node in the discovery server state -set_nodedown_timestamp(Disco, Node, NewTimestamp) -> - sys:replace_state(Disco, fun(#{nodedown_timestamps := Map} = State) -> - State#{nodedown_timestamps := maps:put(Node, NewTimestamp, Map)} - end). - stopped_pid() -> %% Get a pid for a stopped process {Pid, Mon} = spawn_monitor(fun() -> ok end), diff --git a/test/cets_disco_SUITE.erl b/test/cets_disco_SUITE.erl index 82023f3..583efee 100644 --- a/test/cets_disco_SUITE.erl +++ b/test/cets_disco_SUITE.erl @@ -5,6 +5,20 @@ -compile([export_all, nowarn_export_all]). +-import(cets_test_setup, [ + start/2, + start_local/1, + start_local/2, + make_name/1, + make_name/2, + disco_name/1 +]). + +-import(cets_test_wait, [ + wait_for_ready/2, + wait_till_test_stage/2 +]). + -import(cets_test_setup, [ setup_two_nodes_and_discovery/1, setup_two_nodes_and_discovery/2, @@ -28,8 +42,13 @@ -import(cets_test_helper, [assert_unique/1]). +-import(cets_test_rpc, [ + other_nodes/2 +]). 
+ all() -> [ + {group, cets}, {group, cets_seq}, {group, cets_seq_no_log} ]. @@ -37,6 +56,7 @@ all() -> groups() -> %% Cases should have unique names, because we name CETS servers based on case names [ + {cets, [parallel, {repeat_until_any_fail, 3}], assert_unique(cases())}, %% These tests actually simulate a netsplit on the distribution level. %% Though, global's prevent_overlapping_partitions option starts kicking %% all nodes from the cluster, so we have to be careful not to break other cases. @@ -46,6 +66,25 @@ groups() -> assert_unique(cets_seq_no_log_cases())} ]. +cases() -> + [ + test_multinode_auto_discovery, + test_disco_add_table, + test_disco_delete_table, + test_disco_delete_unknown_table, + test_disco_delete_table_twice, + test_disco_file_appears, + test_disco_handles_bad_node, + cets_discovery_fun_backend_works, + test_disco_add_table_twice, + test_disco_add_two_tables, + disco_retried_if_get_nodes_fail, + disco_uses_regular_retry_interval_in_the_regular_phase, + disco_uses_regular_retry_interval_in_the_regular_phase_after_node_down, + disco_uses_regular_retry_interval_in_the_regular_phase_after_expired_node_down, + disco_handles_node_up_and_down + ]. + seq_cases() -> [ disco_logs_nodeup, @@ -90,6 +129,9 @@ end_per_group(Group, Config) when Group == cets_seq_no_log; Group == cets_no_log end_per_group(_Group, Config) -> Config. +init_per_testcase(test_multinode_auto_discovery = Name, Config) -> + ct:make_priv_dir(), + init_per_testcase_generic(Name, Config); init_per_testcase(Name, Config) -> init_per_testcase_generic(Name, Config). @@ -104,6 +146,283 @@ end_per_testcase(_, _Config) -> log_modules() -> [cets, cets_call, cets_long, cets_join, cets_discovery]. 
+test_multinode_auto_discovery(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + Tab = make_name(Config), + {ok, _Pid1} = start(Node1, Tab), + {ok, _Pid2} = start(Node2, Tab), + Dir = proplists:get_value(priv_dir, Config), + ct:pal("Dir ~p", [Dir]), + FileName = filename:join(Dir, "disco.txt"), + ok = file:write_file(FileName, io_lib:format("~s~n~s~n", [Node1, Node2])), + {ok, Disco} = cets_discovery:start_link(#{tables => [Tab], disco_file => FileName}), + %% Disco is async, so we have to wait for the final state + ok = wait_for_ready(Disco, 5000), + [Node2] = other_nodes(Node1, Tab), + [#{memory := _, nodes := [Node1, Node2], size := 0, table := Tab}] = + cets_discovery:info(Disco), + #{verify_ready := []} = + cets_discovery:system_info(Disco), + ok. + +test_disco_add_table(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + Tab = make_name(Config), + {ok, _Pid1} = start(Node1, Tab), + {ok, _Pid2} = start(Node2, Tab), + Dir = proplists:get_value(priv_dir, Config), + ct:pal("Dir ~p", [Dir]), + FileName = filename:join(Dir, "disco.txt"), + ok = file:write_file(FileName, io_lib:format("~s~n~s~n", [Node1, Node2])), + {ok, Disco} = cets_discovery:start_link(#{tables => [], disco_file => FileName}), + cets_discovery:add_table(Disco, Tab), + %% Disco is async, so we have to wait for the final state + ok = wait_for_ready(Disco, 5000), + [Node2] = other_nodes(Node1, Tab), + [#{memory := _, nodes := [Node1, Node2], size := 0, table := Tab}] = + cets_discovery:info(Disco), + ok. + +test_disco_delete_table(Config) -> + F = fun(State) -> {{ok, []}, State} end, + {ok, Disco} = cets_discovery:start_link(#{ + backend_module => cets_discovery_fun, get_nodes_fn => F + }), + Tab = make_name(Config), + cets_discovery:add_table(Disco, Tab), + #{tables := [Tab]} = cets_discovery:system_info(Disco), + cets_discovery:delete_table(Disco, Tab), + #{tables := []} = cets_discovery:system_info(Disco). 
+ +test_disco_delete_unknown_table(Config) -> + F = fun(State) -> {{ok, []}, State} end, + {ok, Disco} = cets_discovery:start_link(#{ + backend_module => cets_discovery_fun, get_nodes_fn => F + }), + Tab = make_name(Config), + cets_discovery:delete_table(Disco, Tab), + #{tables := []} = cets_discovery:system_info(Disco). + +test_disco_delete_table_twice(Config) -> + F = fun(State) -> {{ok, []}, State} end, + {ok, Disco} = cets_discovery:start_link(#{ + backend_module => cets_discovery_fun, get_nodes_fn => F + }), + Tab = make_name(Config), + cets_discovery:add_table(Disco, Tab), + #{tables := [Tab]} = cets_discovery:system_info(Disco), + cets_discovery:delete_table(Disco, Tab), + cets_discovery:delete_table(Disco, Tab), + #{tables := []} = cets_discovery:system_info(Disco). + +test_disco_file_appears(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + Tab = make_name(Config), + {ok, _Pid1} = start(Node1, Tab), + {ok, _Pid2} = start(Node2, Tab), + Dir = proplists:get_value(priv_dir, Config), + ct:pal("Dir ~p", [Dir]), + FileName = filename:join(Dir, "disco3.txt"), + file:delete(FileName), + {ok, Disco} = cets_discovery:start_link(#{tables => [], disco_file => FileName}), + cets_discovery:add_table(Disco, Tab), + cets_test_wait:wait_until( + fun() -> maps:get(last_get_nodes_retry_type, cets_discovery:system_info(Disco)) end, + after_error + ), + ok = file:write_file(FileName, io_lib:format("~s~n~s~n", [Node1, Node2])), + %% Disco is async, so we have to wait for the final state + ok = wait_for_ready(Disco, 5000), + [Node2] = other_nodes(Node1, Tab), + [#{memory := _, nodes := [Node1, Node2], size := 0, table := Tab}] = + cets_discovery:info(Disco), + ok. 
+ +test_disco_handles_bad_node(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + Tab = make_name(Config), + {ok, _Pid1} = start(Node1, Tab), + {ok, _Pid2} = start(Node2, Tab), + Dir = proplists:get_value(priv_dir, Config), + ct:pal("Dir ~p", [Dir]), + FileName = filename:join(Dir, "disco_badnode.txt"), + ok = file:write_file(FileName, io_lib:format("badnode@localhost~n~s~n~s~n", [Node1, Node2])), + {ok, Disco} = cets_discovery:start_link(#{tables => [], disco_file => FileName}), + cets_discovery:add_table(Disco, Tab), + %% Check that wait_for_ready would not block forever: + ok = wait_for_ready(Disco, 5000), + %% Check if the node sent pang: + #{unavailable_nodes := ['badnode@localhost']} = cets_discovery:system_info(Disco), + %% Check that other nodes are discovered fine + [#{memory := _, nodes := [Node1, Node2], size := 0, table := Tab}] = + cets_discovery:info(Disco). + +cets_discovery_fun_backend_works(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + Tab = make_name(Config), + {ok, _Pid1} = start(Node1, Tab), + {ok, _Pid2} = start(Node2, Tab), + F = fun(State) -> {{ok, [Node1, Node2]}, State} end, + {ok, Disco} = cets_discovery:start_link(#{ + backend_module => cets_discovery_fun, get_nodes_fn => F + }), + cets_discovery:add_table(Disco, Tab), + ok = wait_for_ready(Disco, 5000), + [#{memory := _, nodes := [Node1, Node2], size := 0, table := Tab}] = + cets_discovery:info(Disco). + +test_disco_add_table_twice(Config) -> + Dir = proplists:get_value(priv_dir, Config), + FileName = filename:join(Dir, "disco.txt"), + {ok, Disco} = cets_discovery:start_link(#{tables => [], disco_file => FileName}), + Tab = make_name(Config), + {ok, _Pid} = start_local(Tab), + cets_discovery:add_table(Disco, Tab), + cets_discovery:add_table(Disco, Tab), + %% Check that everything is fine + #{tables := [Tab]} = cets_discovery:system_info(Disco). 
+ +test_disco_add_two_tables(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + Tab1 = make_name(Config, 1), + Tab2 = make_name(Config, 2), + {ok, _} = start(Node1, Tab1), + {ok, _} = start(Node2, Tab1), + {ok, _} = start(Node1, Tab2), + {ok, _} = start(Node2, Tab2), + Me = self(), + F = fun + (State = #{waited := true}) -> + Me ! called_after_waited, + {{ok, [Node1, Node2]}, State}; + (State) -> + wait_till_test_stage(Me, sent_both), + Me ! waited_for_sent_both, + {{ok, [Node1, Node2]}, State#{waited => true}} + end, + {ok, Disco} = cets_discovery:start_link(#{ + backend_module => cets_discovery_fun, get_nodes_fn => F + }), + %% Add two tables async + cets_discovery:add_table(Disco, Tab1), + %% After the first table, Disco would get blocked in get_nodes function (see wait_till_test_stage in F above) + cets_discovery:add_table(Disco, Tab2), + put(test_stage, sent_both), + %% Just ensure wait_till_test_stage function works: + wait_till_test_stage(Me, sent_both), + %% First check is done, the second check should be triggered asap + %% (i.e. because of should_retry_get_nodes=true set in state) + receive_message(waited_for_sent_both), + %% try_joining would be called after set_nodes, + %% but it is async, so wait until it is done: + cets_test_wait:wait_until( + fun() -> + maps:with( + [get_nodes_status, should_retry_get_nodes, join_status, should_retry_join], + cets_discovery:system_info(Disco) + ) + end, + #{ + get_nodes_status => not_running, + should_retry_get_nodes => false, + join_status => not_running, + should_retry_join => false + } + ), + [ + #{memory := _, nodes := [Node1, Node2], size := 0, table := Tab1}, + #{memory := _, nodes := [Node1, Node2], size := 0, table := Tab2} + ] = + cets_discovery:info(Disco), + ok. 
+ +disco_retried_if_get_nodes_fail(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + Tab = make_name(Config), + {ok, _} = start(Node1, Tab), + {ok, _} = start(Node2, Tab), + F = fun(State) -> + {{error, simulate_error}, State} + end, + {ok, Disco} = cets_discovery:start_link(#{ + backend_module => cets_discovery_fun, get_nodes_fn => F + }), + cets_discovery:add_table(Disco, Tab), + cets_test_wait:wait_until( + fun() -> maps:get(last_get_nodes_retry_type, cets_discovery:system_info(Disco)) end, + after_error + ), + ok. + +disco_uses_regular_retry_interval_in_the_regular_phase(Config) -> + #{disco := Disco} = generic_disco_uses_regular_retry_interval_in_the_regular_phase(Config), + #{phase := regular, retry_type := regular} = cets_discovery:system_info(Disco). + +%% Similar to disco_uses_regular_retry_interval_in_the_regular_phase, but has nodedown +disco_uses_regular_retry_interval_in_the_regular_phase_after_node_down(Config) -> + SysInfo = generic_disco_uses_regular_retry_interval_in_the_regular_phase(Config), + #{disco := Disco, node2 := Node2} = SysInfo, + Disco ! {nodedown, Node2}, + #{phase := regular, retry_type := after_nodedown} = cets_discovery:system_info(Disco). + +%% Similar to disco_uses_regular_retry_interval_in_the_regular_phase_after_node_down, but we simulate long downtime +disco_uses_regular_retry_interval_in_the_regular_phase_after_expired_node_down(Config) -> + #{disco := Disco, node2 := Node2} = generic_disco_uses_regular_retry_interval_in_the_regular_phase( + Config + ), + Disco ! {nodedown, Node2}, + TestTimestamp = erlang:system_time(millisecond) - timer:seconds(1000), + cets_test_helper:set_nodedown_timestamp(Disco, Node2, TestTimestamp), + #{phase := regular, retry_type := regular} = cets_discovery:system_info(Disco). 
+ +generic_disco_uses_regular_retry_interval_in_the_regular_phase(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + Tab = make_name(Config), + {ok, _} = start(Node1, Tab), + {ok, _} = start(Node2, Tab), + F = fun(State) -> {{ok, [Node1, Node2]}, State} end, + {ok, Disco} = cets_discovery:start_link(#{ + backend_module => cets_discovery_fun, get_nodes_fn => F + }), + Disco ! enter_regular_phase, + cets_discovery:add_table(Disco, Tab), + cets_test_wait:wait_until( + fun() -> maps:get(last_get_nodes_retry_type, cets_discovery:system_info(Disco)) end, regular + ), + #{disco => Disco, node2 => Node2}. + +disco_handles_node_up_and_down(Config) -> + BadNode = 'badnode@localhost', + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + Tab = make_name(Config), + {ok, _} = start(Node1, Tab), + {ok, _} = start(Node2, Tab), + F = fun(State) -> + {{ok, [Node1, Node2, BadNode]}, State} + end, + {ok, Disco} = cets_discovery:start_link(#{ + backend_module => cets_discovery_fun, get_nodes_fn => F + }), + cets_discovery:add_table(Disco, Tab), + %% get_nodes call is async, so wait for it + cets_test_wait:wait_until( + fun() -> length(maps:get(nodes, cets_discovery:system_info(Disco))) end, + 3 + ), + Disco ! {nodeup, BadNode}, + Disco ! {nodedown, BadNode}, + %% Check that wait_for_ready still works + ok = wait_for_ready(Disco, 5000). + disco_logs_nodeup(Config) -> logger_debug_h:start(#{id => ?FUNCTION_NAME}), #{disco := Disco, node2 := Node2} = setup_two_nodes_and_discovery(Config), diff --git a/test/cets_test_helper.erl b/test/cets_test_helper.erl index 144bf54..34137b8 100644 --- a/test/cets_test_helper.erl +++ b/test/cets_test_helper.erl @@ -6,6 +6,10 @@ assert_unique/1 ]). +-export([ + set_nodedown_timestamp/3 +]). 
+ get_disco_timestamp(Disco, MapName, NodeKey) -> Info = cets_discovery:system_info(Disco), #{MapName := #{NodeKey := Timestamp}} = Info, @@ -15,3 +19,9 @@ get_disco_timestamp(Disco, MapName, NodeKey) -> assert_unique(List) -> ?assertEqual([], List -- lists:usort(List)), List. + +%% Overwrites nodedown timestamp for the Node in the discovery server state +set_nodedown_timestamp(Disco, Node, NewTimestamp) -> + sys:replace_state(Disco, fun(#{nodedown_timestamps := Map} = State) -> + State#{nodedown_timestamps := maps:put(Node, NewTimestamp, Map)} + end). From 3baa846fba7424563cd55cd0bb7089b916dcfba5 Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 14:55:14 +0100 Subject: [PATCH 10/30] Move cases into cets_status_SUITE --- test/cets_SUITE.erl | 289 +------------ test/cets_status_SUITE.erl | 400 ++++++++++++++++++ .../status_data.txt | 0 test/cets_test_helper.erl | 8 +- 4 files changed, 411 insertions(+), 286 deletions(-) create mode 100644 test/cets_status_SUITE.erl rename test/{cets_SUITE_data => cets_status_SUITE_data}/status_data.txt (100%) diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index b1a8092..33961d0 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -68,7 +68,10 @@ assert_nothing_is_logged/2 ]). --import(cets_test_helper, [assert_unique/1]). +-import(cets_test_helper, [ + assert_unique/1, + set_other_servers/2 +]). 
all() -> [ @@ -149,17 +152,6 @@ cases() -> test_multinode, test_multinode_remote_insert, node_list_is_correct, - status_available_nodes, - status_available_nodes_do_not_contain_nodes_with_stopped_disco, - status_unavailable_nodes, - status_unavailable_nodes_is_subset_of_discovery_nodes, - status_joined_nodes, - status_discovery_works, - status_discovered_nodes, - status_remote_nodes_without_disco, - status_remote_nodes_with_unknown_tables, - status_remote_nodes_with_missing_nodes, - status_conflict_nodes, disco_wait_for_get_nodes_works, disco_wait_for_get_nodes_blocks_and_returns, disco_wait_for_get_nodes_when_get_nodes_needs_to_be_retried, @@ -209,7 +201,6 @@ cases() -> send_leader_op_throws_noproc, pinfo_returns_value, pinfo_returns_undefined, - format_data_does_not_return_table_duplicates, cets_ping_non_existing_node, cets_ping_net_family, unexpected_nodedown_is_ignored_by_disco, @@ -1483,253 +1474,6 @@ node_list_is_correct(Config) -> [Node1, Node2, Node3] = other_nodes(Node4, Tab), ok. -status_available_nodes(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - F = fun(State) -> - {{ok, []}, State} - end, - DiscoName = disco_name(Config), - start_disco(Node1, #{name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F}), - start_disco(Node2, #{name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F}), - ?assertMatch(#{available_nodes := [Node1, Node2]}, cets_status:status(DiscoName)). - -status_available_nodes_do_not_contain_nodes_with_stopped_disco(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - F = fun(State) -> - {{ok, [Node1, Node2]}, State} - end, - DiscoName = disco_name(Config), - start_disco(Node1, #{name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F}), - %% Disco not running - ?assertMatch(#{available_nodes := [Node1]}, cets_status:status(DiscoName)). 
- -status_unavailable_nodes(Config) -> - Node1 = node(), - F = fun(State) -> - {{ok, [Node1, 'badnode@localhost']}, State} - end, - DiscoName = disco_name(Config), - Disco = start_disco(Node1, #{ - name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F - }), - %% Disco needs at least one table to start calling get_nodes function - Tab = make_name(Config), - {ok, _} = start(Node1, Tab), - cets_discovery:add_table(Disco, Tab), - ok = wait_for_ready(DiscoName, 5000), - ?assertMatch(#{unavailable_nodes := ['badnode@localhost']}, cets_status:status(DiscoName)). - -status_unavailable_nodes_is_subset_of_discovery_nodes(Config) -> - Node1 = node(), - Self = self(), - GetFn1 = fun(State) -> {{ok, [Node1, 'badnode@localhost']}, State} end, - GetFn2 = fun(State) -> - Self ! get_fn2_called, - {{ok, [Node1]}, State} - end, - %% Setup meck - BackendModule = make_name(Config, disco_backend), - meck:new(BackendModule, [non_strict]), - meck:expect(BackendModule, init, fun(_Opts) -> undefined end), - meck:expect(BackendModule, get_nodes, GetFn1), - DiscoName = disco_name(Config), - Disco = start_disco(Node1, #{ - name => DiscoName, backend_module => BackendModule - }), - %% Disco needs at least one table to start calling get_nodes function - Tab = make_name(Config), - {ok, _} = start(Node1, Tab), - cets_discovery:add_table(Disco, Tab), - ok = wait_for_ready(DiscoName, 5000), - ?assertMatch(#{unavailable_nodes := ['badnode@localhost']}, cets_status:status(DiscoName)), - %% Remove badnode from disco - meck:expect(BackendModule, get_nodes, GetFn2), - %% Force check. - Disco ! check, - receive_message(get_fn2_called), - %% The unavailable_nodes list is updated - CondF = fun() -> maps:get(unavailable_nodes, cets_status:status(DiscoName)) end, - cets_test_wait:wait_until(CondF, []). 
- -status_joined_nodes(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - F = fun(State) -> - {{ok, [Node1, Node2]}, State} - end, - DiscoName = disco_name(Config), - DiscoOpts = #{ - name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F - }, - Disco1 = start_disco(Node1, DiscoOpts), - Disco2 = start_disco(Node2, DiscoOpts), - Tab = make_name(Config), - {ok, _} = start(Node1, Tab), - {ok, _} = start(Node2, Tab), - %% Add table using pids (i.e. no need to do RPCs here) - cets_discovery:add_table(Disco1, Tab), - cets_discovery:add_table(Disco2, Tab), - ok = wait_for_ready(DiscoName, 5000), - cets_test_wait:wait_until(fun() -> maps:get(joined_nodes, cets_status:status(DiscoName)) end, [ - Node1, Node2 - ]). - -status_discovery_works(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - F = fun(State) -> - {{ok, [Node1, Node2]}, State} - end, - DiscoName = disco_name(Config), - DiscoOpts = #{ - name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F - }, - Disco1 = start_disco(Node1, DiscoOpts), - Disco2 = start_disco(Node2, DiscoOpts), - Tab = make_name(Config), - {ok, _} = start(Node1, Tab), - {ok, _} = start(Node2, Tab), - %% Add table using pids (i.e. no need to do RPCs here) - cets_discovery:add_table(Disco1, Tab), - cets_discovery:add_table(Disco2, Tab), - ok = wait_for_ready(DiscoName, 5000), - ?assertMatch(#{discovery_works := true}, cets_status:status(DiscoName)). - -status_discovered_nodes(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - F = fun(State) -> - {{ok, [Node1, Node2]}, State} - end, - DiscoName = disco_name(Config), - Disco = start_disco(Node1, #{ - name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F - }), - Tab = make_name(Config), - {ok, _} = start(Node1, Tab), - {ok, _} = start(Node2, Tab), - %% Add table using pids (i.e. 
no need to do RPCs here) - cets_discovery:add_table(Disco, Tab), - ok = wait_for_ready(DiscoName, 5000), - ?assertMatch(#{discovered_nodes := [Node1, Node2]}, cets_status:status(DiscoName)). - -status_remote_nodes_without_disco(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - F = fun(State) -> - {{ok, [Node1, Node2]}, State} - end, - DiscoName = disco_name(Config), - Disco = start_disco(Node1, #{ - name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F - }), - Tab = make_name(Config), - {ok, _} = start(Node1, Tab), - cets_discovery:add_table(Disco, Tab), - ok = wait_for_ready(DiscoName, 5000), - ?assertMatch(#{remote_nodes_without_disco := [Node2]}, cets_status:status(DiscoName)). - -status_remote_nodes_with_unknown_tables(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - F = fun(State) -> - {{ok, [Node1, Node2]}, State} - end, - DiscoName = disco_name(Config), - DiscoOpts = #{ - name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F - }, - Disco1 = start_disco(Node1, DiscoOpts), - Disco2 = start_disco(Node2, DiscoOpts), - Tab1 = make_name(Config, 1), - Tab2 = make_name(Config, 2), - %% Node1 does not have Tab2 - {ok, _} = start(Node1, Tab2), - {ok, _} = start(Node2, Tab1), - {ok, _} = start(Node2, Tab2), - %% Add table using pids (i.e. no need to do RPCs here) - cets_discovery:add_table(Disco1, Tab1), - cets_discovery:add_table(Disco2, Tab1), - cets_discovery:add_table(Disco2, Tab2), - ok = wait_for_ready(DiscoName, 5000), - cets_test_wait:wait_until( - fun() -> maps:get(remote_nodes_with_unknown_tables, cets_status:status(DiscoName)) end, [ - Node2 - ] - ), - cets_test_wait:wait_until( - fun() -> maps:get(remote_unknown_tables, cets_status:status(DiscoName)) end, [ - Tab2 - ] - ). 
- -status_remote_nodes_with_missing_nodes(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - F = fun(State) -> - {{ok, [Node1, Node2]}, State} - end, - DiscoName = disco_name(Config), - DiscoOpts = #{ - name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F - }, - Disco1 = start_disco(Node1, DiscoOpts), - Disco2 = start_disco(Node2, DiscoOpts), - Tab1 = make_name(Config, 1), - Tab2 = make_name(Config, 2), - %% Node2 does not have Tab2 - {ok, _} = start(Node1, Tab1), - {ok, _} = start(Node1, Tab2), - {ok, _} = start(Node2, Tab1), - cets_discovery:add_table(Disco1, Tab1), - cets_discovery:add_table(Disco1, Tab2), - cets_discovery:add_table(Disco2, Tab1), - ok = wait_for_ready(DiscoName, 5000), - cets_test_wait:wait_until( - fun() -> maps:get(remote_nodes_with_missing_tables, cets_status:status(DiscoName)) end, [ - Node2 - ] - ), - cets_test_wait:wait_until( - fun() -> maps:get(remote_missing_tables, cets_status:status(DiscoName)) end, [ - Tab2 - ] - ). 
- -status_conflict_nodes(Config) -> - Node1 = node(), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - F = fun(State) -> - {{ok, [Node1, Node2]}, State} - end, - DiscoName = disco_name(Config), - DiscoOpts = #{ - name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F - }, - Disco1 = start_disco(Node1, DiscoOpts), - Disco2 = start_disco(Node2, DiscoOpts), - Tab1 = make_name(Config, 1), - Tab2 = make_name(Config, 2), - {ok, _} = start(Node1, Tab1), - {ok, _} = start(Node1, Tab2), - {ok, _} = start(Node2, Tab1), - {ok, Pid22} = start(Node2, Tab2), - cets_discovery:add_table(Disco1, Tab1), - cets_discovery:add_table(Disco1, Tab2), - cets_discovery:add_table(Disco2, Tab1), - cets_discovery:add_table(Disco2, Tab2), - - ok = wait_for_ready(DiscoName, 5000), - set_other_servers(Pid22, []), - cets_test_wait:wait_until( - fun() -> maps:get(conflict_nodes, cets_status:status(DiscoName)) end, [Node2] - ), - cets_test_wait:wait_until( - fun() -> maps:get(conflict_tables, cets_status:status(DiscoName)) end, [Tab2] - ). - disco_wait_for_get_nodes_works(_Config) -> F = fun(State) -> {{ok, []}, State} end, {ok, Disco} = cets_discovery:start_link(#{ @@ -2446,10 +2190,6 @@ node_down_history_is_updated_when_netsplit_happens(Config) -> cets:stop(Pid5) end. -format_data_does_not_return_table_duplicates(Config) -> - Res = cets_status:format_data(test_data_for_duplicate_missing_table_in_status(Config)), - ?assertMatch(#{remote_unknown_tables := [], remote_nodes_with_missing_tables := []}, Res). - cets_ping_non_existing_node(_Config) -> pang = cets_ping:ping('mongooseim@non_existing_host'). @@ -2531,11 +2271,6 @@ start_link_local(Name, Opts) -> set_join_ref(Pid, JoinRef) -> sys:replace_state(Pid, fun(#{join_ref := _} = State) -> State#{join_ref := JoinRef} end). -set_other_servers(Pid, Servers) -> - sys:replace_state(Pid, fun(#{other_servers := _} = State) -> - State#{other_servers := Servers} - end). 
- stopped_pid() -> %% Get a pid for a stopped process {Pid, Mon} = spawn_monitor(fun() -> ok end), @@ -2565,22 +2300,6 @@ send_join_start_back_and_wait_for_continue_joining() -> ok end. -%% Gathered after Helm update -%% with cets_status:gather_data(mongoose_cets_discovery). -test_data_for_duplicate_missing_table_in_status(Config) -> - %% Create atoms in non sorted order - %% maps:keys returns keys in the atom-creation order (and not sorted). - %% Also, compiler is smart and would optimize list_to_atom("literal_string"), - %% so we do a module call to disable this optimization. - _ = list_to_atom(?MODULE:return_same("cets_external_component")), - _ = list_to_atom(?MODULE:return_same("cets_bosh")), - Name = filename:join(proplists:get_value(data_dir, Config), "status_data.txt"), - {ok, [Term]} = file:consult(Name), - Term. - -return_same(X) -> - X. - not_leader(Leader, Other, Leader) -> Other; not_leader(Other, Leader, Leader) -> diff --git a/test/cets_status_SUITE.erl b/test/cets_status_SUITE.erl new file mode 100644 index 0000000..fd6ccd5 --- /dev/null +++ b/test/cets_status_SUITE.erl @@ -0,0 +1,400 @@ +-module(cets_status_SUITE). +-include_lib("common_test/include/ct.hrl"). +-include_lib("eunit/include/eunit.hrl"). +-include_lib("kernel/include/logger.hrl"). + +-compile([export_all, nowarn_export_all]). + +-import(cets_test_setup, [ + start/2, + start_local/1, + start_local/2, + start_disco/2, + make_name/1, + make_name/2, + disco_name/1 +]). + +-import(cets_test_wait, [ + wait_for_ready/2, + wait_till_test_stage/2 +]). + +-import(cets_test_setup, [ + setup_two_nodes_and_discovery/1, + setup_two_nodes_and_discovery/2, + simulate_disco_restart/1 +]). + +-import(cets_test_wait, [ + wait_for_disco_timestamp_to_appear/3, + wait_for_disco_timestamp_to_be_updated/4 +]). + +-import(cets_test_receive, [ + receive_message/1, + flush_message/1 +]). + +-import(cets_test_peer, [ + disconnect_node/2, + disconnect_node_by_name/2 +]). 
+ +-import(cets_test_helper, [ + assert_unique/1, + set_other_servers/2 +]). + +-import(cets_test_rpc, [ + other_nodes/2 +]). + +all() -> + [ + {group, cets} + % {group, cets_seq}, + % {group, cets_seq_no_log} + ]. + +groups() -> + %% Cases should have unique names, because we name CETS servers based on case names + [ + {cets, [parallel, {repeat_until_any_fail, 3}], assert_unique(cases())}, + %% These tests actually simulate a netsplit on the distribution level. + %% Though, global's prevent_overlapping_partitions option starts kicking + %% all nodes from the cluster, so we have to be careful not to break other cases. + %% Setting prevent_overlapping_partitions=false on ct5 helps. + {cets_seq, [sequence, {repeat_until_any_fail, 2}], assert_unique(seq_cases())}, + {cets_seq_no_log, [sequence, {repeat_until_any_fail, 2}], + assert_unique(cets_seq_no_log_cases())} + ]. + +cases() -> + [ + status_available_nodes, + status_available_nodes_do_not_contain_nodes_with_stopped_disco, + status_unavailable_nodes, + status_unavailable_nodes_is_subset_of_discovery_nodes, + status_joined_nodes, + status_discovery_works, + status_discovered_nodes, + status_remote_nodes_without_disco, + status_remote_nodes_with_unknown_tables, + status_remote_nodes_with_missing_nodes, + status_conflict_nodes, + format_data_does_not_return_table_duplicates + ]. + +seq_cases() -> + []. + +cets_seq_no_log_cases() -> + []. + +init_per_suite(Config) -> + cets_test_setup:init_cleanup_table(), + cets_test_peer:start([ct2], Config). + +end_per_suite(Config) -> + cets_test_setup:remove_cleanup_table(), + cets_test_peer:stop(Config), + Config. + +init_per_group(Group, Config) when Group == cets_seq_no_log; Group == cets_no_log -> + [ok = logger:set_module_level(M, none) || M <- log_modules()], + Config; +init_per_group(_Group, Config) -> + Config. 
+ +end_per_group(Group, Config) when Group == cets_seq_no_log; Group == cets_no_log -> + [ok = logger:unset_module_level(M) || M <- log_modules()], + Config; +end_per_group(_Group, Config) -> + Config. + +init_per_testcase(test_multinode_auto_discovery = Name, Config) -> + ct:make_priv_dir(), + init_per_testcase_generic(Name, Config); +init_per_testcase(Name, Config) -> + init_per_testcase_generic(Name, Config). + +init_per_testcase_generic(Name, Config) -> + [{testcase, Name} | Config]. + +end_per_testcase(_, _Config) -> + cets_test_setup:wait_for_cleanup(), + ok. + +%% Modules that use a multiline LOG_ macro +log_modules() -> + [cets, cets_call, cets_long, cets_join, cets_discovery]. + +status_available_nodes(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + F = fun(State) -> + {{ok, []}, State} + end, + DiscoName = disco_name(Config), + start_disco(Node1, #{name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F}), + start_disco(Node2, #{name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F}), + ?assertMatch(#{available_nodes := [Node1, Node2]}, cets_status:status(DiscoName)). + +status_available_nodes_do_not_contain_nodes_with_stopped_disco(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + F = fun(State) -> + {{ok, [Node1, Node2]}, State} + end, + DiscoName = disco_name(Config), + start_disco(Node1, #{name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F}), + %% Disco not running + ?assertMatch(#{available_nodes := [Node1]}, cets_status:status(DiscoName)). 
+ +status_unavailable_nodes(Config) -> + Node1 = node(), + F = fun(State) -> + {{ok, [Node1, 'badnode@localhost']}, State} + end, + DiscoName = disco_name(Config), + Disco = start_disco(Node1, #{ + name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F + }), + %% Disco needs at least one table to start calling get_nodes function + Tab = make_name(Config), + {ok, _} = start(Node1, Tab), + cets_discovery:add_table(Disco, Tab), + ok = wait_for_ready(DiscoName, 5000), + ?assertMatch(#{unavailable_nodes := ['badnode@localhost']}, cets_status:status(DiscoName)). + +status_unavailable_nodes_is_subset_of_discovery_nodes(Config) -> + Node1 = node(), + Self = self(), + GetFn1 = fun(State) -> {{ok, [Node1, 'badnode@localhost']}, State} end, + GetFn2 = fun(State) -> + Self ! get_fn2_called, + {{ok, [Node1]}, State} + end, + %% Setup meck + BackendModule = make_name(Config, disco_backend), + meck:new(BackendModule, [non_strict]), + meck:expect(BackendModule, init, fun(_Opts) -> undefined end), + meck:expect(BackendModule, get_nodes, GetFn1), + DiscoName = disco_name(Config), + Disco = start_disco(Node1, #{ + name => DiscoName, backend_module => BackendModule + }), + %% Disco needs at least one table to start calling get_nodes function + Tab = make_name(Config), + {ok, _} = start(Node1, Tab), + cets_discovery:add_table(Disco, Tab), + ok = wait_for_ready(DiscoName, 5000), + ?assertMatch(#{unavailable_nodes := ['badnode@localhost']}, cets_status:status(DiscoName)), + %% Remove badnode from disco + meck:expect(BackendModule, get_nodes, GetFn2), + %% Force check. + Disco ! check, + receive_message(get_fn2_called), + %% The unavailable_nodes list is updated + CondF = fun() -> maps:get(unavailable_nodes, cets_status:status(DiscoName)) end, + cets_test_wait:wait_until(CondF, []). 
+ +status_joined_nodes(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + F = fun(State) -> + {{ok, [Node1, Node2]}, State} + end, + DiscoName = disco_name(Config), + DiscoOpts = #{ + name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F + }, + Disco1 = start_disco(Node1, DiscoOpts), + Disco2 = start_disco(Node2, DiscoOpts), + Tab = make_name(Config), + {ok, _} = start(Node1, Tab), + {ok, _} = start(Node2, Tab), + %% Add table using pids (i.e. no need to do RPCs here) + cets_discovery:add_table(Disco1, Tab), + cets_discovery:add_table(Disco2, Tab), + ok = wait_for_ready(DiscoName, 5000), + cets_test_wait:wait_until(fun() -> maps:get(joined_nodes, cets_status:status(DiscoName)) end, [ + Node1, Node2 + ]). + +status_discovery_works(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + F = fun(State) -> + {{ok, [Node1, Node2]}, State} + end, + DiscoName = disco_name(Config), + DiscoOpts = #{ + name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F + }, + Disco1 = start_disco(Node1, DiscoOpts), + Disco2 = start_disco(Node2, DiscoOpts), + Tab = make_name(Config), + {ok, _} = start(Node1, Tab), + {ok, _} = start(Node2, Tab), + %% Add table using pids (i.e. no need to do RPCs here) + cets_discovery:add_table(Disco1, Tab), + cets_discovery:add_table(Disco2, Tab), + ok = wait_for_ready(DiscoName, 5000), + ?assertMatch(#{discovery_works := true}, cets_status:status(DiscoName)). + +status_discovered_nodes(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + F = fun(State) -> + {{ok, [Node1, Node2]}, State} + end, + DiscoName = disco_name(Config), + Disco = start_disco(Node1, #{ + name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F + }), + Tab = make_name(Config), + {ok, _} = start(Node1, Tab), + {ok, _} = start(Node2, Tab), + %% Add table using pids (i.e. 
no need to do RPCs here) + cets_discovery:add_table(Disco, Tab), + ok = wait_for_ready(DiscoName, 5000), + ?assertMatch(#{discovered_nodes := [Node1, Node2]}, cets_status:status(DiscoName)). + +status_remote_nodes_without_disco(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + F = fun(State) -> + {{ok, [Node1, Node2]}, State} + end, + DiscoName = disco_name(Config), + Disco = start_disco(Node1, #{ + name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F + }), + Tab = make_name(Config), + {ok, _} = start(Node1, Tab), + cets_discovery:add_table(Disco, Tab), + ok = wait_for_ready(DiscoName, 5000), + ?assertMatch(#{remote_nodes_without_disco := [Node2]}, cets_status:status(DiscoName)). + +status_remote_nodes_with_unknown_tables(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + F = fun(State) -> + {{ok, [Node1, Node2]}, State} + end, + DiscoName = disco_name(Config), + DiscoOpts = #{ + name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F + }, + Disco1 = start_disco(Node1, DiscoOpts), + Disco2 = start_disco(Node2, DiscoOpts), + Tab1 = make_name(Config, 1), + Tab2 = make_name(Config, 2), + %% Node1 does not have Tab2 + {ok, _} = start(Node1, Tab2), + {ok, _} = start(Node2, Tab1), + {ok, _} = start(Node2, Tab2), + %% Add table using pids (i.e. no need to do RPCs here) + cets_discovery:add_table(Disco1, Tab1), + cets_discovery:add_table(Disco2, Tab1), + cets_discovery:add_table(Disco2, Tab2), + ok = wait_for_ready(DiscoName, 5000), + cets_test_wait:wait_until( + fun() -> maps:get(remote_nodes_with_unknown_tables, cets_status:status(DiscoName)) end, [ + Node2 + ] + ), + cets_test_wait:wait_until( + fun() -> maps:get(remote_unknown_tables, cets_status:status(DiscoName)) end, [ + Tab2 + ] + ). 
+ +status_remote_nodes_with_missing_nodes(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + F = fun(State) -> + {{ok, [Node1, Node2]}, State} + end, + DiscoName = disco_name(Config), + DiscoOpts = #{ + name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F + }, + Disco1 = start_disco(Node1, DiscoOpts), + Disco2 = start_disco(Node2, DiscoOpts), + Tab1 = make_name(Config, 1), + Tab2 = make_name(Config, 2), + %% Node2 does not have Tab2 + {ok, _} = start(Node1, Tab1), + {ok, _} = start(Node1, Tab2), + {ok, _} = start(Node2, Tab1), + cets_discovery:add_table(Disco1, Tab1), + cets_discovery:add_table(Disco1, Tab2), + cets_discovery:add_table(Disco2, Tab1), + ok = wait_for_ready(DiscoName, 5000), + cets_test_wait:wait_until( + fun() -> maps:get(remote_nodes_with_missing_tables, cets_status:status(DiscoName)) end, [ + Node2 + ] + ), + cets_test_wait:wait_until( + fun() -> maps:get(remote_missing_tables, cets_status:status(DiscoName)) end, [ + Tab2 + ] + ). 
+ +status_conflict_nodes(Config) -> + Node1 = node(), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + F = fun(State) -> + {{ok, [Node1, Node2]}, State} + end, + DiscoName = disco_name(Config), + DiscoOpts = #{ + name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F + }, + Disco1 = start_disco(Node1, DiscoOpts), + Disco2 = start_disco(Node2, DiscoOpts), + Tab1 = make_name(Config, 1), + Tab2 = make_name(Config, 2), + {ok, _} = start(Node1, Tab1), + {ok, _} = start(Node1, Tab2), + {ok, _} = start(Node2, Tab1), + {ok, Pid22} = start(Node2, Tab2), + cets_discovery:add_table(Disco1, Tab1), + cets_discovery:add_table(Disco1, Tab2), + cets_discovery:add_table(Disco2, Tab1), + cets_discovery:add_table(Disco2, Tab2), + + ok = wait_for_ready(DiscoName, 5000), + set_other_servers(Pid22, []), + cets_test_wait:wait_until( + fun() -> maps:get(conflict_nodes, cets_status:status(DiscoName)) end, [Node2] + ), + cets_test_wait:wait_until( + fun() -> maps:get(conflict_tables, cets_status:status(DiscoName)) end, [Tab2] + ). + +format_data_does_not_return_table_duplicates(Config) -> + Res = cets_status:format_data(test_data_for_duplicate_missing_table_in_status(Config)), + ?assertMatch(#{remote_unknown_tables := [], remote_nodes_with_missing_tables := []}, Res). + +%% Helpers + +%% Gathered after Helm update +%% with cets_status:gather_data(mongoose_cets_discovery). +test_data_for_duplicate_missing_table_in_status(Config) -> + %% Create atoms in non sorted order + %% maps:keys returns keys in the atom-creation order (and not sorted). + %% Also, compiler is smart and would optimize list_to_atom("literal_string"), + %% so we do a module call to disable this optimization. + _ = list_to_atom(?MODULE:return_same("cets_external_component")), + _ = list_to_atom(?MODULE:return_same("cets_bosh")), + Name = filename:join(proplists:get_value(data_dir, Config), "status_data.txt"), + {ok, [Term]} = file:consult(Name), + Term. + +return_same(X) -> + X. 
diff --git a/test/cets_SUITE_data/status_data.txt b/test/cets_status_SUITE_data/status_data.txt similarity index 100% rename from test/cets_SUITE_data/status_data.txt rename to test/cets_status_SUITE_data/status_data.txt diff --git a/test/cets_test_helper.erl b/test/cets_test_helper.erl index 34137b8..fedc8d4 100644 --- a/test/cets_test_helper.erl +++ b/test/cets_test_helper.erl @@ -7,7 +7,8 @@ ]). -export([ - set_nodedown_timestamp/3 + set_nodedown_timestamp/3, + set_other_servers/2 ]). get_disco_timestamp(Disco, MapName, NodeKey) -> @@ -25,3 +26,8 @@ set_nodedown_timestamp(Disco, Node, NewTimestamp) -> sys:replace_state(Disco, fun(#{nodedown_timestamps := Map} = State) -> State#{nodedown_timestamps := maps:put(Node, NewTimestamp, Map)} end). + +set_other_servers(Pid, Servers) -> + sys:replace_state(Pid, fun(#{other_servers := _} = State) -> + State#{other_servers := Servers} + end). From 9dfdb285ac1dc0f0e89526825143e8d87cf7290b Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 15:16:40 +0100 Subject: [PATCH 11/30] Move more cases into cets_disco_SUITE --- test/cets_SUITE.erl | 177 +---------------------------------- test/cets_disco_SUITE.erl | 190 +++++++++++++++++++++++++++++++++++++- 2 files changed, 186 insertions(+), 181 deletions(-) diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index 33961d0..8081932 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -34,7 +34,6 @@ start/2, start_local/1, start_local/2, - start_disco/2, start_simple_disco/0, make_name/1, make_name/2, @@ -44,11 +43,7 @@ given_two_joined_tables/2, given_3_servers/1, given_3_servers/2, - given_n_servers/3 -]). - --import(cets_test_setup, [ - make_signalling_process/0, + given_n_servers/3, make_process/0 ]). @@ -64,7 +59,6 @@ -import(cets_test_receive, [ receive_message/1, receive_message_with_arg/1, - receive_all_logs/1, assert_nothing_is_logged/2 ]). 
@@ -152,9 +146,6 @@ cases() -> test_multinode, test_multinode_remote_insert, node_list_is_correct, - disco_wait_for_get_nodes_works, - disco_wait_for_get_nodes_blocks_and_returns, - disco_wait_for_get_nodes_when_get_nodes_needs_to_be_retried, get_nodes_request, test_locally, handle_down_is_called, @@ -203,7 +194,6 @@ cases() -> pinfo_returns_undefined, cets_ping_non_existing_node, cets_ping_net_family, - unexpected_nodedown_is_ignored_by_disco, ignore_send_dump_received_when_unpaused, ignore_send_dump_received_when_paused_with_another_pause_ref, pause_on_remote_node_returns_if_monitor_process_dies @@ -229,11 +219,8 @@ seq_cases() -> [ insert_returns_when_netsplit, inserts_after_netsplit_reconnects, - disco_connects_to_unconnected_node, joining_not_fully_connected_node_is_not_allowed, joining_not_fully_connected_node_is_not_allowed2, - %% Cannot be run in parallel with other tests because checks all logging messages. - logging_when_failing_join_with_disco, cets_ping_all_returns_when_ping_crashes, join_interrupted_when_ping_crashes, ping_pairs_returns_pongs, @@ -1474,81 +1461,6 @@ node_list_is_correct(Config) -> [Node1, Node2, Node3] = other_nodes(Node4, Tab), ok. -disco_wait_for_get_nodes_works(_Config) -> - F = fun(State) -> {{ok, []}, State} end, - {ok, Disco} = cets_discovery:start_link(#{ - backend_module => cets_discovery_fun, get_nodes_fn => F - }), - ok = cets_discovery:wait_for_get_nodes(Disco, 5000). - -disco_wait_for_get_nodes_blocks_and_returns(Config) -> - Tab = make_name(Config, 1), - {ok, _Pid} = start_local(Tab, #{}), - SignallingPid = make_signalling_process(), - F = fun(State) -> - wait_for_down(SignallingPid), - {{ok, []}, State} - end, - {ok, Disco} = cets_discovery:start_link(#{ - backend_module => cets_discovery_fun, get_nodes_fn => F - }), - cets_discovery:add_table(Disco, Tab), - %% Enter into a blocking get_nodes function - Disco ! 
check, - %% Do it async, because it would block is - WaitPid = spawn_link(fun() -> ok = cets_discovery:wait_for_get_nodes(Disco, 5000) end), - Cond = fun() -> - length(maps:get(pending_wait_for_get_nodes, cets_discovery:system_info(Disco))) - end, - cets_test_wait:wait_until(Cond, 1), - %% Unblock get_nodes call - SignallingPid ! stop, - %% wait_for_get_nodes returns - wait_for_down(WaitPid), - ok. - -%% Check that wait_for_get_nodes waits in case get_nodes should be retried -disco_wait_for_get_nodes_when_get_nodes_needs_to_be_retried(Config) -> - Me = self(), - Tab = make_name(Config, 1), - {ok, _Pid} = start_local(Tab, #{}), - SignallingPid1 = make_signalling_process(), - SignallingPid2 = make_signalling_process(), - F = fun - (State = #{step := 1}) -> - wait_for_down(SignallingPid1), - {{ok, []}, State#{step => 2}}; - (State = #{step := 2}) -> - Me ! entered_get_nodes2, - wait_for_down(SignallingPid2), - {{ok, []}, State#{step => 2}} - end, - {ok, Disco} = cets_discovery:start_link(#{ - backend_module => cets_discovery_fun, get_nodes_fn => F, step => 1 - }), - cets_discovery:add_table(Disco, Tab), - %% Enter into a blocking get_nodes function - Disco ! check, - %% Do it async, because it would block is - WaitPid = spawn_link(fun() -> ok = cets_discovery:wait_for_get_nodes(Disco, 5000) end), - Cond = fun() -> - length(maps:get(pending_wait_for_get_nodes, cets_discovery:system_info(Disco))) - end, - cets_test_wait:wait_until(Cond, 1), - %% Set should_retry_get_nodes - Disco ! check, - %% Ensure check message is received - cets_discovery:system_info(Disco), - %% Unblock first get_nodes call - SignallingPid1 ! stop, - receive_message(entered_get_nodes2), - %% Still waiting for get_nodes being retried - true = erlang:is_process_alive(WaitPid), - %% It returns finally after second get_nodes call - SignallingPid2 ! stop, - wait_for_down(WaitPid), - ok. 
- get_nodes_request(Config) -> #{ct2 := Node2, ct3 := Node3, ct4 := Node4} = proplists:get_value(nodes, Config), Tab = make_name(Config), @@ -2011,25 +1923,6 @@ inserts_after_netsplit_reconnects(Config) -> [{1, v2}] = dump(Node1, Tab), [{1, v3}] = dump(Peer5, Tab). -disco_connects_to_unconnected_node(Config) -> - Node1 = node(), - #{ct5 := Peer5} = proplists:get_value(peers, Config), - #{ct5 := Node5} = proplists:get_value(nodes, Config), - ok = net_kernel:monitor_nodes(true), - disconnect_node(Peer5, Node1), - receive_message({nodedown, Node5}), - Tab = make_name(Config), - {ok, _} = start(Node1, Tab), - {ok, _} = start(Peer5, Tab), - F = fun(State) -> - {{ok, [Node1, Node5]}, State} - end, - {ok, Disco} = cets_discovery:start_link(#{ - backend_module => cets_discovery_fun, get_nodes_fn => F - }), - cets_discovery:add_table(Disco, Tab), - ok = wait_for_ready(Disco, 5000). - %% Joins from a bad (not fully connected) node %% Join process should check if nodes could contact each other before allowing to join joining_not_fully_connected_node_is_not_allowed(Config) -> @@ -2090,60 +1983,6 @@ joining_not_fully_connected_node_is_not_allowed2(Config) -> end, [] = cets:other_pids(Pid5). -logging_when_failing_join_with_disco(Config) -> - %% Simulate cets:other_pids/1 failing with reason: - %% {{nodedown,'mongooseim@mongooseim-1.mongooseim.default.svc.cluster.local'}, - %% {gen_server,call,[<30887.438.0>,other_servers,infinity]}} - %% We use peer module to still have a connection after a disconnect from the remote node. 
- logger_debug_h:start(#{id => ?FUNCTION_NAME}), - Node1 = node(), - #{ct2 := Peer2} = proplists:get_value(peers, Config), - #{ct2 := Node2} = proplists:get_value(nodes, Config), - Tab = make_name(Config), - {ok, _Pid1} = start(Node1, Tab), - {ok, Pid2} = start(Peer2, Tab), - meck:new(cets, [passthrough]), - meck:expect(cets, other_pids, fun - (Server) when Server =:= Pid2 -> - block_node(Node2, Peer2), - wait_for_down(Pid2), - meck:passthrough([Server]); - (Server) -> - meck:passthrough([Server]) - end), - F = fun(State) -> - {{ok, [Node1, Node2]}, State} - end, - DiscoName = disco_name(Config), - Disco = start_disco(Node1, #{ - name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F - }), - try - cets_discovery:add_table(Disco, Tab), - timer:sleep(100), - Logs = receive_all_logs(?FUNCTION_NAME), - Reason = {{nodedown, Node2}, {gen_server, call, [Pid2, other_servers, infinity]}}, - MatchedLogs = [ - Log - || #{ - level := error, - msg := - {report, #{ - what := task_failed, - reason := Reason2 - }} - } = Log <- Logs, - Reason =:= Reason2 - ], - %% Only one message is logged - ?assertMatch([_], MatchedLogs, Logs) - after - meck:unload(), - reconnect_node(Node2, Peer2), - cets:stop(Pid2) - end, - ok. - cets_ping_all_returns_when_ping_crashes(Config) -> #{pid1 := Pid1, pid2 := Pid2} = given_two_joined_tables(Config), meck:new(cets, [passthrough]), @@ -2223,20 +2062,6 @@ cets_ping_net_family(_Config) -> inet6 = cets_ping:net_family({ok, [["inet6"]]}), inet6 = cets_ping:net_family({ok, [["inet6_tls"]]}). -unexpected_nodedown_is_ignored_by_disco(Config) -> - %% Theoretically, should not happen - %% Still, check that we do not crash in this case - DiscoName = disco_name(Config), - F = fun(State) -> {{ok, []}, State} end, - Disco = start_disco(node(), #{ - name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F - }), - #{start_time := StartTime} = cets_discovery:system_info(Disco), - Disco ! 
{nodedown, 'cets@badnode'}, - %% Check that we are still running - #{start_time := StartTime} = cets_discovery:system_info(Disco), - ok. - ping_pairs_returns_pongs(Config) -> #{ct2 := Node2, ct3 := Node3} = proplists:get_value(nodes, Config), Me = node(), diff --git a/test/cets_disco_SUITE.erl b/test/cets_disco_SUITE.erl index 583efee..95a1232 100644 --- a/test/cets_disco_SUITE.erl +++ b/test/cets_disco_SUITE.erl @@ -9,12 +9,14 @@ start/2, start_local/1, start_local/2, + start_disco/2, make_name/1, make_name/2, disco_name/1 ]). -import(cets_test_wait, [ + wait_for_down/1, wait_for_ready/2, wait_till_test_stage/2 ]). @@ -22,7 +24,8 @@ -import(cets_test_setup, [ setup_two_nodes_and_discovery/1, setup_two_nodes_and_discovery/2, - simulate_disco_restart/1 + simulate_disco_restart/1, + make_signalling_process/0 ]). -import(cets_test_wait, [ @@ -32,14 +35,21 @@ -import(cets_test_receive, [ receive_message/1, - flush_message/1 + flush_message/1, + receive_all_logs/1 ]). -import(cets_test_peer, [ + block_node/2, + reconnect_node/2, disconnect_node/2, disconnect_node_by_name/2 ]). +-import(cets_test_rpc, [ + rpc/4 +]). + -import(cets_test_helper, [assert_unique/1]). -import(cets_test_rpc, [ @@ -68,6 +78,9 @@ groups() -> cases() -> [ + disco_wait_for_get_nodes_works, + disco_wait_for_get_nodes_blocks_and_returns, + disco_wait_for_get_nodes_when_get_nodes_needs_to_be_retried, test_multinode_auto_discovery, test_disco_add_table, test_disco_delete_table, @@ -82,7 +95,8 @@ cases() -> disco_uses_regular_retry_interval_in_the_regular_phase, disco_uses_regular_retry_interval_in_the_regular_phase_after_node_down, disco_uses_regular_retry_interval_in_the_regular_phase_after_expired_node_down, - disco_handles_node_up_and_down + disco_handles_node_up_and_down, + unexpected_nodedown_is_ignored_by_disco ]. 
seq_cases() -> @@ -96,7 +110,10 @@ seq_cases() -> disco_nodeup_timestamp_is_updated_after_node_reconnects, disco_node_start_timestamp_is_updated_after_node_restarts, disco_late_pang_result_arrives_after_node_went_up, - disco_nodeup_triggers_check_and_get_nodes + disco_nodeup_triggers_check_and_get_nodes, + %% Cannot be run in parallel with other tests because checks all logging messages. + logging_when_failing_join_with_disco, + disco_connects_to_unconnected_node ]. cets_seq_no_log_cases() -> @@ -110,7 +127,7 @@ cets_seq_no_log_cases() -> init_per_suite(Config) -> cets_test_setup:init_cleanup_table(), - cets_test_peer:start([ct2], Config). + cets_test_peer:start([ct2, ct5], Config). end_per_suite(Config) -> cets_test_setup:remove_cleanup_table(), @@ -146,6 +163,81 @@ end_per_testcase(_, _Config) -> log_modules() -> [cets, cets_call, cets_long, cets_join, cets_discovery]. +disco_wait_for_get_nodes_works(_Config) -> + F = fun(State) -> {{ok, []}, State} end, + {ok, Disco} = cets_discovery:start_link(#{ + backend_module => cets_discovery_fun, get_nodes_fn => F + }), + ok = cets_discovery:wait_for_get_nodes(Disco, 5000). + +disco_wait_for_get_nodes_blocks_and_returns(Config) -> + Tab = make_name(Config, 1), + {ok, _Pid} = start_local(Tab, #{}), + SignallingPid = make_signalling_process(), + F = fun(State) -> + wait_for_down(SignallingPid), + {{ok, []}, State} + end, + {ok, Disco} = cets_discovery:start_link(#{ + backend_module => cets_discovery_fun, get_nodes_fn => F + }), + cets_discovery:add_table(Disco, Tab), + %% Enter into a blocking get_nodes function + Disco ! check, + %% Do it async, because it would block is + WaitPid = spawn_link(fun() -> ok = cets_discovery:wait_for_get_nodes(Disco, 5000) end), + Cond = fun() -> + length(maps:get(pending_wait_for_get_nodes, cets_discovery:system_info(Disco))) + end, + cets_test_wait:wait_until(Cond, 1), + %% Unblock get_nodes call + SignallingPid ! stop, + %% wait_for_get_nodes returns + wait_for_down(WaitPid), + ok. 
+ +%% Check that wait_for_get_nodes waits in case get_nodes should be retried +disco_wait_for_get_nodes_when_get_nodes_needs_to_be_retried(Config) -> + Me = self(), + Tab = make_name(Config, 1), + {ok, _Pid} = start_local(Tab, #{}), + SignallingPid1 = make_signalling_process(), + SignallingPid2 = make_signalling_process(), + F = fun + (State = #{step := 1}) -> + wait_for_down(SignallingPid1), + {{ok, []}, State#{step => 2}}; + (State = #{step := 2}) -> + Me ! entered_get_nodes2, + wait_for_down(SignallingPid2), + {{ok, []}, State#{step => 2}} + end, + {ok, Disco} = cets_discovery:start_link(#{ + backend_module => cets_discovery_fun, get_nodes_fn => F, step => 1 + }), + cets_discovery:add_table(Disco, Tab), + %% Enter into a blocking get_nodes function + Disco ! check, + %% Do it async, because it would block is + WaitPid = spawn_link(fun() -> ok = cets_discovery:wait_for_get_nodes(Disco, 5000) end), + Cond = fun() -> + length(maps:get(pending_wait_for_get_nodes, cets_discovery:system_info(Disco))) + end, + cets_test_wait:wait_until(Cond, 1), + %% Set should_retry_get_nodes + Disco ! check, + %% Ensure check message is received + cets_discovery:system_info(Disco), + %% Unblock first get_nodes call + SignallingPid1 ! stop, + receive_message(entered_get_nodes2), + %% Still waiting for get_nodes being retried + true = erlang:is_process_alive(WaitPid), + %% It returns finally after second get_nodes call + SignallingPid2 ! stop, + wait_for_down(WaitPid), + ok. + test_multinode_auto_discovery(Config) -> Node1 = node(), #{ct2 := Node2} = proplists:get_value(nodes, Config), @@ -423,6 +515,20 @@ disco_handles_node_up_and_down(Config) -> %% Check that wait_for_ready still works ok = wait_for_ready(Disco, 5000). 
+unexpected_nodedown_is_ignored_by_disco(Config) -> + %% Theoretically, should not happen + %% Still, check that we do not crash in this case + DiscoName = disco_name(Config), + F = fun(State) -> {{ok, []}, State} end, + Disco = start_disco(node(), #{ + name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F + }), + #{start_time := StartTime} = cets_discovery:system_info(Disco), + Disco ! {nodedown, 'cets@badnode'}, + %% Check that we are still running + #{start_time := StartTime} = cets_discovery:system_info(Disco), + ok. + disco_logs_nodeup(Config) -> logger_debug_h:start(#{id => ?FUNCTION_NAME}), #{disco := Disco, node2 := Node2} = setup_two_nodes_and_discovery(Config), @@ -574,3 +680,77 @@ disco_nodeup_triggers_check_and_get_nodes(Config) -> flush_message(get_nodes), Disco ! {nodeup, Node2}, receive_message(get_nodes). + +disco_connects_to_unconnected_node(Config) -> + Node1 = node(), + #{ct5 := Peer5} = proplists:get_value(peers, Config), + #{ct5 := Node5} = proplists:get_value(nodes, Config), + disconnect_node(Peer5, Node1), + cets_test_wait:wait_until( + fun() -> lists:member(node(), rpc(Peer5, erlang, nodes, [])) end, false + ), + Tab = make_name(Config), + {ok, _} = start(Node1, Tab), + {ok, _} = start(Peer5, Tab), + F = fun(State) -> + {{ok, [Node1, Node5]}, State} + end, + {ok, Disco} = cets_discovery:start_link(#{ + backend_module => cets_discovery_fun, get_nodes_fn => F + }), + cets_discovery:add_table(Disco, Tab), + ok = wait_for_ready(Disco, 5000). + +logging_when_failing_join_with_disco(Config) -> + %% Simulate cets:other_pids/1 failing with reason: + %% {{nodedown,'mongooseim@mongooseim-1.mongooseim.default.svc.cluster.local'}, + %% {gen_server,call,[<30887.438.0>,other_servers,infinity]}} + %% We use peer module to still have a connection after a disconnect from the remote node. 
+ logger_debug_h:start(#{id => ?FUNCTION_NAME}), + Node1 = node(), + #{ct2 := Peer2} = proplists:get_value(peers, Config), + #{ct2 := Node2} = proplists:get_value(nodes, Config), + Tab = make_name(Config), + {ok, _Pid1} = start(Node1, Tab), + {ok, Pid2} = start(Peer2, Tab), + meck:new(cets, [passthrough]), + meck:expect(cets, other_pids, fun + (Server) when Server =:= Pid2 -> + block_node(Node2, Peer2), + wait_for_down(Pid2), + meck:passthrough([Server]); + (Server) -> + meck:passthrough([Server]) + end), + F = fun(State) -> + {{ok, [Node1, Node2]}, State} + end, + DiscoName = disco_name(Config), + Disco = start_disco(Node1, #{ + name => DiscoName, backend_module => cets_discovery_fun, get_nodes_fn => F + }), + try + cets_discovery:add_table(Disco, Tab), + timer:sleep(100), + Logs = receive_all_logs(?FUNCTION_NAME), + Reason = {{nodedown, Node2}, {gen_server, call, [Pid2, other_servers, infinity]}}, + MatchedLogs = [ + Log + || #{ + level := error, + msg := + {report, #{ + what := task_failed, + reason := Reason2 + }} + } = Log <- Logs, + Reason =:= Reason2 + ], + %% Only one message is logged + ?assertMatch([_], MatchedLogs, Logs) + after + meck:unload(), + reconnect_node(Node2, Peer2), + cets:stop(Pid2) + end, + ok. From 9c9ff72f393e29467d1881c933a9216de1212322 Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 15:22:28 +0100 Subject: [PATCH 12/30] Move receive_all_logs/ssert_nothing_is_logged into cets_test_log --- test/cets_SUITE.erl | 5 ++--- test/cets_disco_SUITE.erl | 5 ++--- test/cets_test_log.erl | 23 ++++++++++++++++++++++- test/cets_test_receive.erl | 23 +---------------------- 4 files changed, 27 insertions(+), 29 deletions(-) diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index 8081932..47ea6dd 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -58,8 +58,7 @@ -import(cets_test_receive, [ receive_message/1, - receive_message_with_arg/1, - assert_nothing_is_logged/2 + receive_message_with_arg/1 ]). 
-import(cets_test_helper, [ @@ -1053,7 +1052,7 @@ join_done_already_while_waiting_for_lock_so_do_nothing(Config) -> %% Ensure there is nothing logged, we use log_ref to ignore logs from other tests. %% The counter example for no logging is %% the logs_are_printed_when_join_fails_because_servers_overlap testcase. - assert_nothing_is_logged(?FUNCTION_NAME, LogRef). + cets_test_log:assert_nothing_is_logged(?FUNCTION_NAME, LogRef). pause_owner_crashed_is_logged(Config) -> ct:timetrap({seconds, 6}), diff --git a/test/cets_disco_SUITE.erl b/test/cets_disco_SUITE.erl index 95a1232..357ef48 100644 --- a/test/cets_disco_SUITE.erl +++ b/test/cets_disco_SUITE.erl @@ -35,8 +35,7 @@ -import(cets_test_receive, [ receive_message/1, - flush_message/1, - receive_all_logs/1 + flush_message/1 ]). -import(cets_test_peer, [ @@ -732,7 +731,7 @@ logging_when_failing_join_with_disco(Config) -> try cets_discovery:add_table(Disco, Tab), timer:sleep(100), - Logs = receive_all_logs(?FUNCTION_NAME), + Logs = cets_test_log:receive_all_logs(?FUNCTION_NAME), Reason = {{nodedown, Node2}, {gen_server, call, [Pid2, other_servers, infinity]}}, MatchedLogs = [ Log diff --git a/test/cets_test_log.erl b/test/cets_test_log.erl index fe5277b..8369a13 100644 --- a/test/cets_test_log.erl +++ b/test/cets_test_log.erl @@ -2,7 +2,9 @@ -module(cets_test_log). -export([ receive_all_logs_with_log_ref/2, - receive_all_logs_from_pid/2 + receive_all_logs_from_pid/2, + receive_all_logs/1, + assert_nothing_is_logged/2 ]). -include_lib("kernel/include/logger.hrl"). @@ -61,3 +63,22 @@ ensure_logger_is_working(LogHandlerId, LogRef) -> after 5000 -> ct:fail({timeout, logger_is_broken}) end. + +receive_all_logs(Id) -> + receive + {log, Id, Log} -> + [Log | receive_all_logs(Id)] + after 100 -> + [] + end. 
+ +assert_nothing_is_logged(LogHandlerId, LogRef) -> + receive + {log, LogHandlerId, #{ + level := Level, + msg := {report, #{log_ref := LogRef}} + }} when Level =:= warning; Level =:= error -> + ct:fail(got_logging_but_should_not) + after 0 -> + ok + end. diff --git a/test/cets_test_receive.erl b/test/cets_test_receive.erl index 6fd2116..c84e869 100644 --- a/test/cets_test_receive.erl +++ b/test/cets_test_receive.erl @@ -2,9 +2,7 @@ -export([ receive_message/1, receive_message_with_arg/1, - flush_message/1, - receive_all_logs/1, - assert_nothing_is_logged/2 + flush_message/1 ]). receive_message(M) -> @@ -26,22 +24,3 @@ flush_message(M) -> after 0 -> ok end. - -receive_all_logs(Id) -> - receive - {log, Id, Log} -> - [Log | receive_all_logs(Id)] - after 100 -> - [] - end. - -assert_nothing_is_logged(LogHandlerId, LogRef) -> - receive - {log, LogHandlerId, #{ - level := Level, - msg := {report, #{log_ref := LogRef}} - }} when Level =:= warning; Level =:= error -> - ct:fail(got_logging_but_should_not) - after 0 -> - ok - end. 
From 3f9e50a2e22fd9a9037b697e8b26340c9cdce09b Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 15:29:00 +0100 Subject: [PATCH 13/30] Move unknown_message_is_ignored_in_disco_process/code_change_returns_ok_for_disco into cets_disco_SUITE --- test/cets_SUITE.erl | 15 --------------- test/cets_disco_SUITE.erl | 16 +++++++++++++++- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index 47ea6dd..90816cf 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -34,11 +34,9 @@ start/2, start_local/1, start_local/2, - start_simple_disco/0, make_name/1, make_name/2, lock_name/1, - disco_name/1, given_two_joined_tables/1, given_two_joined_tables/2, given_3_servers/1, @@ -180,10 +178,8 @@ cases() -> unknown_message_is_ignored_in_ack_process, unknown_cast_message_is_ignored_in_ack_process, unknown_call_returns_error_from_ack_process, - unknown_message_is_ignored_in_disco_process, code_change_returns_ok, code_change_returns_ok_for_ack, - code_change_returns_ok_for_disco, run_spawn_forwards_errors, run_tracked_failed, run_tracked_logged, @@ -1759,11 +1755,6 @@ unknown_message_is_ignored_in_ack_process(Config) -> AckPid ! oops, still_works(Pid). -unknown_message_is_ignored_in_disco_process(_Config) -> - Pid = start_simple_disco(), - Pid ! oops, - #{} = sys:get_state(Pid). - unknown_cast_message_is_ignored_in_ack_process(Config) -> {ok, Pid} = start_local(make_name(Config)), #{ack_pid := AckPid} = cets:info(Pid), @@ -1789,12 +1780,6 @@ code_change_returns_ok_for_ack(Config) -> ok = sys:change_code(AckPid, cets_ack, v2, []), sys:resume(AckPid). -code_change_returns_ok_for_disco(_Config) -> - Pid = start_simple_disco(), - sys:suspend(Pid), - ok = sys:change_code(Pid, cets_ack, v2, []), - sys:resume(Pid). 
- run_spawn_forwards_errors(_Config) -> ?assertException( error, diff --git a/test/cets_disco_SUITE.erl b/test/cets_disco_SUITE.erl index 357ef48..08b410d 100644 --- a/test/cets_disco_SUITE.erl +++ b/test/cets_disco_SUITE.erl @@ -10,6 +10,7 @@ start_local/1, start_local/2, start_disco/2, + start_simple_disco/0, make_name/1, make_name/2, disco_name/1 @@ -95,7 +96,9 @@ cases() -> disco_uses_regular_retry_interval_in_the_regular_phase_after_node_down, disco_uses_regular_retry_interval_in_the_regular_phase_after_expired_node_down, disco_handles_node_up_and_down, - unexpected_nodedown_is_ignored_by_disco + unexpected_nodedown_is_ignored_by_disco, + unknown_message_is_ignored_in_disco_process, + code_change_returns_ok_for_disco ]. seq_cases() -> @@ -753,3 +756,14 @@ logging_when_failing_join_with_disco(Config) -> cets:stop(Pid2) end, ok. + +unknown_message_is_ignored_in_disco_process(_Config) -> + Pid = start_simple_disco(), + Pid ! oops, + #{} = sys:get_state(Pid). + +code_change_returns_ok_for_disco(_Config) -> + Pid = start_simple_disco(), + sys:suspend(Pid), + ok = sys:change_code(Pid, cets_ack, v2, []), + sys:resume(Pid). From 79bea515c51730ca101eabf37f461b0597b3d626 Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 15:33:04 +0100 Subject: [PATCH 14/30] Cleanup cets_status_SUITE --- test/cets_status_SUITE.erl | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/test/cets_status_SUITE.erl b/test/cets_status_SUITE.erl index fd6ccd5..a6bb648 100644 --- a/test/cets_status_SUITE.erl +++ b/test/cets_status_SUITE.erl @@ -53,21 +53,12 @@ all() -> [ {group, cets} - % {group, cets_seq}, - % {group, cets_seq_no_log} ]. groups() -> %% Cases should have unique names, because we name CETS servers based on case names [ - {cets, [parallel, {repeat_until_any_fail, 3}], assert_unique(cases())}, - %% These tests actually simulate a netsplit on the distribution level. 
- %% Though, global's prevent_overlapping_partitions option starts kicking - %% all nodes from the cluster, so we have to be careful not to break other cases. - %% Setting prevent_overlapping_partitions=false on ct5 helps. - {cets_seq, [sequence, {repeat_until_any_fail, 2}], assert_unique(seq_cases())}, - {cets_seq_no_log, [sequence, {repeat_until_any_fail, 2}], - assert_unique(cets_seq_no_log_cases())} + {cets, [parallel, {repeat_until_any_fail, 3}], assert_unique(cases())} ]. cases() -> @@ -86,12 +77,6 @@ cases() -> format_data_does_not_return_table_duplicates ]. -seq_cases() -> - []. - -cets_seq_no_log_cases() -> - []. - init_per_suite(Config) -> cets_test_setup:init_cleanup_table(), cets_test_peer:start([ct2], Config). From 9f4db45a165a7cb44917656404b255bf0d46bb10 Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 16:18:23 +0100 Subject: [PATCH 15/30] Use cets_test_peer:start/2 everywhere --- test/cets_SUITE.erl | 9 ++------- test/cets_dist_blocker_SUITE.erl | 21 ++++++++------------- test/cets_test_peer.erl | 7 +++---- 3 files changed, 13 insertions(+), 24 deletions(-) diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index 90816cf..518af65 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -237,16 +237,11 @@ cets_seq_no_log_cases() -> init_per_suite(Config) -> cets_test_setup:init_cleanup_table(), - Names = [ct2, ct3, ct4, ct5, ct6, ct7], - {Nodes, Peers} = lists:unzip([cets_test_peer:start_node(N) || N <- Names]), - [ - {nodes, maps:from_list(lists:zip(Names, Nodes))}, - {peers, maps:from_list(lists:zip(Names, Peers))} - | Config - ]. + cets_test_peer:start([ct2, ct3, ct4, ct5, ct6, ct7], Config). end_per_suite(Config) -> cets_test_setup:remove_cleanup_table(), + cets_test_peer:stop(Config), Config. 
init_per_group(Group, Config) when Group == cets_seq_no_log; Group == cets_no_log -> diff --git a/test/cets_dist_blocker_SUITE.erl b/test/cets_dist_blocker_SUITE.erl index 5353cbb..9c4f882 100644 --- a/test/cets_dist_blocker_SUITE.erl +++ b/test/cets_dist_blocker_SUITE.erl @@ -35,15 +35,10 @@ unknown_cases() -> ]. init_per_suite(Config) -> - Names = [peer_ct2], - {Nodes, Peers} = lists:unzip([cets_test_peer:start_node(N) || N <- Names]), - [ - {nodes, maps:from_list(lists:zip(Names, Nodes))}, - {peers, maps:from_list(lists:zip(Names, Peers))} - | Config - ]. + cets_test_peer:start([ct2], Config). end_per_suite(Config) -> + cets_test_peer:stop(Config), Config. init_per_group(_Group, Config) -> @@ -64,7 +59,7 @@ end_per_testcase(_, _Config) -> %% Test blocking functionality waits_for_cleaning(Config) -> - #{peer_ct2 := Node2} = proplists:get_value(nodes, Config), + #{ct2 := Node2} = proplists:get_value(nodes, Config), {ok, Blocker} = cets_dist_blocker:start_link(), cets_dist_blocker:add_cleaner(self()), connect_and_disconnect(Node2), @@ -75,7 +70,7 @@ waits_for_cleaning(Config) -> gen_server:stop(Blocker). unblocks_if_cleaner_goes_down(Config) -> - #{peer_ct2 := Node2} = proplists:get_value(nodes, Config), + #{ct2 := Node2} = proplists:get_value(nodes, Config), {ok, Blocker} = cets_dist_blocker:start_link(), Cleaner = spawn_cleaner(), connect_and_disconnect(Node2), @@ -86,7 +81,7 @@ unblocks_if_cleaner_goes_down(Config) -> gen_server:stop(Blocker). unblocks_if_cleaner_goes_down_and_second_cleaner_says_done(Config) -> - #{peer_ct2 := Node2} = proplists:get_value(nodes, Config), + #{ct2 := Node2} = proplists:get_value(nodes, Config), {ok, Blocker} = cets_dist_blocker:start_link(), %% Two cleaners cets_dist_blocker:add_cleaner(self()), @@ -101,7 +96,7 @@ unblocks_if_cleaner_goes_down_and_second_cleaner_says_done(Config) -> gen_server:stop(Blocker). 
unblocks_if_cleaner_says_done_and_second_cleaner_goes_down(Config) -> - #{peer_ct2 := Node2} = proplists:get_value(nodes, Config), + #{ct2 := Node2} = proplists:get_value(nodes, Config), {ok, Blocker} = cets_dist_blocker:start_link(), %% Two cleaners cets_dist_blocker:add_cleaner(self()), @@ -117,7 +112,7 @@ unblocks_if_cleaner_says_done_and_second_cleaner_goes_down(Config) -> gen_server:stop(Blocker). blocks_if_cleaner_says_done_and_second_cleaner_does_not_ack(Config) -> - #{peer_ct2 := Node2} = proplists:get_value(nodes, Config), + #{ct2 := Node2} = proplists:get_value(nodes, Config), {ok, Blocker} = cets_dist_blocker:start_link(), %% Two cleaners cets_dist_blocker:add_cleaner(self()), @@ -131,7 +126,7 @@ blocks_if_cleaner_says_done_and_second_cleaner_does_not_ack(Config) -> gen_server:stop(Blocker). skip_blocking_if_no_cleaners(Config) -> - #{peer_ct2 := Node2} = proplists:get_value(nodes, Config), + #{ct2 := Node2} = proplists:get_value(nodes, Config), {ok, Blocker} = cets_dist_blocker:start_link(), pong = net_adm:ping(Node2), true = erlang:disconnect_node(Node2), diff --git a/test/cets_test_peer.erl b/test/cets_test_peer.erl index d821a67..632efbe 100644 --- a/test/cets_test_peer.erl +++ b/test/cets_test_peer.erl @@ -2,7 +2,6 @@ -export([ start/2, stop/1, - start_node/1, node_to_peer/1 ]). @@ -18,7 +17,7 @@ -include_lib("common_test/include/ct.hrl"). start(Names, Config) -> - {Nodes, Peers} = lists:unzip([cets_test_peer:start_node(name(N)) || N <- Names]), + {Nodes, Peers} = lists:unzip([start_node(N) || N <- Names]), [ {nodes, maps:from_list(lists:zip(Names, Nodes))}, {peers, maps:from_list(lists:zip(Names, Peers))} @@ -33,9 +32,9 @@ stop(Config) -> name(Node) -> list_to_atom(peer:random_name(atom_to_list(Node))). 
-start_node(Sname) -> +start_node(Id) -> {ok, Peer, Node} = ?CT_PEER(#{ - name => Sname, connection => standard_io, args => extra_args(Sname) + name => name(Id), connection => standard_io, args => extra_args(Id) }), %% Register so we can find Peer process later in code register(node_to_peer_name(Node), Peer), From a87e6d537259c321501f60485ba682dacfd7de9f Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 16:44:20 +0100 Subject: [PATCH 16/30] Make cets_join_SUITE --- test/cets_SUITE.erl | 217 ------------------------- test/cets_join_SUITE.erl | 337 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 337 insertions(+), 217 deletions(-) create mode 100644 test/cets_join_SUITE.erl diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index 518af65..e4a8683 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -97,14 +97,7 @@ cases() -> insert_many_with_two_records, delete_works, delete_many_works, - join_works, inserted_records_could_be_read_back_from_replicated_table, - join_works_with_existing_data, - join_works_with_existing_data_with_conflicts, - join_works_with_existing_data_with_conflicts_and_defined_conflict_handler, - join_works_with_existing_data_with_conflicts_and_defined_conflict_handler_and_more_keys, - join_works_with_existing_data_with_conflicts_and_defined_conflict_handler_and_keypos2, - bag_with_conflict_handler_not_allowed, bag_with_conflict_handler_not_allowed_for_start_link, insert_new_works, insert_new_works_with_table_name, @@ -125,12 +118,6 @@ cases() -> insert_serial_works_when_leader_is_back, insert_serial_blocks_when_leader_is_not_back, leader_is_the_same_in_metadata_after_join, - join_with_the_same_pid, - join_ref_is_same_after_join, - join_fails_because_server_process_not_found, - join_fails_because_server_process_not_found_before_get_pids, - join_fails_before_send_dump, - join_fails_before_send_dump_and_there_are_pending_remote_ops, send_dump_fails_during_join_because_receiver_exits, 
join_fails_in_check_fully_connected, join_fails_because_join_refs_do_not_match_for_nodes_in_segment, @@ -308,9 +295,6 @@ delete_many_works(Config) -> cets:delete_many(Tab, [alice]), [] = ets:lookup(Tab, alice). -join_works(Config) -> - given_two_joined_tables(Config). - inserted_records_could_be_read_back_from_replicated_table(Config) -> #{tab1 := Tab1, tab2 := Tab2} = given_two_joined_tables(Config), cets:insert(Tab1, {alice, 32}), @@ -662,207 +646,6 @@ leader_is_the_same_in_metadata_after_join(Config) -> Leader = cets_metadata:get(T1, leader), Leader = cets_metadata:get(T2, leader). -join_works_with_existing_data(Config) -> - Tab1 = make_name(Config, 1), - Tab2 = make_name(Config, 2), - {ok, Pid1} = start_local(Tab1), - {ok, Pid2} = start_local(Tab2), - cets:insert(Tab1, {alice, 32}), - %% Join will copy and merge existing tables - ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid2), - [{alice, 32}] = ets:lookup(Tab2, alice). - -%% This testcase tests an edgecase: inserting with the same key from two nodes. -%% Usually, inserting with the same key from two different nodes is not possible -%% (because the node-name is a part of the key). -join_works_with_existing_data_with_conflicts(Config) -> - Tab1 = make_name(Config, 1), - Tab2 = make_name(Config, 2), - {ok, Pid1} = start_local(Tab1), - {ok, Pid2} = start_local(Tab2), - cets:insert(Tab1, {alice, 32}), - cets:insert(Tab2, {alice, 33}), - %% Join will copy and merge existing tables - ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid2), - %% We insert data from other table into our table when merging, so the values get swapped - [{alice, 33}] = ets:lookup(Tab1, alice), - [{alice, 32}] = ets:lookup(Tab2, alice). 
- -join_works_with_existing_data_with_conflicts_and_defined_conflict_handler(Config) -> - Opts = #{handle_conflict => fun resolve_highest/2}, - Tab1 = make_name(Config, 1), - Tab2 = make_name(Config, 2), - {ok, Pid1} = start_local(Tab1, Opts), - {ok, Pid2} = start_local(Tab2, Opts), - cets:insert(Tab1, {alice, 32}), - cets:insert(Tab2, {alice, 33}), - %% Join will copy and merge existing tables - ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid2), - %% Key with the highest Number remains - [{alice, 33}] = ets:lookup(Tab1, alice), - [{alice, 33}] = ets:lookup(Tab2, alice). - -join_works_with_existing_data_with_conflicts_and_defined_conflict_handler_and_more_keys(Config) -> - %% Deeper testing of cets_join:apply_resolver function - Opts = #{handle_conflict => fun resolve_highest/2}, - #{tabs := [T1, T2, T3], pids := [Pid1, Pid2, Pid3]} = given_3_servers(Config, Opts), - cets:insert_many(T1, [{alice, 32}, {bob, 10}, {michal, 40}]), - cets:insert_many(T2, [{alice, 33}, {kate, 3}, {michal, 2}]), - %% Join will copy and merge existing tables - ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid2), - ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid3), - %% Key with the highest Number remains - Dump = [{alice, 33}, {bob, 10}, {kate, 3}, {michal, 40}], - Dump = cets:dump(T1), - Dump = cets:dump(T2), - Dump = cets:dump(T3). - --record(user, {name, age, updated}). 
- -%% Test with records (which require keypos = 2 option) -join_works_with_existing_data_with_conflicts_and_defined_conflict_handler_and_keypos2(Config) -> - Opts = #{handle_conflict => fun resolve_user_conflict/2, keypos => 2}, - T1 = make_name(Config, 1), - T2 = make_name(Config, 2), - {ok, Pid1} = start_local(T1, Opts), - {ok, Pid2} = start_local(T2, Opts), - cets:insert(T1, #user{name = alice, age = 30, updated = erlang:system_time()}), - cets:insert(T2, #user{name = alice, age = 25, updated = erlang:system_time()}), - %% Join will copy and merge existing tables - ok = cets_join:join(keypos2_lock, #{}, Pid1, Pid2), - %% Last inserted record is in the table - [#user{age = 25}] = ets:lookup(T1, alice), - [#user{age = 25}] = ets:lookup(T2, alice). - -%% Keep record with highest timestamp -resolve_user_conflict(U1 = #user{updated = TS1}, _U2 = #user{updated = TS2}) when - TS1 > TS2 --> - U1; -resolve_user_conflict(_U1, U2) -> - U2. - -resolve_highest({K, A}, {K, B}) -> - {K, max(A, B)}. - -bag_with_conflict_handler_not_allowed(Config) -> - {error, [bag_with_conflict_handler]} = - cets:start(make_name(Config), #{handle_conflict => fun resolve_highest/2, type => bag}). - -bag_with_conflict_handler_not_allowed_for_start_link(Config) -> - {error, [bag_with_conflict_handler]} = - cets:start_link(make_name(Config), #{handle_conflict => fun resolve_highest/2, type => bag}). - -join_with_the_same_pid(Config) -> - Tab = make_name(Config), - {ok, Pid} = start_local(Tab), - %% Just insert something into a table to check later the size - cets:insert(Tab, {1, 1}), - link(Pid), - {error, join_with_the_same_pid} = cets_join:join(lock_name(Config), #{}, Pid, Pid), - Nodes = [node()], - %% The process is still running and no data loss (i.e. size is not zero) - #{nodes := Nodes, size := 1} = cets:info(Pid). 
- -join_ref_is_same_after_join(Config) -> - #{pid1 := Pid1, pid2 := Pid2} = given_two_joined_tables(Config), - #{join_ref := JoinRef} = cets:info(Pid1), - #{join_ref := JoinRef} = cets:info(Pid2). - -join_fails_because_server_process_not_found(Config) -> - {ok, Pid1} = start_local(make_name(Config, 1)), - {ok, Pid2} = start_local(make_name(Config, 2)), - F = fun - (join_start) -> - exit(Pid1, sim_error); - (_) -> - ok - end, - {error, {task_failed, {noproc, {gen_server, call, [Pid1, get_info, infinity]}}, _}} = - cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{checkpoint_handler => F}). - -join_fails_because_server_process_not_found_before_get_pids(Config) -> - {ok, Pid1} = start_local(make_name(Config, 1)), - {ok, Pid2} = start_local(make_name(Config, 2)), - F = fun - (before_get_pids) -> - exit(Pid1, sim_error); - (_) -> - ok - end, - {error, {task_failed, {noproc, {gen_server, call, [Pid1, other_servers, infinity]}}, _}} = - cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{checkpoint_handler => F}). - -join_fails_before_send_dump(Config) -> - Me = self(), - DownFn = fun(#{remote_pid := RemotePid, table := _Tab}) -> - Me ! {down_called, self(), RemotePid} - end, - {ok, Pid1} = start_local(make_name(Config, 1), #{handle_down => DownFn}), - {ok, Pid2} = start_local(make_name(Config, 2), #{}), - cets:insert(Pid1, {1}), - cets:insert(Pid2, {2}), - F = fun - ({before_send_dump, P}) when Pid1 =:= P -> - Me ! 
before_send_dump_called_for_pid1; - ({before_send_dump, P}) when Pid2 =:= P -> - error(sim_error); - (_) -> - ok - end, - ?assertMatch( - {error, {task_failed, sim_error, #{}}}, - cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{checkpoint_handler => F}) - ), - %% Ensure we sent dump to Pid1 - receive_message(before_send_dump_called_for_pid1), - %% Not joined, some data exchanged - cets:ping_all(Pid1), - cets:ping_all(Pid2), - [] = cets:other_pids(Pid1), - [] = cets:other_pids(Pid2), - %% Pid1 applied new version of dump - %% Though, it got disconnected after - {ok, [{1}, {2}]} = cets:remote_dump(Pid1), - %% Pid2 rejected changes - {ok, [{2}]} = cets:remote_dump(Pid2), - receive_message({down_called, Pid1, Pid2}). - -%% Checks that remote ops are dropped if join_ref does not match in the state and in remote_op message -join_fails_before_send_dump_and_there_are_pending_remote_ops(Config) -> - Me = self(), - {ok, Pid1} = start_local(make_name(Config, 1)), - {ok, Pid2} = start_local(make_name(Config, 2)), - F = fun - ({before_send_dump, P}) when Pid1 =:= P -> - Me ! before_send_dump_called_for_pid1; - ({before_send_dump, P}) when Pid2 =:= P -> - sys:suspend(Pid2), - error(sim_error); - (before_unpause) -> - %% Crash in before_unpause, otherwise cets_join will block in cets:unpause/2 - %% (because Pid2 is suspended). - %% Servers would be unpaused automatically though, because cets_join process exits - %% (i.e. 
cets:unpause/2 call is totally optional) - error(sim_error2); - (_) -> - ok - end, - ?assertMatch( - {error, {task_failed, sim_error2, #{}}}, - cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{checkpoint_handler => F}) - ), - %% Ensure we sent dump to Pid1 - receive_message(before_send_dump_called_for_pid1), - cets:insert_request(Pid1, {1}), - %% Check that the remote_op has reached Pid2 message box - cets_test_wait:wait_for_remote_ops_in_the_message_box(Pid2, 1), - sys:resume(Pid2), - %% Wait till remote_op is processed - cets:ping(Pid2), - %% Check that the insert was ignored - {ok, []} = cets:remote_dump(Pid2). - send_dump_fails_during_join_because_receiver_exits(Config) -> Me = self(), DownFn = fun(#{remote_pid := RemotePid, table := _Tab}) -> diff --git a/test/cets_join_SUITE.erl b/test/cets_join_SUITE.erl new file mode 100644 index 0000000..522b125 --- /dev/null +++ b/test/cets_join_SUITE.erl @@ -0,0 +1,337 @@ +-module(cets_join_SUITE). +-include_lib("common_test/include/ct.hrl"). +-include_lib("eunit/include/eunit.hrl"). +-include_lib("kernel/include/logger.hrl"). + +-compile([export_all, nowarn_export_all]). + +-import(cets_test_setup, [ + start/2, + start_local/1, + start_local/2, + start_disco/2, + start_simple_disco/0, + make_name/1, + make_name/2, + lock_name/1, + disco_name/1 +]). + +-import(cets_test_wait, [ + wait_for_down/1, + wait_for_ready/2, + wait_till_test_stage/2 +]). + +-import(cets_test_setup, [ + setup_two_nodes_and_discovery/1, + setup_two_nodes_and_discovery/2, + simulate_disco_restart/1, + make_signalling_process/0, + given_two_joined_tables/1, + given_two_joined_tables/2, + given_3_servers/2 +]). + +-import(cets_test_wait, [ + wait_for_disco_timestamp_to_appear/3, + wait_for_disco_timestamp_to_be_updated/4 +]). + +-import(cets_test_receive, [ + receive_message/1, + flush_message/1 +]). + +-import(cets_test_peer, [ + block_node/2, + reconnect_node/2, + disconnect_node/2, + disconnect_node_by_name/2 +]). 
+ +-import(cets_test_rpc, [ + rpc/4 +]). + +-import(cets_test_helper, [assert_unique/1]). + +-import(cets_test_rpc, [ + other_nodes/2 +]). + +all() -> + [ + {group, cets} + % {group, cets_seq}, + % {group, cets_seq_no_log} + ]. + +groups() -> + %% Cases should have unique names, because we name CETS servers based on case names + [ + {cets, [parallel, {repeat_until_any_fail, 3}], assert_unique(cases())}, + %% These tests actually simulate a netsplit on the distribution level. + %% Though, global's prevent_overlapping_partitions option starts kicking + %% all nodes from the cluster, so we have to be careful not to break other cases. + %% Setting prevent_overlapping_partitions=false on ct5 helps. + {cets_seq, [sequence, {repeat_until_any_fail, 2}], assert_unique(seq_cases())}, + {cets_seq_no_log, [sequence, {repeat_until_any_fail, 2}], + assert_unique(cets_seq_no_log_cases())} + ]. + +cases() -> + [ + join_works, + join_works_with_existing_data_with_conflicts_and_defined_conflict_handler, + join_works_with_existing_data_with_conflicts_and_defined_conflict_handler_and_more_keys, + join_works_with_existing_data_with_conflicts_and_defined_conflict_handler_and_keypos2, + bag_with_conflict_handler_not_allowed, + join_with_the_same_pid, + join_ref_is_same_after_join, + join_fails_because_server_process_not_found, + join_fails_because_server_process_not_found_before_get_pids, + join_fails_before_send_dump, + join_fails_before_send_dump_and_there_are_pending_remote_ops + ]. + +seq_cases() -> + []. + +cets_seq_no_log_cases() -> + []. + +init_per_suite(Config) -> + cets_test_setup:init_cleanup_table(), + cets_test_peer:start([ct2, ct5], Config). + +end_per_suite(Config) -> + cets_test_setup:remove_cleanup_table(), + cets_test_peer:stop(Config), + Config. + +init_per_group(Group, Config) when Group == cets_seq_no_log; Group == cets_no_log -> + [ok = logger:set_module_level(M, none) || M <- log_modules()], + Config; +init_per_group(_Group, Config) -> + Config. 
+ +end_per_group(Group, Config) when Group == cets_seq_no_log; Group == cets_no_log -> + [ok = logger:unset_module_level(M) || M <- log_modules()], + Config; +end_per_group(_Group, Config) -> + Config. + +init_per_testcase(Name, Config) -> + init_per_testcase_generic(Name, Config). + +init_per_testcase_generic(Name, Config) -> + [{testcase, Name} | Config]. + +end_per_testcase(_, _Config) -> + cets_test_setup:wait_for_cleanup(), + ok. + +%% Modules that use a multiline LOG_ macro +log_modules() -> + [cets, cets_call, cets_long, cets_join, cets_discovery]. + +join_works(Config) -> + given_two_joined_tables(Config). + +join_works_with_existing_data(Config) -> + Tab1 = make_name(Config, 1), + Tab2 = make_name(Config, 2), + {ok, Pid1} = start_local(Tab1), + {ok, Pid2} = start_local(Tab2), + cets:insert(Tab1, {alice, 32}), + %% Join will copy and merge existing tables + ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid2), + [{alice, 32}] = ets:lookup(Tab2, alice). + +%% This testcase tests an edgecase: inserting with the same key from two nodes. +%% Usually, inserting with the same key from two different nodes is not possible +%% (because the node-name is a part of the key). +join_works_with_existing_data_with_conflicts(Config) -> + Tab1 = make_name(Config, 1), + Tab2 = make_name(Config, 2), + {ok, Pid1} = start_local(Tab1), + {ok, Pid2} = start_local(Tab2), + cets:insert(Tab1, {alice, 32}), + cets:insert(Tab2, {alice, 33}), + %% Join will copy and merge existing tables + ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid2), + %% We insert data from other table into our table when merging, so the values get swapped + [{alice, 33}] = ets:lookup(Tab1, alice), + [{alice, 32}] = ets:lookup(Tab2, alice). 
+ +join_works_with_existing_data_with_conflicts_and_defined_conflict_handler(Config) -> + Opts = #{handle_conflict => fun resolve_highest/2}, + Tab1 = make_name(Config, 1), + Tab2 = make_name(Config, 2), + {ok, Pid1} = start_local(Tab1, Opts), + {ok, Pid2} = start_local(Tab2, Opts), + cets:insert(Tab1, {alice, 32}), + cets:insert(Tab2, {alice, 33}), + %% Join will copy and merge existing tables + ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid2), + %% Key with the highest Number remains + [{alice, 33}] = ets:lookup(Tab1, alice), + [{alice, 33}] = ets:lookup(Tab2, alice). + +join_works_with_existing_data_with_conflicts_and_defined_conflict_handler_and_more_keys(Config) -> + %% Deeper testing of cets_join:apply_resolver function + Opts = #{handle_conflict => fun resolve_highest/2}, + #{tabs := [T1, T2, T3], pids := [Pid1, Pid2, Pid3]} = given_3_servers(Config, Opts), + cets:insert_many(T1, [{alice, 32}, {bob, 10}, {michal, 40}]), + cets:insert_many(T2, [{alice, 33}, {kate, 3}, {michal, 2}]), + %% Join will copy and merge existing tables + ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid2), + ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid3), + %% Key with the highest Number remains + Dump = [{alice, 33}, {bob, 10}, {kate, 3}, {michal, 40}], + Dump = cets:dump(T1), + Dump = cets:dump(T2), + Dump = cets:dump(T3). + +-record(user, {name, age, updated}). 
+ +%% Test with records (which require keypos = 2 option) +join_works_with_existing_data_with_conflicts_and_defined_conflict_handler_and_keypos2(Config) -> + Opts = #{handle_conflict => fun resolve_user_conflict/2, keypos => 2}, + T1 = make_name(Config, 1), + T2 = make_name(Config, 2), + {ok, Pid1} = start_local(T1, Opts), + {ok, Pid2} = start_local(T2, Opts), + cets:insert(T1, #user{name = alice, age = 30, updated = erlang:system_time()}), + cets:insert(T2, #user{name = alice, age = 25, updated = erlang:system_time()}), + %% Join will copy and merge existing tables + ok = cets_join:join(keypos2_lock, #{}, Pid1, Pid2), + %% Last inserted record is in the table + [#user{age = 25}] = ets:lookup(T1, alice), + [#user{age = 25}] = ets:lookup(T2, alice). + +%% Keep record with highest timestamp +resolve_user_conflict(U1 = #user{updated = TS1}, _U2 = #user{updated = TS2}) when + TS1 > TS2 +-> + U1; +resolve_user_conflict(_U1, U2) -> + U2. + +resolve_highest({K, A}, {K, B}) -> + {K, max(A, B)}. + +bag_with_conflict_handler_not_allowed(Config) -> + {error, [bag_with_conflict_handler]} = + cets:start(make_name(Config), #{handle_conflict => fun resolve_highest/2, type => bag}). + +join_with_the_same_pid(Config) -> + Tab = make_name(Config), + {ok, Pid} = start_local(Tab), + %% Just insert something into a table to check later the size + cets:insert(Tab, {1, 1}), + link(Pid), + {error, join_with_the_same_pid} = cets_join:join(lock_name(Config), #{}, Pid, Pid), + Nodes = [node()], + %% The process is still running and no data loss (i.e. size is not zero) + #{nodes := Nodes, size := 1} = cets:info(Pid). + +join_ref_is_same_after_join(Config) -> + #{pid1 := Pid1, pid2 := Pid2} = given_two_joined_tables(Config), + #{join_ref := JoinRef} = cets:info(Pid1), + #{join_ref := JoinRef} = cets:info(Pid2). 
+ +join_fails_because_server_process_not_found(Config) -> + {ok, Pid1} = start_local(make_name(Config, 1)), + {ok, Pid2} = start_local(make_name(Config, 2)), + F = fun + (join_start) -> + exit(Pid1, sim_error); + (_) -> + ok + end, + {error, {task_failed, {noproc, {gen_server, call, [Pid1, get_info, infinity]}}, _}} = + cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{checkpoint_handler => F}). + +join_fails_because_server_process_not_found_before_get_pids(Config) -> + {ok, Pid1} = start_local(make_name(Config, 1)), + {ok, Pid2} = start_local(make_name(Config, 2)), + F = fun + (before_get_pids) -> + exit(Pid1, sim_error); + (_) -> + ok + end, + {error, {task_failed, {noproc, {gen_server, call, [Pid1, other_servers, infinity]}}, _}} = + cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{checkpoint_handler => F}). + +join_fails_before_send_dump(Config) -> + Me = self(), + DownFn = fun(#{remote_pid := RemotePid, table := _Tab}) -> + Me ! {down_called, self(), RemotePid} + end, + {ok, Pid1} = start_local(make_name(Config, 1), #{handle_down => DownFn}), + {ok, Pid2} = start_local(make_name(Config, 2), #{}), + cets:insert(Pid1, {1}), + cets:insert(Pid2, {2}), + F = fun + ({before_send_dump, P}) when Pid1 =:= P -> + Me ! before_send_dump_called_for_pid1; + ({before_send_dump, P}) when Pid2 =:= P -> + error(sim_error); + (_) -> + ok + end, + ?assertMatch( + {error, {task_failed, sim_error, #{}}}, + cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{checkpoint_handler => F}) + ), + %% Ensure we sent dump to Pid1 + receive_message(before_send_dump_called_for_pid1), + %% Not joined, some data exchanged + cets:ping_all(Pid1), + cets:ping_all(Pid2), + [] = cets:other_pids(Pid1), + [] = cets:other_pids(Pid2), + %% Pid1 applied new version of dump + %% Though, it got disconnected after + {ok, [{1}, {2}]} = cets:remote_dump(Pid1), + %% Pid2 rejected changes + {ok, [{2}]} = cets:remote_dump(Pid2), + receive_message({down_called, Pid1, Pid2}). 
+ +%% Checks that remote ops are dropped if join_ref does not match in the state and in remote_op message +join_fails_before_send_dump_and_there_are_pending_remote_ops(Config) -> + Me = self(), + {ok, Pid1} = start_local(make_name(Config, 1)), + {ok, Pid2} = start_local(make_name(Config, 2)), + F = fun + ({before_send_dump, P}) when Pid1 =:= P -> + Me ! before_send_dump_called_for_pid1; + ({before_send_dump, P}) when Pid2 =:= P -> + sys:suspend(Pid2), + error(sim_error); + (before_unpause) -> + %% Crash in before_unpause, otherwise cets_join will block in cets:unpause/2 + %% (because Pid2 is suspended). + %% Servers would be unpaused automatically though, because cets_join process exits + %% (i.e. cets:unpause/2 call is totally optional) + error(sim_error2); + (_) -> + ok + end, + ?assertMatch( + {error, {task_failed, sim_error2, #{}}}, + cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{checkpoint_handler => F}) + ), + %% Ensure we sent dump to Pid1 + receive_message(before_send_dump_called_for_pid1), + cets:insert_request(Pid1, {1}), + %% Check that the remote_op has reached Pid2 message box + cets_test_wait:wait_for_remote_ops_in_the_message_box(Pid2, 1), + sys:resume(Pid2), + %% Wait till remote_op is processed + cets:ping(Pid2), + %% Check that the insert was ignored + {ok, []} = cets:remote_dump(Pid2). 
From 45b1f7fe6db6215f805f2c2196bae5ed27ee34f6 Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 16:55:17 +0100 Subject: [PATCH 17/30] Move more tests into cets_join_SUITE --- test/cets_SUITE.erl | 210 +--------------------------------- test/cets_join_SUITE.erl | 230 +++++++++++++++++++++++++++++++++++++- test/cets_test_helper.erl | 6 +- 3 files changed, 232 insertions(+), 214 deletions(-) diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index e4a8683..463b773 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -61,7 +61,8 @@ -import(cets_test_helper, [ assert_unique/1, - set_other_servers/2 + set_other_servers/2, + set_join_ref/2 ]). all() -> @@ -118,13 +119,6 @@ cases() -> insert_serial_works_when_leader_is_back, insert_serial_blocks_when_leader_is_not_back, leader_is_the_same_in_metadata_after_join, - send_dump_fails_during_join_because_receiver_exits, - join_fails_in_check_fully_connected, - join_fails_because_join_refs_do_not_match_for_nodes_in_segment, - join_fails_because_pids_do_not_match_for_nodes_in_segment, - join_fails_because_servers_overlap, - remote_ops_are_ignored_if_join_ref_does_not_match, - join_retried_if_lock_is_busy, send_dump_contains_already_added_servers, servers_remove_each_other_if_join_refs_do_not_match_after_unpause, test_multinode, @@ -185,10 +179,8 @@ only_for_logger_cases() -> [ run_tracked_logged_check_logger, long_call_fails_because_linked_process_dies, - logs_are_printed_when_join_fails_because_servers_overlap, pause_owner_crashed_is_logged, pause_owner_crashed_is_not_logged_if_reason_is_normal, - join_done_already_while_waiting_for_lock_so_do_nothing, atom_error_is_logged_in_tracked, shutdown_reason_is_not_logged_in_tracked, other_reason_is_logged_in_tracked, @@ -646,188 +638,6 @@ leader_is_the_same_in_metadata_after_join(Config) -> Leader = cets_metadata:get(T1, leader), Leader = cets_metadata:get(T2, leader). 
-send_dump_fails_during_join_because_receiver_exits(Config) -> - Me = self(), - DownFn = fun(#{remote_pid := RemotePid, table := _Tab}) -> - Me ! {down_called, self(), RemotePid} - end, - {ok, Pid1} = start_local(make_name(Config, 1), #{handle_down => DownFn}), - {ok, Pid2} = start_local(make_name(Config, 2), #{}), - F = fun - ({before_send_dump, P}) when P =:= Pid1 -> - %% Kill Pid2 process. - %% It does not crash the join process. - %% Pid1 would receive a dump with Pid2 in the server list. - exit(Pid2, sim_error), - %% Ensure Pid1 got DOWN message from Pid2 already - pong = cets:ping(Pid1), - Me ! before_send_dump_called; - (_) -> - ok - end, - ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{checkpoint_handler => F}), - receive_message(before_send_dump_called), - pong = cets:ping(Pid1), - receive_message({down_called, Pid1, Pid2}), - [] = cets:other_pids(Pid1), - %% Pid1 still works - cets:insert(Pid1, {1}), - {ok, [{1}]} = cets:remote_dump(Pid1). - -join_fails_in_check_fully_connected(Config) -> - Me = self(), - #{pids := [Pid1, Pid2, Pid3]} = given_3_servers(Config), - %% Pid2 and Pid3 are connected - ok = cets_join:join(lock_name(Config), #{}, Pid2, Pid3, #{}), - [Pid3] = cets:other_pids(Pid2), - F = fun - (before_check_fully_connected) -> - %% Ask Pid2 to remove Pid3 from the list - Pid2 ! {'DOWN', make_ref(), process, Pid3, sim_error}, - %% Ensure Pid2 did the cleaning - pong = cets:ping(Pid2), - [] = cets:other_pids(Pid2), - Me ! before_check_fully_connected_called; - (_) -> - ok - end, - ?assertMatch( - {error, {task_failed, check_fully_connected_failed, #{}}}, - cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{checkpoint_handler => F}) - ), - receive_message(before_check_fully_connected_called). 
- -join_fails_because_join_refs_do_not_match_for_nodes_in_segment(Config) -> - #{pids := [Pid1, Pid2, Pid3]} = given_3_servers(Config), - %% Pid2 and Pid3 are connected - %% But for some reason Pid3 has a different join_ref - %% (probably could happen if it still haven't checked other nodes after a join) - ok = cets_join:join(lock_name(Config), #{}, Pid2, Pid3, #{}), - set_join_ref(Pid3, make_ref()), - ?assertMatch( - {error, {task_failed, check_same_join_ref_failed, #{}}}, - cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{}) - ). - -join_fails_because_pids_do_not_match_for_nodes_in_segment(Config) -> - #{pids := [Pid1, Pid2, Pid3]} = given_3_servers(Config), - %% Pid2 and Pid3 are connected - %% But for some reason Pid3 has a different other_nodes list - %% (probably could happen if it still haven't checked other nodes after a join) - ok = cets_join:join(lock_name(Config), #{}, Pid2, Pid3, #{}), - set_other_servers(Pid3, []), - ?assertMatch( - {error, {task_failed, check_fully_connected_failed, #{}}}, - cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{}) - ). - -join_fails_because_servers_overlap(Config) -> - #{pids := [Pid1, Pid2, Pid3]} = given_3_servers(Config), - set_other_servers(Pid1, [Pid3]), - set_other_servers(Pid2, [Pid3]), - ?assertMatch( - {error, {task_failed, check_do_not_overlap_failed, #{}}}, - cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{}) - ). - -%% join_fails_because_servers_overlap testcase, but we check the logging. -%% We check that `?LOG_ERROR(#{what => check_do_not_overlap_failed})' is called. 
-logs_are_printed_when_join_fails_because_servers_overlap(Config) -> - LogRef = make_ref(), - logger_debug_h:start(#{id => ?FUNCTION_NAME}), - #{pids := [Pid1, Pid2, Pid3]} = given_3_servers(Config), - set_other_servers(Pid1, [Pid3]), - set_other_servers(Pid2, [Pid3]), - ?assertMatch( - {error, {task_failed, check_do_not_overlap_failed, #{}}}, - cets_join:join(lock_name(Config), #{log_ref => LogRef}, Pid1, Pid2, #{}) - ), - receive - {log, ?FUNCTION_NAME, #{ - level := error, - msg := - {report, #{ - what := check_do_not_overlap_failed, log_ref := LogRef - }} - }} -> - ok - after 5000 -> - ct:fail(timeout) - end. - -remote_ops_are_ignored_if_join_ref_does_not_match(Config) -> - {ok, Pid1} = start_local(make_name(Config, 1)), - {ok, Pid2} = start_local(make_name(Config, 2)), - ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{}), - #{join_ref := JoinRef} = cets:info(Pid1), - set_join_ref(Pid1, make_ref()), - cets:insert(Pid2, {1}), - %% fix and check again - set_join_ref(Pid1, JoinRef), - cets:insert(Pid2, {2}), - {ok, [{2}]} = cets:remote_dump(Pid1). - -join_retried_if_lock_is_busy(Config) -> - Me = self(), - {ok, Pid1} = start_local(make_name(Config, 1)), - {ok, Pid2} = start_local(make_name(Config, 2)), - Lock = lock_name(Config), - SleepyF = fun - (join_start) -> - Me ! join_start, - timer:sleep(infinity); - (_) -> - ok - end, - F = fun - (before_retry) -> Me ! before_retry; - (_) -> ok - end, - %% Get the lock in a separate process - proc_lib:spawn_link(fun() -> - cets_join:join(Lock, #{}, Pid1, Pid2, #{checkpoint_handler => SleepyF}) - end), - receive_message(join_start), - %% We actually would not return from cets_join:join unless we get the lock - proc_lib:spawn_link(fun() -> - ok = cets_join:join(Lock, #{}, Pid1, Pid2, #{checkpoint_handler => F}) - end), - receive_message(before_retry). 
- -join_done_already_while_waiting_for_lock_so_do_nothing(Config) -> - logger_debug_h:start(#{id => ?FUNCTION_NAME}), - Me = self(), - #{pids := [Pid1, Pid2, Pid3, Pid4]} = given_n_servers(Config, 4, #{}), - Lock = lock_name(Config), - ok = cets_join:join(Lock, #{}, Pid1, Pid2, #{}), - ok = cets_join:join(Lock, #{}, Pid3, Pid4, #{}), - %% It is to just match logs - LogRef = make_ref(), - Info = #{log_ref => LogRef}, - F1 = send_join_start_back_and_wait_for_continue_joining(), - F2 = fun(_) -> ok end, - %% Get the lock in a separate process - proc_lib:spawn_link(fun() -> - ok = cets_join:join(Lock, Info, Pid1, Pid3, #{checkpoint_handler => F1}), - Me ! first_join_returns - end), - JoinPid = receive_message_with_arg(join_start), - proc_lib:spawn_link(fun() -> - ok = cets_join:join(Lock, Info, Pid1, Pid3, #{checkpoint_handler => F2}), - Me ! second_join_returns - end), - JoinPid ! continue_joining, - %% At this point our first join would finish, after that our second join should exit too. - receive_message(first_join_returns), - receive_message(second_join_returns), - %% Ensure all logs are received by removing the handler, it is a sync operation. - %% (we do not expect any logs anyway). - logger:remove_handler(?FUNCTION_NAME), - %% Ensure there is nothing logged, we use log_ref to ignore logs from other tests. - %% The counter example for no logging is - %% the logs_are_printed_when_join_fails_because_servers_overlap testcase. - cets_test_log:assert_nothing_is_logged(?FUNCTION_NAME, LogRef). - pause_owner_crashed_is_logged(Config) -> ct:timetrap({seconds, 6}), logger_debug_h:start(#{id => ?FUNCTION_NAME}), @@ -1855,9 +1665,6 @@ start_link_local(Name, Opts) -> schedule_cleanup(Pid), {ok, Pid}. -set_join_ref(Pid, JoinRef) -> - sys:replace_state(Pid, fun(#{join_ref := _} = State) -> State#{join_ref := JoinRef} end). 
- stopped_pid() -> %% Get a pid for a stopped process {Pid, Mon} = spawn_monitor(fun() -> ok end), @@ -1874,19 +1681,6 @@ bad_node_pid_binary() -> <<131, 88, 100, 0, 17, 98, 97, 100, 110, 111, 100, 101, 64, 108, 111, 99, 97, 108, 104, 111, 115, 116, 0, 0, 0, 90, 0, 0, 0, 0, 100, 206, 70, 92>>. -send_join_start_back_and_wait_for_continue_joining() -> - Me = self(), - fun - (join_start) -> - Me ! {join_start, self()}, - receive - continue_joining -> - ok - end; - (_) -> - ok - end. - not_leader(Leader, Other, Leader) -> Other; not_leader(Other, Leader, Leader) -> diff --git a/test/cets_join_SUITE.erl b/test/cets_join_SUITE.erl index 522b125..3408462 100644 --- a/test/cets_join_SUITE.erl +++ b/test/cets_join_SUITE.erl @@ -30,7 +30,9 @@ make_signalling_process/0, given_two_joined_tables/1, given_two_joined_tables/2, - given_3_servers/2 + given_3_servers/1, + given_3_servers/2, + given_n_servers/3 ]). -import(cets_test_wait, [ @@ -40,6 +42,7 @@ -import(cets_test_receive, [ receive_message/1, + receive_message_with_arg/1, flush_message/1 ]). @@ -54,7 +57,11 @@ rpc/4 ]). --import(cets_test_helper, [assert_unique/1]). +-import(cets_test_helper, [ + set_join_ref/2, + set_other_servers/2, + assert_unique/1 +]). -import(cets_test_rpc, [ other_nodes/2 @@ -62,15 +69,24 @@ all() -> [ - {group, cets} + {group, cets}, + {group, cets_no_log} % {group, cets_seq}, % {group, cets_seq_no_log} ]. +only_for_logger_cases() -> + [ + join_done_already_while_waiting_for_lock_so_do_nothing, + logs_are_printed_when_join_fails_because_servers_overlap + ]. + groups() -> %% Cases should have unique names, because we name CETS servers based on case names [ - {cets, [parallel, {repeat_until_any_fail, 3}], assert_unique(cases())}, + {cets, [parallel, {repeat_until_any_fail, 3}], + assert_unique(cases() ++ only_for_logger_cases())}, + {cets_no_log, [parallel], assert_unique(cases())}, %% These tests actually simulate a netsplit on the distribution level. 
%% Though, global's prevent_overlapping_partitions option starts kicking %% all nodes from the cluster, so we have to be careful not to break other cases. @@ -92,7 +108,14 @@ cases() -> join_fails_because_server_process_not_found, join_fails_because_server_process_not_found_before_get_pids, join_fails_before_send_dump, - join_fails_before_send_dump_and_there_are_pending_remote_ops + join_fails_before_send_dump_and_there_are_pending_remote_ops, + send_dump_fails_during_join_because_receiver_exits, + join_fails_in_check_fully_connected, + join_fails_because_join_refs_do_not_match_for_nodes_in_segment, + join_fails_because_pids_do_not_match_for_nodes_in_segment, + join_fails_because_servers_overlap, + remote_ops_are_ignored_if_join_ref_does_not_match, + join_retried_if_lock_is_busy ]. seq_cases() -> @@ -335,3 +358,200 @@ join_fails_before_send_dump_and_there_are_pending_remote_ops(Config) -> cets:ping(Pid2), %% Check that the insert was ignored {ok, []} = cets:remote_dump(Pid2). + +send_dump_fails_during_join_because_receiver_exits(Config) -> + Me = self(), + DownFn = fun(#{remote_pid := RemotePid, table := _Tab}) -> + Me ! {down_called, self(), RemotePid} + end, + {ok, Pid1} = start_local(make_name(Config, 1), #{handle_down => DownFn}), + {ok, Pid2} = start_local(make_name(Config, 2), #{}), + F = fun + ({before_send_dump, P}) when P =:= Pid1 -> + %% Kill Pid2 process. + %% It does not crash the join process. + %% Pid1 would receive a dump with Pid2 in the server list. + exit(Pid2, sim_error), + %% Ensure Pid1 got DOWN message from Pid2 already + pong = cets:ping(Pid1), + Me ! before_send_dump_called; + (_) -> + ok + end, + ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{checkpoint_handler => F}), + receive_message(before_send_dump_called), + pong = cets:ping(Pid1), + receive_message({down_called, Pid1, Pid2}), + [] = cets:other_pids(Pid1), + %% Pid1 still works + cets:insert(Pid1, {1}), + {ok, [{1}]} = cets:remote_dump(Pid1). 
+
+join_fails_in_check_fully_connected(Config) ->
+    Me = self(),
+    #{pids := [Pid1, Pid2, Pid3]} = given_3_servers(Config),
+    %% Pid2 and Pid3 are connected
+    ok = cets_join:join(lock_name(Config), #{}, Pid2, Pid3, #{}),
+    [Pid3] = cets:other_pids(Pid2),
+    F = fun
+        (before_check_fully_connected) ->
+            %% Ask Pid2 to remove Pid3 from the list
+            Pid2 ! {'DOWN', make_ref(), process, Pid3, sim_error},
+            %% Ensure Pid2 did the cleaning
+            pong = cets:ping(Pid2),
+            [] = cets:other_pids(Pid2),
+            Me ! before_check_fully_connected_called;
+        (_) ->
+            ok
+    end,
+    ?assertMatch(
+        {error, {task_failed, check_fully_connected_failed, #{}}},
+        cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{checkpoint_handler => F})
+    ),
+    receive_message(before_check_fully_connected_called).
+
+join_fails_because_join_refs_do_not_match_for_nodes_in_segment(Config) ->
+    #{pids := [Pid1, Pid2, Pid3]} = given_3_servers(Config),
+    %% Pid2 and Pid3 are connected
+    %% But for some reason Pid3 has a different join_ref
+    %% (probably could happen if it still hasn't checked other nodes after a join)
+    ok = cets_join:join(lock_name(Config), #{}, Pid2, Pid3, #{}),
+    set_join_ref(Pid3, make_ref()),
+    ?assertMatch(
+        {error, {task_failed, check_same_join_ref_failed, #{}}},
+        cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{})
+    ).
+
+join_fails_because_pids_do_not_match_for_nodes_in_segment(Config) ->
+    #{pids := [Pid1, Pid2, Pid3]} = given_3_servers(Config),
+    %% Pid2 and Pid3 are connected
+    %% But for some reason Pid3 has a different other_nodes list
+    %% (probably could happen if it still hasn't checked other nodes after a join)
+    ok = cets_join:join(lock_name(Config), #{}, Pid2, Pid3, #{}),
+    set_other_servers(Pid3, []),
+    ?assertMatch(
+        {error, {task_failed, check_fully_connected_failed, #{}}},
+        cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{})
+    ).
+ +join_fails_because_servers_overlap(Config) -> + #{pids := [Pid1, Pid2, Pid3]} = given_3_servers(Config), + set_other_servers(Pid1, [Pid3]), + set_other_servers(Pid2, [Pid3]), + ?assertMatch( + {error, {task_failed, check_do_not_overlap_failed, #{}}}, + cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{}) + ). + +%% join_fails_because_servers_overlap testcase, but we check the logging. +%% We check that `?LOG_ERROR(#{what => check_do_not_overlap_failed})' is called. +logs_are_printed_when_join_fails_because_servers_overlap(Config) -> + LogRef = make_ref(), + logger_debug_h:start(#{id => ?FUNCTION_NAME}), + #{pids := [Pid1, Pid2, Pid3]} = given_3_servers(Config), + set_other_servers(Pid1, [Pid3]), + set_other_servers(Pid2, [Pid3]), + ?assertMatch( + {error, {task_failed, check_do_not_overlap_failed, #{}}}, + cets_join:join(lock_name(Config), #{log_ref => LogRef}, Pid1, Pid2, #{}) + ), + receive + {log, ?FUNCTION_NAME, #{ + level := error, + msg := + {report, #{ + what := check_do_not_overlap_failed, log_ref := LogRef + }} + }} -> + ok + after 5000 -> + ct:fail(timeout) + end. + +remote_ops_are_ignored_if_join_ref_does_not_match(Config) -> + {ok, Pid1} = start_local(make_name(Config, 1)), + {ok, Pid2} = start_local(make_name(Config, 2)), + ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{}), + #{join_ref := JoinRef} = cets:info(Pid1), + set_join_ref(Pid1, make_ref()), + cets:insert(Pid2, {1}), + %% fix and check again + set_join_ref(Pid1, JoinRef), + cets:insert(Pid2, {2}), + {ok, [{2}]} = cets:remote_dump(Pid1). + +join_retried_if_lock_is_busy(Config) -> + Me = self(), + {ok, Pid1} = start_local(make_name(Config, 1)), + {ok, Pid2} = start_local(make_name(Config, 2)), + Lock = lock_name(Config), + SleepyF = fun + (join_start) -> + Me ! join_start, + timer:sleep(infinity); + (_) -> + ok + end, + F = fun + (before_retry) -> Me ! 
before_retry; + (_) -> ok + end, + %% Get the lock in a separate process + proc_lib:spawn_link(fun() -> + cets_join:join(Lock, #{}, Pid1, Pid2, #{checkpoint_handler => SleepyF}) + end), + receive_message(join_start), + %% We actually would not return from cets_join:join unless we get the lock + proc_lib:spawn_link(fun() -> + ok = cets_join:join(Lock, #{}, Pid1, Pid2, #{checkpoint_handler => F}) + end), + receive_message(before_retry). + +join_done_already_while_waiting_for_lock_so_do_nothing(Config) -> + logger_debug_h:start(#{id => ?FUNCTION_NAME}), + Me = self(), + #{pids := [Pid1, Pid2, Pid3, Pid4]} = given_n_servers(Config, 4, #{}), + Lock = lock_name(Config), + ok = cets_join:join(Lock, #{}, Pid1, Pid2, #{}), + ok = cets_join:join(Lock, #{}, Pid3, Pid4, #{}), + %% It is to just match logs + LogRef = make_ref(), + Info = #{log_ref => LogRef}, + F1 = send_join_start_back_and_wait_for_continue_joining(), + F2 = fun(_) -> ok end, + %% Get the lock in a separate process + proc_lib:spawn_link(fun() -> + ok = cets_join:join(Lock, Info, Pid1, Pid3, #{checkpoint_handler => F1}), + Me ! first_join_returns + end), + JoinPid = receive_message_with_arg(join_start), + proc_lib:spawn_link(fun() -> + ok = cets_join:join(Lock, Info, Pid1, Pid3, #{checkpoint_handler => F2}), + Me ! second_join_returns + end), + JoinPid ! continue_joining, + %% At this point our first join would finish, after that our second join should exit too. + receive_message(first_join_returns), + receive_message(second_join_returns), + %% Ensure all logs are received by removing the handler, it is a sync operation. + %% (we do not expect any logs anyway). + logger:remove_handler(?FUNCTION_NAME), + %% Ensure there is nothing logged, we use log_ref to ignore logs from other tests. + %% The counter example for no logging is + %% the logs_are_printed_when_join_fails_because_servers_overlap testcase. + cets_test_log:assert_nothing_is_logged(?FUNCTION_NAME, LogRef). 
+
+%% Heleprs
+
+send_join_start_back_and_wait_for_continue_joining() ->
+    Me = self(),
+    fun
+        (join_start) ->
+            Me ! {join_start, self()},
+            receive
+                continue_joining ->
+                    ok
+            end;
+        (_) ->
+            ok
+    end.
diff --git a/test/cets_test_helper.erl b/test/cets_test_helper.erl
index fedc8d4..a12acd6 100644
--- a/test/cets_test_helper.erl
+++ b/test/cets_test_helper.erl
@@ -8,7 +8,8 @@
 -export([
     set_nodedown_timestamp/3,
-    set_other_servers/2
+    set_other_servers/2,
+    set_join_ref/2
 ]).
 
 get_disco_timestamp(Disco, MapName, NodeKey) ->
@@ -31,3 +32,6 @@ set_other_servers(Pid, Servers) ->
     sys:replace_state(Pid, fun(#{other_servers := _} = State) ->
         State#{other_servers := Servers}
     end).
+
+set_join_ref(Pid, JoinRef) ->
+    sys:replace_state(Pid, fun(#{join_ref := _} = State) -> State#{join_ref := JoinRef} end).
From fa821f0b3720cb0b48d0f150d47c398309a6f1e1 Mon Sep 17 00:00:00 2001
From: Mikhail Uvarov
Date: Tue, 12 Mar 2024 17:22:28 +0100
Subject: [PATCH 18/30] Move more cases into cets_join_SUITE

---
 test/cets_SUITE.erl      | 62 ----------------------------------
 test/cets_join_SUITE.erl | 73 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 68 insertions(+), 67 deletions(-)

diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl
index 463b773..5ab3237 100644
--- a/test/cets_SUITE.erl
+++ b/test/cets_SUITE.erl
@@ -193,8 +193,6 @@ seq_cases() ->
     [
         insert_returns_when_netsplit,
         inserts_after_netsplit_reconnects,
-        joining_not_fully_connected_node_is_not_allowed,
-        joining_not_fully_connected_node_is_not_allowed2,
         cets_ping_all_returns_when_ping_crashes,
         join_interrupted_when_ping_crashes,
         ping_pairs_returns_pongs,
@@ -1495,66 +1493,6 @@ inserts_after_netsplit_reconnects(Config) ->
     [{1, v2}] = dump(Node1, Tab),
     [{1, v3}] = dump(Peer5, Tab).
-%% Joins from a bad (not fully connected) node -%% Join process should check if nodes could contact each other before allowing to join -joining_not_fully_connected_node_is_not_allowed(Config) -> - #{ct3 := Peer3, ct5 := Peer5} = proplists:get_value(peers, Config), - #{ct5 := Node5} = proplists:get_value(nodes, Config), - Node1 = node(), - Tab = make_name(Config), - {ok, Pid1} = start(Node1, Tab), - {ok, Pid3} = start(Peer3, Tab), - {ok, Pid5} = start(Peer5, Tab), - ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid3), - %% No connection between Peer5 and Node1 - block_node(Node5, Peer5), - try - %% Pid5 and Pid3 could contact each other. - %% Pid3 could contact Pid1 (they are joined). - %% But Pid5 cannot contact Pid1. - {error, {task_failed, check_could_reach_each_other_failed, _}} = - rpc(Peer5, cets_join, join, [lock_name(Config), #{}, Pid5, Pid3]), - %% Still connected - cets:insert(Pid1, {r1}), - {ok, [{r1}]} = cets:remote_dump(Pid3), - [Pid3] = cets:other_pids(Pid1), - [Pid1] = cets:other_pids(Pid3) - after - reconnect_node(Node5, Peer5) - end, - [] = cets:other_pids(Pid5). - -%% Joins from a good (fully connected) node -joining_not_fully_connected_node_is_not_allowed2(Config) -> - #{ct3 := Peer3, ct5 := Peer5} = proplists:get_value(peers, Config), - #{ct5 := Node5} = proplists:get_value(nodes, Config), - Node1 = node(), - Tab = make_name(Config), - {ok, Pid1} = start(Node1, Tab), - {ok, Pid3} = start(Peer3, Tab), - {ok, Pid5} = start(Peer5, Tab), - ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid3), - %% No connection between Peer5 and Node1 - block_node(Node5, Peer5), - try - %% Pid5 and Pid3 could contact each other. - %% Pid3 could contact Pid1 (they are joined). - %% But Pid5 cannot contact Pid1. 
- {error, {task_failed, check_could_reach_each_other_failed, _}} = rpc( - Peer3, cets_join, join, [ - lock_name(Config), #{}, Pid5, Pid3 - ] - ), - %% Still connected - cets:insert(Pid1, {r1}), - {ok, [{r1}]} = cets:remote_dump(Pid3), - [Pid3] = cets:other_pids(Pid1), - [Pid1] = cets:other_pids(Pid3) - after - reconnect_node(Node5, Peer5) - end, - [] = cets:other_pids(Pid5). - cets_ping_all_returns_when_ping_crashes(Config) -> #{pid1 := Pid1, pid2 := Pid2} = given_two_joined_tables(Config), meck:new(cets, [passthrough]), diff --git a/test/cets_join_SUITE.erl b/test/cets_join_SUITE.erl index 3408462..a81d24e 100644 --- a/test/cets_join_SUITE.erl +++ b/test/cets_join_SUITE.erl @@ -70,8 +70,8 @@ all() -> [ {group, cets}, - {group, cets_no_log} - % {group, cets_seq}, + {group, cets_no_log}, + {group, cets_seq} % {group, cets_seq_no_log} ]. @@ -119,14 +119,17 @@ cases() -> ]. seq_cases() -> - []. + [ + joining_not_fully_connected_node_is_not_allowed, + joining_not_fully_connected_node_is_not_allowed2 + ]. cets_seq_no_log_cases() -> []. init_per_suite(Config) -> cets_test_setup:init_cleanup_table(), - cets_test_peer:start([ct2, ct5], Config). + cets_test_peer:start([ct2, ct3, ct5], Config). end_per_suite(Config) -> cets_test_setup:remove_cleanup_table(), @@ -541,7 +544,67 @@ join_done_already_while_waiting_for_lock_so_do_nothing(Config) -> %% the logs_are_printed_when_join_fails_because_servers_overlap testcase. cets_test_log:assert_nothing_is_logged(?FUNCTION_NAME, LogRef). 
-%% Heleprs +%% Joins from a bad (not fully connected) node +%% Join process should check if nodes could contact each other before allowing to join +joining_not_fully_connected_node_is_not_allowed(Config) -> + #{ct3 := Peer3, ct5 := Peer5} = proplists:get_value(peers, Config), + #{ct5 := Node5} = proplists:get_value(nodes, Config), + Node1 = node(), + Tab = make_name(Config), + {ok, Pid1} = start(Node1, Tab), + {ok, Pid3} = start(Peer3, Tab), + {ok, Pid5} = start(Peer5, Tab), + ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid3), + %% No connection between Peer5 and Node1 + block_node(Node5, Peer5), + try + %% Pid5 and Pid3 could contact each other. + %% Pid3 could contact Pid1 (they are joined). + %% But Pid5 cannot contact Pid1. + {error, {task_failed, check_could_reach_each_other_failed, _}} = + rpc(Peer5, cets_join, join, [lock_name(Config), #{}, Pid5, Pid3]), + %% Still connected + cets:insert(Pid1, {r1}), + {ok, [{r1}]} = cets:remote_dump(Pid3), + [Pid3] = cets:other_pids(Pid1), + [Pid1] = cets:other_pids(Pid3) + after + reconnect_node(Node5, Peer5) + end, + [] = cets:other_pids(Pid5). + +%% Joins from a good (fully connected) node +joining_not_fully_connected_node_is_not_allowed2(Config) -> + #{ct3 := Peer3, ct5 := Peer5} = proplists:get_value(peers, Config), + #{ct5 := Node5} = proplists:get_value(nodes, Config), + Node1 = node(), + Tab = make_name(Config), + {ok, Pid1} = start(Node1, Tab), + {ok, Pid3} = start(Peer3, Tab), + {ok, Pid5} = start(Peer5, Tab), + ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid3), + %% No connection between Peer5 and Node1 + block_node(Node5, Peer5), + try + %% Pid5 and Pid3 could contact each other. + %% Pid3 could contact Pid1 (they are joined). + %% But Pid5 cannot contact Pid1. 
+ {error, {task_failed, check_could_reach_each_other_failed, _}} = rpc( + Peer3, cets_join, join, [ + lock_name(Config), #{}, Pid5, Pid3 + ] + ), + %% Still connected + cets:insert(Pid1, {r1}), + {ok, [{r1}]} = cets:remote_dump(Pid3), + [Pid3] = cets:other_pids(Pid1), + [Pid1] = cets:other_pids(Pid3) + after + reconnect_node(Node5, Peer5) + end, + [] = cets:other_pids(Pid5). + +%% Helpers send_join_start_back_and_wait_for_continue_joining() -> Me = self(), From 53cda772337b577710eeb5dd4eeaa0ee4acca7ef Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 17:28:38 +0100 Subject: [PATCH 19/30] Move join_interrupted_when_ping_crashes --- test/cets_SUITE.erl | 15 --------------- test/cets_join_SUITE.erl | 24 ++++++++++++++++++++---- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index 5ab3237..32c2ad6 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -194,7 +194,6 @@ seq_cases() -> insert_returns_when_netsplit, inserts_after_netsplit_reconnects, cets_ping_all_returns_when_ping_crashes, - join_interrupted_when_ping_crashes, ping_pairs_returns_pongs, ping_pairs_returns_earlier, pre_connect_fails_on_our_node, @@ -206,7 +205,6 @@ seq_cases() -> cets_seq_no_log_cases() -> [ - join_interrupted_when_ping_crashes, node_down_history_is_updated_when_netsplit_happens, send_check_servers_is_called_before_last_server_got_dump, remote_ops_are_not_sent_before_last_server_got_dump @@ -1503,19 +1501,6 @@ cets_ping_all_returns_when_ping_crashes(Config) -> ?assertMatch({error, [{Pid2, {'EXIT', {simulate_crash, _}}}]}, cets:ping_all(Pid1)), meck:unload(). 
-join_interrupted_when_ping_crashes(Config) -> - #{pid1 := Pid1, pid2 := Pid2} = given_two_joined_tables(Config), - Tab3 = make_name(Config, 3), - {ok, Pid3} = start_local(Tab3, #{}), - meck:new(cets, [passthrough]), - meck:expect(cets_call, long_call, fun - (Server, ping) when Server == Pid2 -> error(simulate_crash); - (Server, Msg) -> meck:passthrough([Server, Msg]) - end), - Res = cets_join:join(lock_name(Config), #{}, Pid1, Pid3), - ?assertMatch({error, {task_failed, ping_all_failed, #{}}}, Res), - meck:unload(). - node_down_history_is_updated_when_netsplit_happens(Config) -> %% node_down_history is available in cets:info/1 API. %% It could be used for manual debugging in situations diff --git a/test/cets_join_SUITE.erl b/test/cets_join_SUITE.erl index a81d24e..2f02212 100644 --- a/test/cets_join_SUITE.erl +++ b/test/cets_join_SUITE.erl @@ -71,8 +71,8 @@ all() -> [ {group, cets}, {group, cets_no_log}, - {group, cets_seq} - % {group, cets_seq_no_log} + {group, cets_seq}, + {group, cets_seq_no_log} ]. only_for_logger_cases() -> @@ -121,11 +121,14 @@ cases() -> seq_cases() -> [ joining_not_fully_connected_node_is_not_allowed, - joining_not_fully_connected_node_is_not_allowed2 + joining_not_fully_connected_node_is_not_allowed2, + join_interrupted_when_ping_crashes ]. cets_seq_no_log_cases() -> - []. + [ + join_interrupted_when_ping_crashes + ]. init_per_suite(Config) -> cets_test_setup:init_cleanup_table(), @@ -604,6 +607,19 @@ joining_not_fully_connected_node_is_not_allowed2(Config) -> end, [] = cets:other_pids(Pid5). 
+join_interrupted_when_ping_crashes(Config) -> + #{pid1 := Pid1, pid2 := Pid2} = given_two_joined_tables(Config), + Tab3 = make_name(Config, 3), + {ok, Pid3} = start_local(Tab3, #{}), + meck:new(cets, [passthrough]), + meck:expect(cets_call, long_call, fun + (Server, ping) when Server == Pid2 -> error(simulate_crash); + (Server, Msg) -> meck:passthrough([Server, Msg]) + end), + Res = cets_join:join(lock_name(Config), #{}, Pid1, Pid3), + ?assertMatch({error, {task_failed, ping_all_failed, #{}}}, Res), + meck:unload(). + %% Helpers send_join_start_back_and_wait_for_continue_joining() -> From 19fba2fb667f3403683538ab12a18c5365bc074b Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 12 Mar 2024 17:55:55 +0100 Subject: [PATCH 20/30] Move tests into cets_netsplit_SUITE --- test/cets_SUITE.erl | 131 ------------------ test/cets_netsplit_SUITE.erl | 259 +++++++++++++++++++++++++++++++++++ 2 files changed, 259 insertions(+), 131 deletions(-) create mode 100644 test/cets_netsplit_SUITE.erl diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index 32c2ad6..11e7b85 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -168,8 +168,6 @@ cases() -> send_leader_op_throws_noproc, pinfo_returns_value, pinfo_returns_undefined, - cets_ping_non_existing_node, - cets_ping_net_family, ignore_send_dump_received_when_unpaused, ignore_send_dump_received_when_paused_with_another_pause_ref, pause_on_remote_node_returns_if_monitor_process_dies @@ -191,13 +189,6 @@ only_for_logger_cases() -> seq_cases() -> [ - insert_returns_when_netsplit, - inserts_after_netsplit_reconnects, - cets_ping_all_returns_when_ping_crashes, - ping_pairs_returns_pongs, - ping_pairs_returns_earlier, - pre_connect_fails_on_our_node, - pre_connect_fails_on_one_of_the_nodes, send_check_servers_is_called_before_last_server_got_dump, remote_ops_are_not_sent_before_last_server_got_dump, pause_on_remote_node_crashes @@ -205,7 +196,6 @@ seq_cases() -> cets_seq_no_log_cases() -> [ - 
node_down_history_is_updated_when_netsplit_happens, send_check_servers_is_called_before_last_server_got_dump, remote_ops_are_not_sent_before_last_server_got_dump ]. @@ -1449,127 +1439,6 @@ pinfo_returns_value(_Config) -> pinfo_returns_undefined(_Config) -> undefined = cets_long:pinfo(stopped_pid(), messages). -%% Netsplit cases (run in sequence) - -insert_returns_when_netsplit(Config) -> - #{ct5 := Peer5} = proplists:get_value(peers, Config), - #{ct5 := Node5} = proplists:get_value(nodes, Config), - Node1 = node(), - Tab = make_name(Config), - {ok, Pid1} = start(Node1, Tab), - {ok, Pid5} = start(Peer5, Tab), - ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid5), - sys:suspend(Pid5), - R = cets:insert_request(Tab, {1, test}), - block_node(Node5, Peer5), - try - {reply, ok} = cets:wait_response(R, 5000) - after - reconnect_node(Node5, Peer5) - end. - -inserts_after_netsplit_reconnects(Config) -> - #{ct5 := Peer5} = proplists:get_value(peers, Config), - #{ct5 := Node5} = proplists:get_value(nodes, Config), - Node1 = node(), - Tab = make_name(Config), - {ok, Pid1} = start(Node1, Tab), - {ok, Pid5} = start(Peer5, Tab), - ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid5), - sys:suspend(Pid5), - R = cets:insert_request(Tab, {1, v1}), - block_node(Node5, Peer5), - try - {reply, ok} = cets:wait_response(R, 5000) - after - reconnect_node(Node5, Peer5) - end, - sys:resume(Pid5), - cets:insert(Pid1, {1, v2}), - cets:insert(Pid5, {1, v3}), - %% No automatic recovery - [{1, v2}] = dump(Node1, Tab), - [{1, v3}] = dump(Peer5, Tab). - -cets_ping_all_returns_when_ping_crashes(Config) -> - #{pid1 := Pid1, pid2 := Pid2} = given_two_joined_tables(Config), - meck:new(cets, [passthrough]), - meck:expect(cets_call, long_call, fun - (Server, ping) when Server == Pid2 -> error(simulate_crash); - (Server, Msg) -> meck:passthrough([Server, Msg]) - end), - ?assertMatch({error, [{Pid2, {'EXIT', {simulate_crash, _}}}]}, cets:ping_all(Pid1)), - meck:unload(). 
- -node_down_history_is_updated_when_netsplit_happens(Config) -> - %% node_down_history is available in cets:info/1 API. - %% It could be used for manual debugging in situations - %% we get netsplits or during rolling upgrades. - #{ct5 := Peer5} = proplists:get_value(peers, Config), - #{ct5 := Node5} = proplists:get_value(nodes, Config), - Node1 = node(), - Tab = make_name(Config), - {ok, Pid1} = start(Node1, Tab), - {ok, Pid5} = start(Peer5, Tab), - ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid5), - block_node(Node5, Peer5), - try - F = fun() -> - History = maps:get(node_down_history, cets:info(Pid1)), - lists:map(fun(#{node := Node}) -> Node end, History) - end, - cets_test_wait:wait_until(F, [Node5]) - after - reconnect_node(Node5, Peer5), - cets:stop(Pid5) - end. - -cets_ping_non_existing_node(_Config) -> - pang = cets_ping:ping('mongooseim@non_existing_host'). - -pre_connect_fails_on_our_node(_Config) -> - cets_test_setup:mock_epmd(), - %% We would fail to connect to the remote EPMD but we would get an IP - pang = cets_ping:ping('mongooseim@resolvabletobadip'), - meck:unload(). - -pre_connect_fails_on_one_of_the_nodes(Config) -> - #{ct2 := Node2} = proplists:get_value(nodes, Config), - cets_test_setup:mock_epmd(), - %% We would get pong on Node2, but would fail an RPC to our hode - pang = rpc(Node2, cets_ping, ping, ['cetsnode1@localhost']), - History = meck:history(erl_epmd), - %% Check that Node2 called us - ?assertMatch( - [_], - [ - X - || {_, {erl_epmd, address_please, ["cetsnode1", "localhost", inet]}, - {ok, {192, 168, 100, 134}}} = X <- History - ], - History - ), - meck:unload(). - -cets_ping_net_family(_Config) -> - inet = cets_ping:net_family(error), - inet = cets_ping:net_family({ok, [["inet"]]}), - inet6 = cets_ping:net_family({ok, [["inet6"]]}), - inet6 = cets_ping:net_family({ok, [["inet6_tls"]]}). 
- -ping_pairs_returns_pongs(Config) -> - #{ct2 := Node2, ct3 := Node3} = proplists:get_value(nodes, Config), - Me = node(), - [{Me, Node2, pong}, {Node2, Node3, pong}] = - cets_ping:ping_pairs([{Me, Node2}, {Node2, Node3}]). - -ping_pairs_returns_earlier(Config) -> - #{ct2 := Node2, ct3 := Node3} = proplists:get_value(nodes, Config), - Me = node(), - Bad = 'badnode@localhost', - [{Me, Me, pong}, {Me, Node2, pong}, {Me, Bad, pang}, {Me, Node3, skipped}] = - cets_ping:ping_pairs([{Me, Me}, {Me, Node2}, {Me, Bad}, {Me, Node3}]). - %% Helper functions still_works(Pid) -> diff --git a/test/cets_netsplit_SUITE.erl b/test/cets_netsplit_SUITE.erl new file mode 100644 index 0000000..d20facb --- /dev/null +++ b/test/cets_netsplit_SUITE.erl @@ -0,0 +1,259 @@ +-module(cets_netsplit_SUITE). +-include_lib("common_test/include/ct.hrl"). +-include_lib("eunit/include/eunit.hrl"). +-include_lib("kernel/include/logger.hrl"). + +-compile([export_all, nowarn_export_all]). + +-import(cets_test_setup, [ + start/2, + start_local/1, + start_local/2, + start_disco/2, + start_simple_disco/0, + make_name/1, + make_name/2, + lock_name/1, + disco_name/1 +]). + +-import(cets_test_wait, [ + wait_for_down/1, + wait_for_ready/2, + wait_till_test_stage/2 +]). + +-import(cets_test_setup, [ + setup_two_nodes_and_discovery/1, + setup_two_nodes_and_discovery/2, + simulate_disco_restart/1, + make_signalling_process/0, + given_two_joined_tables/1 +]). + +-import(cets_test_wait, [ + wait_for_disco_timestamp_to_appear/3, + wait_for_disco_timestamp_to_be_updated/4 +]). + +-import(cets_test_receive, [ + receive_message/1, + flush_message/1 +]). + +-import(cets_test_peer, [ + block_node/2, + reconnect_node/2, + disconnect_node/2, + disconnect_node_by_name/2 +]). + +-import(cets_test_rpc, [ + rpc/4, + dump/2 +]). + +-import(cets_test_helper, [assert_unique/1]). + +-import(cets_test_rpc, [ + other_nodes/2 +]). + +all() -> + [ + {group, cets}, + {group, cets_seq}, + {group, cets_seq_no_log} + ]. 
+ +groups() -> + %% Cases should have unique names, because we name CETS servers based on case names + [ + {cets, [parallel, {repeat_until_any_fail, 3}], assert_unique(cases())}, + %% These tests actually simulate a netsplit on the distribution level. + %% Though, global's prevent_overlapping_partitions option starts kicking + %% all nodes from the cluster, so we have to be careful not to break other cases. + %% Setting prevent_overlapping_partitions=false on ct5 helps. + {cets_seq, [sequence, {repeat_until_any_fail, 2}], assert_unique(seq_cases())}, + {cets_seq_no_log, [sequence, {repeat_until_any_fail, 2}], + assert_unique(cets_seq_no_log_cases())} + ]. + +cases() -> + [ + cets_ping_non_existing_node, + cets_ping_net_family + ]. + +seq_cases() -> + [ + insert_returns_when_netsplit, + inserts_after_netsplit_reconnects, + cets_ping_all_returns_when_ping_crashes, + ping_pairs_returns_pongs, + ping_pairs_returns_earlier, + pre_connect_fails_on_our_node, + pre_connect_fails_on_one_of_the_nodes + ]. + +cets_seq_no_log_cases() -> + [ + node_down_history_is_updated_when_netsplit_happens + ]. + +init_per_suite(Config) -> + cets_test_setup:init_cleanup_table(), + cets_test_peer:start([ct2, ct3, ct5], Config). + +end_per_suite(Config) -> + cets_test_setup:remove_cleanup_table(), + cets_test_peer:stop(Config), + Config. + +init_per_group(Group, Config) when Group == cets_seq_no_log; Group == cets_no_log -> + [ok = logger:set_module_level(M, none) || M <- log_modules()], + Config; +init_per_group(_Group, Config) -> + Config. + +end_per_group(Group, Config) when Group == cets_seq_no_log; Group == cets_no_log -> + [ok = logger:unset_module_level(M) || M <- log_modules()], + Config; +end_per_group(_Group, Config) -> + Config. + +init_per_testcase(test_multinode_auto_discovery = Name, Config) -> + ct:make_priv_dir(), + init_per_testcase_generic(Name, Config); +init_per_testcase(Name, Config) -> + init_per_testcase_generic(Name, Config). 
+ +init_per_testcase_generic(Name, Config) -> + [{testcase, Name} | Config]. + +end_per_testcase(_, _Config) -> + cets_test_setup:wait_for_cleanup(), + ok. + +%% Modules that use a multiline LOG_ macro +log_modules() -> + [cets, cets_call, cets_long, cets_join, cets_discovery]. + +insert_returns_when_netsplit(Config) -> + #{ct5 := Peer5} = proplists:get_value(peers, Config), + #{ct5 := Node5} = proplists:get_value(nodes, Config), + Node1 = node(), + Tab = make_name(Config), + {ok, Pid1} = start(Node1, Tab), + {ok, Pid5} = start(Peer5, Tab), + ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid5), + sys:suspend(Pid5), + R = cets:insert_request(Tab, {1, test}), + block_node(Node5, Peer5), + try + {reply, ok} = cets:wait_response(R, 5000) + after + reconnect_node(Node5, Peer5) + end. + +inserts_after_netsplit_reconnects(Config) -> + #{ct5 := Peer5} = proplists:get_value(peers, Config), + #{ct5 := Node5} = proplists:get_value(nodes, Config), + Node1 = node(), + Tab = make_name(Config), + {ok, Pid1} = start(Node1, Tab), + {ok, Pid5} = start(Peer5, Tab), + ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid5), + sys:suspend(Pid5), + R = cets:insert_request(Tab, {1, v1}), + block_node(Node5, Peer5), + try + {reply, ok} = cets:wait_response(R, 5000) + after + reconnect_node(Node5, Peer5) + end, + sys:resume(Pid5), + cets:insert(Pid1, {1, v2}), + cets:insert(Pid5, {1, v3}), + %% No automatic recovery + [{1, v2}] = dump(Node1, Tab), + [{1, v3}] = dump(Peer5, Tab). + +cets_ping_all_returns_when_ping_crashes(Config) -> + #{pid1 := Pid1, pid2 := Pid2} = given_two_joined_tables(Config), + meck:new(cets, [passthrough]), + meck:expect(cets_call, long_call, fun + (Server, ping) when Server == Pid2 -> error(simulate_crash); + (Server, Msg) -> meck:passthrough([Server, Msg]) + end), + ?assertMatch({error, [{Pid2, {'EXIT', {simulate_crash, _}}}]}, cets:ping_all(Pid1)), + meck:unload(). 
+ +node_down_history_is_updated_when_netsplit_happens(Config) -> + %% node_down_history is available in cets:info/1 API. + %% It could be used for manual debugging in situations + %% we get netsplits or during rolling upgrades. + #{ct5 := Peer5} = proplists:get_value(peers, Config), + #{ct5 := Node5} = proplists:get_value(nodes, Config), + Node1 = node(), + Tab = make_name(Config), + {ok, Pid1} = start(Node1, Tab), + {ok, Pid5} = start(Peer5, Tab), + ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid5), + block_node(Node5, Peer5), + try + F = fun() -> + History = maps:get(node_down_history, cets:info(Pid1)), + lists:map(fun(#{node := Node}) -> Node end, History) + end, + cets_test_wait:wait_until(F, [Node5]) + after + reconnect_node(Node5, Peer5), + cets:stop(Pid5) + end. + +cets_ping_non_existing_node(_Config) -> + pang = cets_ping:ping('mongooseim@non_existing_host'). + +pre_connect_fails_on_our_node(_Config) -> + cets_test_setup:mock_epmd(), + %% We would fail to connect to the remote EPMD but we would get an IP + pang = cets_ping:ping('mongooseim@resolvabletobadip'), + meck:unload(). + +pre_connect_fails_on_one_of_the_nodes(Config) -> + #{ct2 := Node2} = proplists:get_value(nodes, Config), + cets_test_setup:mock_epmd(), + %% We would get pong on Node2, but would fail an RPC to our hode + pang = rpc(Node2, cets_ping, ping, ['cetsnode1@localhost']), + History = meck:history(erl_epmd), + %% Check that Node2 called us + ?assertMatch( + [_], + [ + X + || {_, {erl_epmd, address_please, ["cetsnode1", "localhost", inet]}, + {ok, {192, 168, 100, 134}}} = X <- History + ], + History + ), + meck:unload(). + +cets_ping_net_family(_Config) -> + inet = cets_ping:net_family(error), + inet = cets_ping:net_family({ok, [["inet"]]}), + inet6 = cets_ping:net_family({ok, [["inet6"]]}), + inet6 = cets_ping:net_family({ok, [["inet6_tls"]]}). 
+ +ping_pairs_returns_pongs(Config) -> + #{ct2 := Node2, ct3 := Node3} = proplists:get_value(nodes, Config), + Me = node(), + [{Me, Node2, pong}, {Node2, Node3, pong}] = + cets_ping:ping_pairs([{Me, Node2}, {Node2, Node3}]). + +ping_pairs_returns_earlier(Config) -> + #{ct2 := Node2, ct3 := Node3} = proplists:get_value(nodes, Config), + Me = node(), + Bad = 'badnode@localhost', + [{Me, Me, pong}, {Me, Node2, pong}, {Me, Bad, pang}, {Me, Node3, skipped}] = + cets_ping:ping_pairs([{Me, Me}, {Me, Node2}, {Me, Bad}, {Me, Node3}]). From c19a3c4f1168d6ecafe012090dc3313febdfd59f Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Thu, 9 May 2024 21:59:43 +0200 Subject: [PATCH 21/30] Move start_link_local test --- test/cets_SUITE.erl | 13 +------------ test/cets_join_SUITE.erl | 15 +++++++++++---- test/cets_test_setup.erl | 12 ++++++++++++ 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index 11e7b85..8e49f81 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -99,7 +99,6 @@ cases() -> delete_works, delete_many_works, inserted_records_could_be_read_back_from_replicated_table, - bag_with_conflict_handler_not_allowed_for_start_link, insert_new_works, insert_new_works_with_table_name, insert_new_works_when_leader_is_back, @@ -237,7 +236,7 @@ log_modules() -> start_link_inits_and_accepts_records(Config) -> Tab = make_name(Config), - start_link_local(Tab), + cets_test_setup:start_link_local(Tab), cets:insert(Tab, {alice, 32}), [{alice, 32}] = ets:lookup(Tab, alice). @@ -1447,16 +1446,6 @@ still_works(Pid) -> ok = cets:insert(Pid, {1}), {ok, [{1}]} = cets:remote_dump(Pid). -start_link_local(Name) -> - start_link_local(Name, #{}). - -start_link_local(Name, Opts) -> - catch cets:stop(Name), - wait_for_name_to_be_free(node(), Name), - {ok, Pid} = cets:start_link(Name, Opts), - schedule_cleanup(Pid), - {ok, Pid}. 
- stopped_pid() -> %% Get a pid for a stopped process {Pid, Mon} = spawn_monitor(fun() -> ok end), diff --git a/test/cets_join_SUITE.erl b/test/cets_join_SUITE.erl index 2f02212..ac44c62 100644 --- a/test/cets_join_SUITE.erl +++ b/test/cets_join_SUITE.erl @@ -99,10 +99,13 @@ groups() -> cases() -> [ join_works, + join_works_with_existing_data, + join_works_with_existing_data_with_conflicts, join_works_with_existing_data_with_conflicts_and_defined_conflict_handler, join_works_with_existing_data_with_conflicts_and_defined_conflict_handler_and_more_keys, join_works_with_existing_data_with_conflicts_and_defined_conflict_handler_and_keypos2, bag_with_conflict_handler_not_allowed, + bag_with_conflict_handler_not_allowed_for_start_link, join_with_the_same_pid, join_ref_is_same_after_join, join_fails_because_server_process_not_found, @@ -194,6 +197,14 @@ join_works_with_existing_data_with_conflicts(Config) -> [{alice, 33}] = ets:lookup(Tab1, alice), [{alice, 32}] = ets:lookup(Tab2, alice). +bag_with_conflict_handler_not_allowed(Config) -> + {error, [bag_with_conflict_handler]} = + cets:start(make_name(Config), #{handle_conflict => fun resolve_highest/2, type => bag}). + +bag_with_conflict_handler_not_allowed_for_start_link(Config) -> + {error, [bag_with_conflict_handler]} = + cets:start_link(make_name(Config), #{handle_conflict => fun resolve_highest/2, type => bag}). + join_works_with_existing_data_with_conflicts_and_defined_conflict_handler(Config) -> Opts = #{handle_conflict => fun resolve_highest/2}, Tab1 = make_name(Config, 1), @@ -251,10 +262,6 @@ resolve_user_conflict(_U1, U2) -> resolve_highest({K, A}, {K, B}) -> {K, max(A, B)}. -bag_with_conflict_handler_not_allowed(Config) -> - {error, [bag_with_conflict_handler]} = - cets:start(make_name(Config), #{handle_conflict => fun resolve_highest/2, type => bag}). 
- join_with_the_same_pid(Config) -> Tab = make_name(Config), {ok, Pid} = start_local(Tab), diff --git a/test/cets_test_setup.erl b/test/cets_test_setup.erl index 3decf1d..48715fe 100644 --- a/test/cets_test_setup.erl +++ b/test/cets_test_setup.erl @@ -13,6 +13,8 @@ -export([ start_local/1, start_local/2, + start_link_local/1, + start_link_local/2, start/2, start_disco/2, start_simple_disco/0 @@ -80,6 +82,16 @@ start(Node, Tab) -> schedule_cleanup(Pid), {ok, Pid}. +start_link_local(Name) -> + start_link_local(Name, #{}). + +start_link_local(Name, Opts) -> + catch cets:stop(Name), + cets_test_wait:wait_for_name_to_be_free(node(), Name), + {ok, Pid} = cets:start_link(Name, Opts), + schedule_cleanup(Pid), + {ok, Pid}. + start_disco(Node, Opts) -> case Opts of #{name := Name} -> From e3ad79cccd6c61a8bbed05149ba1afa2906ee597 Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Thu, 9 May 2024 22:17:13 +0200 Subject: [PATCH 22/30] Debug tests --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f452c10..9e6b947 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,7 +22,7 @@ jobs: - name: Compile run: rebar3 as test compile - name: Run tests - run: rebar3 cover_tests + run: DEBUG=1 rebar3 cover_tests - name: Send test coverage report run: rebar3 as test codecov analyze - name: Upload coverage reports to Codecov From a211b72aeba53d9ff5fd6604e2f8562bcc176d3a Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Fri, 10 May 2024 13:15:32 +0200 Subject: [PATCH 23/30] Add 10 seconds timetrap --- test/app.config | 1 + test/cets_SUITE.erl | 3 +++ test/cets_disco_SUITE.erl | 3 +++ test/cets_join_SUITE.erl | 3 +++ test/cets_status_SUITE.erl | 3 +++ test/cets_test_setup.erl | 5 +++++ 6 files changed, 18 insertions(+) create mode 100644 test/app.config diff --git a/test/app.config b/test/app.config new file mode 100644 index 0000000..f468a9d --- /dev/null +++ 
b/test/app.config @@ -0,0 +1 @@ +[{timetrap, 1}]. diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index 8e49f81..f1f2534 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -65,6 +65,9 @@ set_join_ref/2 ]). +suite() -> + cets_test_setup:suite(). + all() -> [ {group, cets}, diff --git a/test/cets_disco_SUITE.erl b/test/cets_disco_SUITE.erl index 08b410d..4f3052d 100644 --- a/test/cets_disco_SUITE.erl +++ b/test/cets_disco_SUITE.erl @@ -56,6 +56,9 @@ other_nodes/2 ]). +suite() -> + cets_test_setup:suite(). + all() -> [ {group, cets}, diff --git a/test/cets_join_SUITE.erl b/test/cets_join_SUITE.erl index ac44c62..2815241 100644 --- a/test/cets_join_SUITE.erl +++ b/test/cets_join_SUITE.erl @@ -67,6 +67,9 @@ other_nodes/2 ]). +suite() -> + cets_test_setup:suite(). + all() -> [ {group, cets}, diff --git a/test/cets_status_SUITE.erl b/test/cets_status_SUITE.erl index a6bb648..01ce0a4 100644 --- a/test/cets_status_SUITE.erl +++ b/test/cets_status_SUITE.erl @@ -50,6 +50,9 @@ other_nodes/2 ]). +suite() -> + cets_test_setup:suite(). + all() -> [ {group, cets} diff --git a/test/cets_test_setup.erl b/test/cets_test_setup.erl index 48715fe..d6827a2 100644 --- a/test/cets_test_setup.erl +++ b/test/cets_test_setup.erl @@ -1,4 +1,6 @@ -module(cets_test_setup). +-export([suite/0]). + -export([ mock_epmd/0, mock_pause_on_remote_node_failing/0 @@ -51,6 +53,9 @@ -import(cets_test_rpc, [rpc/4]). +suite() -> + [{timetrap, {seconds, 10}}]. 
+ mock_epmd() -> meck:new(erl_epmd, [passthrough, unstick]), meck:expect(erl_epmd, address_please, fun From 07839a6989e4a76d4b608c37af8c08c96ea14d3f Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Fri, 10 May 2024 13:24:34 +0200 Subject: [PATCH 24/30] Use init:stop() to stop peers in tests --- test/cets_test_peer.erl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/cets_test_peer.erl b/test/cets_test_peer.erl index 632efbe..be33877 100644 --- a/test/cets_test_peer.erl +++ b/test/cets_test_peer.erl @@ -34,7 +34,10 @@ name(Node) -> start_node(Id) -> {ok, Peer, Node} = ?CT_PEER(#{ - name => name(Id), connection => standard_io, args => extra_args(Id) + name => name(Id), + connection => standard_io, + args => extra_args(Id), + shutdown => 3000 }), %% Register so we can find Peer process later in code register(node_to_peer_name(Node), Peer), From 0d98f7471dd94a5071c11adfbbab580849d90fcd Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Mon, 13 May 2024 17:19:11 +0200 Subject: [PATCH 25/30] Monitorr stopping peers --- test/cets_test_peer.erl | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/test/cets_test_peer.erl b/test/cets_test_peer.erl index be33877..26353e7 100644 --- a/test/cets_test_peer.erl +++ b/test/cets_test_peer.erl @@ -26,7 +26,14 @@ start(Names, Config) -> stop(Config) -> Peers = proplists:get_value(peers, Config), - [peer:stop(Peer) || Peer <- maps:values(Peers)], + [ + slow_task( + "peer:stop:self", + self(), + fun() -> slow_task("peer:stop", Peer, fun() -> peer:stop(Peer) end) end + ) + || Peer <- maps:values(Peers) + ], ok. name(Node) -> @@ -98,3 +105,18 @@ disconnect_node_by_name(Config, Id) -> lists:member(Node, nodes()) end, cets_test_wait:wait_until(F, false). + +slow_task(What, Self, F) -> + Pid = spawn_link(fun() -> monitor_loop(What, Self) end), + Res = F(), + Pid ! stop, + Res. 
+ +monitor_loop(What, Pid) -> + receive + stop -> + ok + after 1000 -> + ct:pal("monitor_loop ~p ~p", [What, erlang:process_info(Pid, current_stacktrace)]), + monitor_loop(What, Pid) + end. From b9908fc54b0dd0286f4b5c0668372263b3cd0a40 Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Tue, 14 May 2024 17:37:20 +0200 Subject: [PATCH 26/30] Reuse nodes in cets_test_peer cover logic is broken in Erlang 26. So, peer:stop/1 should not be called --- .github/workflows/ci.yml | 2 +- test/app.config | 1 - test/cets_test_peer.erl | 47 ++++++++++++++++------------------------ 3 files changed, 20 insertions(+), 30 deletions(-) delete mode 100644 test/app.config diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9e6b947..f452c10 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,7 +22,7 @@ jobs: - name: Compile run: rebar3 as test compile - name: Run tests - run: DEBUG=1 rebar3 cover_tests + run: rebar3 cover_tests - name: Send test coverage report run: rebar3 as test codecov analyze - name: Upload coverage reports to Codecov diff --git a/test/app.config b/test/app.config deleted file mode 100644 index f468a9d..0000000 --- a/test/app.config +++ /dev/null @@ -1 +0,0 @@ -[{timetrap, 1}]. diff --git a/test/cets_test_peer.erl b/test/cets_test_peer.erl index 26353e7..d2fedba 100644 --- a/test/cets_test_peer.erl +++ b/test/cets_test_peer.erl @@ -17,7 +17,7 @@ -include_lib("common_test/include/ct.hrl"). start(Names, Config) -> - {Nodes, Peers} = lists:unzip([start_node(N) || N <- Names]), + {Nodes, Peers} = lists:unzip([find_or_start_node(N) || N <- Names]), [ {nodes, maps:from_list(lists:zip(Names, Nodes))}, {peers, maps:from_list(lists:zip(Names, Peers))} @@ -25,20 +25,27 @@ start(Names, Config) -> ]. stop(Config) -> - Peers = proplists:get_value(peers, Config), + %% peer:stop/1 freezes in the code cover logic. + %% So, we reuse nodes between different suites. + %% Ensure that the nodes are connected again. 
+ Nodes = proplists:get_value(nodes, Config), [ - slow_task( - "peer:stop:self", - self(), - fun() -> slow_task("peer:stop", Peer, fun() -> peer:stop(Peer) end) end - ) - || Peer <- maps:values(Peers) + reconnect_node(Node, node_to_peer(Node)) + || Node <- maps:values(Nodes) ], ok. name(Node) -> list_to_atom(peer:random_name(atom_to_list(Node))). +find_or_start_node(Id) -> + case persistent_term:get({id_to_node_peer, Id}, undefined) of + undefined -> + start_node(Id); + NodePeer -> + NodePeer + end. + start_node(Id) -> {ok, Peer, Node} = ?CT_PEER(#{ name => name(Id), @@ -47,7 +54,8 @@ start_node(Id) -> shutdown => 3000 }), %% Register so we can find Peer process later in code - register(node_to_peer_name(Node), Peer), + persistent_term:put({node_to_peer, Node}, Peer), + persistent_term:put({id_to_node_peer, Id}, {Node, Peer}), %% Keep nodes running after init_per_suite is finished unlink(Peer), %% Do RPC using alternative connection method @@ -60,16 +68,13 @@ node_to_peer(Node) when Node =:= node() -> %% There is no peer for the local CT node Node; node_to_peer(Node) when is_atom(Node) -> - case whereis(node_to_peer_name(Node)) of + case persistent_term:get({node_to_peer, Node}) of Pid when is_pid(Pid) -> Pid; undefined -> ct:fail({node_to_peer_failed, Node}) end. -node_to_peer_name(Node) -> - list_to_atom(atom_to_list(Node) ++ "_peer"). - %% Set epmd_port for better coverage extra_args(ct2) -> ["-epmd_port", "4369"]; @@ -88,6 +93,7 @@ block_node(Node, Peer) when is_atom(Node), is_pid(Peer) -> reconnect_node(Node, Peer) when is_atom(Node), is_pid(Peer) -> rpc(Peer, erlang, set_cookie, [node(), erlang:get_cookie()]), + erlang:set_cookie(Node, erlang:get_cookie()), %% Very rarely it could return pang cets_test_wait:wait_until(fun() -> rpc(Peer, net_adm, ping, [node()]) end, pong), cets_test_wait:wait_until(fun() -> rpc(node(), net_adm, ping, [Node]) end, pong). 
@@ -105,18 +111,3 @@ disconnect_node_by_name(Config, Id) -> lists:member(Node, nodes()) end, cets_test_wait:wait_until(F, false). - -slow_task(What, Self, F) -> - Pid = spawn_link(fun() -> monitor_loop(What, Self) end), - Res = F(), - Pid ! stop, - Res. - -monitor_loop(What, Pid) -> - receive - stop -> - ok - after 1000 -> - ct:pal("monitor_loop ~p ~p", [What, erlang:process_info(Pid, current_stacktrace)]), - monitor_loop(What, Pid) - end. From 6492e60ca5e151952c03bab0d42a4f335e917a63 Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Thu, 16 May 2024 17:30:02 +0200 Subject: [PATCH 27/30] Remove unused imports --- test/cets_SUITE.erl | 6 ------ 1 file changed, 6 deletions(-) diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index f1f2534..b6a513a 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -13,8 +13,6 @@ -compile([export_all, nowarn_export_all]). -import(cets_test_peer, [ - block_node/2, - reconnect_node/2, disconnect_node/2 ]). @@ -40,14 +38,11 @@ given_two_joined_tables/1, given_two_joined_tables/2, given_3_servers/1, - given_3_servers/2, - given_n_servers/3, make_process/0 ]). -import(cets_test_wait, [ wait_for_down/1, - wait_for_ready/2, wait_for_unpaused/3, wait_for_join_ref_to_match/2, wait_till_test_stage/2, @@ -61,7 +56,6 @@ -import(cets_test_helper, [ assert_unique/1, - set_other_servers/2, set_join_ref/2 ]). 
From 5f6d9483b7132d60e110caecdd9b9d1b5132125c Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Thu, 16 May 2024 17:33:10 +0200 Subject: [PATCH 28/30] Move servers_remove_each_other_if_join_refs_do_not_match_after_unpause --- test/cets_SUITE.erl | 15 --------------- test/cets_join_SUITE.erl | 17 ++++++++++++++++- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/test/cets_SUITE.erl b/test/cets_SUITE.erl index b6a513a..3fa782f 100644 --- a/test/cets_SUITE.erl +++ b/test/cets_SUITE.erl @@ -116,7 +116,6 @@ cases() -> insert_serial_blocks_when_leader_is_not_back, leader_is_the_same_in_metadata_after_join, send_dump_contains_already_added_servers, - servers_remove_each_other_if_join_refs_do_not_match_after_unpause, test_multinode, test_multinode_remote_insert, node_list_is_correct, @@ -783,20 +782,6 @@ send_dump_contains_already_added_servers(Config) -> cets:unpause(Pid1, PauseRef), {ok, [{1}]} = cets:remote_dump(Pid1). -servers_remove_each_other_if_join_refs_do_not_match_after_unpause(Config) -> - {ok, Pid1} = start_local(make_name(Config, 1)), - {ok, Pid2} = start_local(make_name(Config, 2)), - %% cets:send_check_servers function is only called after all pauses are unpaused - PauseRef1 = cets:pause(Pid1), - PauseRef2 = cets:pause(Pid2), - ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{}), - %% send_check_servers is not called yet, because we are still pausing. - %% Mess with join_ref in the state. - set_join_ref(Pid1, make_ref()), - cets:unpause(Pid1, PauseRef1), - cets:unpause(Pid2, PauseRef2), - cets_test_wait:wait_until(fun() -> maps:get(other_servers, cets:info(Pid1)) end, []). - ignore_send_dump_received_when_paused_with_another_pause_ref(Config) -> ignore_send_dump_received_when_unpaused([{extra_pause, true} | Config]). 
diff --git a/test/cets_join_SUITE.erl b/test/cets_join_SUITE.erl index 2815241..99b835f 100644 --- a/test/cets_join_SUITE.erl +++ b/test/cets_join_SUITE.erl @@ -121,7 +121,8 @@ cases() -> join_fails_because_pids_do_not_match_for_nodes_in_segment, join_fails_because_servers_overlap, remote_ops_are_ignored_if_join_ref_does_not_match, - join_retried_if_lock_is_busy + join_retried_if_lock_is_busy, + servers_remove_each_other_if_join_refs_do_not_match_after_unpause ]. seq_cases() -> @@ -644,3 +645,17 @@ send_join_start_back_and_wait_for_continue_joining() -> (_) -> ok end. + +servers_remove_each_other_if_join_refs_do_not_match_after_unpause(Config) -> + {ok, Pid1} = start_local(make_name(Config, 1)), + {ok, Pid2} = start_local(make_name(Config, 2)), + %% cets:send_check_servers function is only called after all pauses are unpaused + PauseRef1 = cets:pause(Pid1), + PauseRef2 = cets:pause(Pid2), + ok = cets_join:join(lock_name(Config), #{}, Pid1, Pid2, #{}), + %% send_check_servers is not called yet, because we are still pausing. + %% Mess with join_ref in the state. + set_join_ref(Pid1, make_ref()), + cets:unpause(Pid1, PauseRef1), + cets:unpause(Pid2, PauseRef2), + cets_test_wait:wait_until(fun() -> maps:get(other_servers, cets:info(Pid1)) end, []). From 0209ac8c41bbbc5cb0248ca8b349f1af89ade93b Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Thu, 16 May 2024 17:42:15 +0200 Subject: [PATCH 29/30] Remove unused imports --- test/cets_join_SUITE.erl | 31 +++---------------------------- test/cets_netsplit_SUITE.erl | 31 ++----------------------------- test/cets_status_SUITE.erl | 19 +------------------ 3 files changed, 6 insertions(+), 75 deletions(-) diff --git a/test/cets_join_SUITE.erl b/test/cets_join_SUITE.erl index 99b835f..bfae46a 100644 --- a/test/cets_join_SUITE.erl +++ b/test/cets_join_SUITE.erl @@ -9,25 +9,12 @@ start/2, start_local/1, start_local/2, - start_disco/2, - start_simple_disco/0, make_name/1, make_name/2, - lock_name/1, - disco_name/1 -]). 
- --import(cets_test_wait, [ - wait_for_down/1, - wait_for_ready/2, - wait_till_test_stage/2 + lock_name/1 ]). -import(cets_test_setup, [ - setup_two_nodes_and_discovery/1, - setup_two_nodes_and_discovery/2, - simulate_disco_restart/1, - make_signalling_process/0, given_two_joined_tables/1, given_two_joined_tables/2, given_3_servers/1, @@ -35,22 +22,14 @@ given_n_servers/3 ]). --import(cets_test_wait, [ - wait_for_disco_timestamp_to_appear/3, - wait_for_disco_timestamp_to_be_updated/4 -]). - -import(cets_test_receive, [ receive_message/1, - receive_message_with_arg/1, - flush_message/1 + receive_message_with_arg/1 ]). -import(cets_test_peer, [ block_node/2, - reconnect_node/2, - disconnect_node/2, - disconnect_node_by_name/2 + reconnect_node/2 ]). -import(cets_test_rpc, [ @@ -63,10 +42,6 @@ assert_unique/1 ]). --import(cets_test_rpc, [ - other_nodes/2 -]). - suite() -> cets_test_setup:suite(). diff --git a/test/cets_netsplit_SUITE.erl b/test/cets_netsplit_SUITE.erl index d20facb..63687b1 100644 --- a/test/cets_netsplit_SUITE.erl +++ b/test/cets_netsplit_SUITE.erl @@ -7,45 +7,18 @@ -import(cets_test_setup, [ start/2, - start_local/1, - start_local/2, - start_disco/2, - start_simple_disco/0, make_name/1, make_name/2, - lock_name/1, - disco_name/1 -]). - --import(cets_test_wait, [ - wait_for_down/1, - wait_for_ready/2, - wait_till_test_stage/2 + lock_name/1 ]). -import(cets_test_setup, [ - setup_two_nodes_and_discovery/1, - setup_two_nodes_and_discovery/2, - simulate_disco_restart/1, - make_signalling_process/0, given_two_joined_tables/1 ]). --import(cets_test_wait, [ - wait_for_disco_timestamp_to_appear/3, - wait_for_disco_timestamp_to_be_updated/4 -]). - --import(cets_test_receive, [ - receive_message/1, - flush_message/1 -]). - -import(cets_test_peer, [ block_node/2, - reconnect_node/2, - disconnect_node/2, - disconnect_node_by_name/2 + reconnect_node/2 ]). 
-import(cets_test_rpc, [ diff --git a/test/cets_status_SUITE.erl b/test/cets_status_SUITE.erl index 01ce0a4..b5eeafe 100644 --- a/test/cets_status_SUITE.erl +++ b/test/cets_status_SUITE.erl @@ -7,8 +7,6 @@ -import(cets_test_setup, [ start/2, - start_local/1, - start_local/2, start_disco/2, make_name/1, make_name/2, @@ -26,19 +24,8 @@ simulate_disco_restart/1 ]). --import(cets_test_wait, [ - wait_for_disco_timestamp_to_appear/3, - wait_for_disco_timestamp_to_be_updated/4 -]). - -import(cets_test_receive, [ - receive_message/1, - flush_message/1 -]). - --import(cets_test_peer, [ - disconnect_node/2, - disconnect_node_by_name/2 + receive_message/1 ]). -import(cets_test_helper, [ @@ -46,10 +33,6 @@ set_other_servers/2 ]). --import(cets_test_rpc, [ - other_nodes/2 -]). - suite() -> cets_test_setup:suite(). From 06d99a8f579d44b070274ab510b3e9508eccd445 Mon Sep 17 00:00:00 2001 From: Mikhail Uvarov Date: Thu, 16 May 2024 17:44:57 +0200 Subject: [PATCH 30/30] Rename disconnect_node_by_name to disconnect_node_by_id --- test/cets_disco_SUITE.erl | 4 ++-- test/cets_test_peer.erl | 12 ++++++------ test/cets_test_setup.erl | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/test/cets_disco_SUITE.erl b/test/cets_disco_SUITE.erl index 4f3052d..98bf829 100644 --- a/test/cets_disco_SUITE.erl +++ b/test/cets_disco_SUITE.erl @@ -43,7 +43,7 @@ block_node/2, reconnect_node/2, disconnect_node/2, - disconnect_node_by_name/2 + disconnect_node_by_id/2 ]). -import(cets_test_rpc, [ @@ -636,7 +636,7 @@ disco_nodeup_timestamp_is_updated_after_node_reconnects(Config) -> Setup = setup_two_nodes_and_discovery(Config, [wait, disco2]), #{disco := Disco, node2 := Node2} = Setup, OldTimestamp = cets_test_helper:get_disco_timestamp(Disco, nodeup_timestamps, Node2), - disconnect_node_by_name(Config, ct2), + disconnect_node_by_id(Config, ct2), wait_for_disco_timestamp_to_be_updated(Disco, nodeup_timestamps, Node2, OldTimestamp). 
disco_node_start_timestamp_is_updated_after_node_restarts(Config) -> diff --git a/test/cets_test_peer.erl b/test/cets_test_peer.erl index d2fedba..ce83f0c 100644 --- a/test/cets_test_peer.erl +++ b/test/cets_test_peer.erl @@ -9,18 +9,18 @@ block_node/2, reconnect_node/2, disconnect_node/2, - disconnect_node_by_name/2 + disconnect_node_by_id/2 ]). -import(cets_test_rpc, [rpc/4]). -include_lib("common_test/include/ct.hrl"). -start(Names, Config) -> - {Nodes, Peers} = lists:unzip([find_or_start_node(N) || N <- Names]), +start(Ids, Config) -> + {Nodes, Peers} = lists:unzip([find_or_start_node(Id) || Id <- Ids]), [ - {nodes, maps:from_list(lists:zip(Names, Nodes))}, - {peers, maps:from_list(lists:zip(Names, Peers))} + {nodes, maps:from_list(lists:zip(Ids, Nodes))}, + {peers, maps:from_list(lists:zip(Ids, Peers))} | Config ]. @@ -101,7 +101,7 @@ reconnect_node(Node, Peer) when is_atom(Node), is_pid(Peer) -> disconnect_node(RPCNode, DisconnectNode) -> rpc(RPCNode, erlang, disconnect_node, [DisconnectNode]). -disconnect_node_by_name(Config, Id) -> +disconnect_node_by_id(Config, Id) -> Peer = maps:get(Id, proplists:get_value(peers, Config)), Node = maps:get(Id, proplists:get_value(nodes, Config)), %% We could need to retry to disconnect, if the local node is currently trying to establish a connection diff --git a/test/cets_test_setup.erl b/test/cets_test_setup.erl index d6827a2..e8c016c 100644 --- a/test/cets_test_setup.erl +++ b/test/cets_test_setup.erl @@ -48,7 +48,7 @@ -import(cets_test_peer, [ disconnect_node/2, - disconnect_node_by_name/2 + disconnect_node_by_id/2 ]). -import(cets_test_rpc, [rpc/4]). 
@@ -214,7 +214,7 @@ setup_two_nodes_and_discovery(Config, Flags) -> Node1 = node(), #{ct2 := Peer2} = proplists:get_value(peers, Config), #{ct2 := Node2} = proplists:get_value(nodes, Config), - disconnect_node_by_name(Config, ct2), + disconnect_node_by_id(Config, ct2), Tab = make_name(Config), {ok, _Pid1} = start(Node1, Tab), {ok, _Pid2} = start(Peer2, Tab), @@ -252,7 +252,7 @@ setup_two_nodes_and_discovery(Config, Flags) -> case lists:member(netsplit, Flags) of true -> %% Simulate a loss of connection between nodes - disconnect_node_by_name(Config, ct2); + disconnect_node_by_id(Config, ct2); false -> ok end,