diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index f47e01384..040a98a60 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -98,8 +98,8 @@ const vector SQLiteNode::_initPeers(const string& peerListString) { SINFO("Adding peer #" << peerList.size() << ": " << name << " (" << host << "), " << SComposeJSONObject(params)); SQLitePeer* peer = new SQLitePeer(name, host, params, peerList.size() + 1); - // Wait up to 2s before trying the first time - peer->nextReconnect = STimeNow() + SRandom::rand64() % (STIME_US_PER_S * 2); + // Wait up to 1s before trying the first time + peer->nextReconnect = STimeNow() + SRandom::rand64() % STIME_US_PER_S; peerList.push_back(peer); } @@ -867,8 +867,7 @@ bool SQLiteNode::update() { // See if we're taking too long if (STimeNow() > _stateTimeout) { // Timed out - SHMMM("Timed out waiting for STANDUP approval; reconnect all and re-SEARCHING."); - _reconnectAll(); + SHMMM("Timed out waiting for STANDUP approval; re-SEARCHING."); _changeState(SQLiteNodeState::SEARCHING); return true; // Re-update } @@ -1791,6 +1790,16 @@ void SQLiteNode::_onConnect(SQLitePeer* peer) { login["Version"] = _version; login["Permafollower"] = _originalPriority ? "false" : "true"; _sendToPeer(peer, login); + + // If we're STANDINGUP when a peer connects, send them a STATE message so they know they need to APPROVE or DENY the standup. + // Otherwise we will wait for their response that's not coming, and can eventually time out the standup. 
+ if (_state == SQLiteNodeState::STANDINGUP) { + SData state("STATE"); + state["StateChangeCount"] = to_string(_stateChangeCount); + state["State"] = stateName(_state); + state["Priority"] = SToStr(_priority); + _sendToPeer(peer, state); + } } // -------------------------------------------------------------------------- @@ -1809,7 +1818,7 @@ void SQLiteNode::_onDisconnect(SQLitePeer* peer) { if (peer == _leadPeer) { // We've lost our leader: make sure we aren't waiting for // transaction response and re-SEARCH - PHMMM("Lost our LEADER, re-SEARCHING."); + PWARN("Lost our LEADER, re-SEARCHING."); SASSERTWARN(_state == SQLiteNodeState::SUBSCRIBING || _state == SQLiteNodeState::FOLLOWING); { _leadPeer = nullptr; diff --git a/test/clustertest/tests/MassiveQueryTest.cpp b/test/clustertest/tests/MassiveQueryTest.cpp index 934d31b77..e18697e47 100644 --- a/test/clustertest/tests/MassiveQueryTest.cpp +++ b/test/clustertest/tests/MassiveQueryTest.cpp @@ -14,13 +14,33 @@ struct MassiveQueryTest : tpunit::TestFixture { cmd["processTimeout"] = "290000"; cmd["writeConsistency"] = "ASYNC"; auto r1 = brtester.executeWaitMultipleData({cmd})[0]; - uint64_t commitCount = stoull(r1["CommitCount"]); + uint64_t commitCount = 0; + try { + commitCount = stoull(r1["CommitCount"]); + } catch (const invalid_argument& e) { + cout << "invalid_argument parsing commitCount from: " << r1["CommitCount"] << endl; + } catch (const out_of_range& e) { + cout << "out_of_range parsing commitCount from: " << r1["CommitCount"] << endl; + } uint64_t commitCount2 = 0; + // Make sure the commit count is actually set. 
+ ASSERT_TRUE(commitCount); + SData status("Status"); for (size_t i = 0; i < 500; i++) { - auto r2 = tester.getTester(2).executeWaitMultipleData({status})[0]; - commitCount2 = stoull(SParseJSONObject(r2.content)["CommitCount"]); + auto responseList = tester.getTester(2).executeWaitMultipleData({status}); + auto r2 = responseList[0]; + auto json = SParseJSONObject(r2.content); + try { + commitCount2 = stoull(json["CommitCount"]); + } catch (const invalid_argument& e) { + cout << "invalid_argument parsing commitCount2." << endl; + cout << r2.serialize() << endl; + } catch (const out_of_range& e) { + cout << "out_of_range parsing commitCount2." << endl; + cout << r2.serialize() << endl; + } if (commitCount2 == commitCount) { break; }