From 26a652972371bec8b704301575e0a030c3492bbc Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Fri, 23 Feb 2024 10:35:17 -0800 Subject: [PATCH 1/3] Notice timeout faster --- sqlitecluster/SQLitePeer.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sqlitecluster/SQLitePeer.cpp b/sqlitecluster/SQLitePeer.cpp index 624725b74..94f4eb5bd 100644 --- a/sqlitecluster/SQLitePeer.cpp +++ b/sqlitecluster/SQLitePeer.cpp @@ -72,8 +72,7 @@ SQLitePeer::PeerPostPollStatus SQLitePeer::postPoll(fd_map& fdm, uint64_t& nextA switch (socket->state.load()) { case STCPManager::Socket::CONNECTED: { // socket->lastRecvTime is always set, it's initialized to STimeNow() at creation. - auto lastActivityTime = max(socket->lastSendTime, socket->lastRecvTime); - if (lastActivityTime + SQLiteNode::RECV_TIMEOUT < STimeNow()) { + if (socket->lastRecvTime + SQLiteNode::RECV_TIMEOUT < STimeNow()) { SHMMM("Connection with peer '" << name << "' timed out."); return PeerPostPollStatus::SOCKET_ERROR; } From d8ba53daec122f83cc19e7bbd7593c107647bdba Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Fri, 23 Feb 2024 11:49:51 -0800 Subject: [PATCH 2/3] Attempt to kill peer connections on crash --- BedrockServer.cpp | 1 + libstuff/SSignal.cpp | 11 +++++++++-- sqlitecluster/SQLiteNode.cpp | 10 ++++++++++ sqlitecluster/SQLiteNode.h | 7 +++++++ 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/BedrockServer.cpp b/BedrockServer.cpp index 8105dd9a7..0ae15f10a 100644 --- a/BedrockServer.cpp +++ b/BedrockServer.cpp @@ -712,6 +712,7 @@ void BedrockServer::runCommand(unique_ptr&& _command, bool isBlo // signals, like SIGSEGV, this function will be called. SSetSignalHandlerDieFunc([&](){ _clusterMessenger->runOnAll(_generateCrashMessage(command)); + _syncNode->kill(); }); // If we dequeue a status or control command, handle it immediately. diff --git a/libstuff/SSignal.cpp b/libstuff/SSignal.cpp index d623a9ad7..e4f2b86af 100644 --- a/libstuff/SSignal.cpp +++ b/libstuff/SSignal.cpp @@ -1,5 +1,5 @@ #include "libstuff.h" - +#include #include #include #include @@ -202,12 +202,19 @@ void _SSignal_StackTrace(int signum, siginfo_t *info, void *ucontext) { SWARN("Calling DIE function."); SSignalHandlerDieFunc(); SSignalHandlerDieFunc = [](){}; - SWARN("DIE function returned, aborting (if not done)."); + SWARN("DIE function returned."); + if (SQLiteNode::KILLABLE_SQLITE_NODE) { + SWARN("Killing peer connections."); + SQLiteNode::KILLABLE_SQLITE_NODE->kill(); + } } // If we weren't already in ABORT, we'll call that. The second call will skip the above callstack generation. if (signum != SIGABRT) { + SWARN("Aborting."); abort(); + } else { + SWARN("Already in ABORT."); } } else { SALERT("Non-signal thread got signal " << strsignal(signum) << "(" << signum << "), which wasn't expected"); diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index 4485ce8c1..2a0c7e31b 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -59,6 +59,8 @@ #undef SLOGPREFIX #define SLOGPREFIX "{" << _name << "/" << SQLiteNode::stateName(_state) << "} " +SQLiteNode* SQLiteNode::KILLABLE_SQLITE_NODE{0}; + // Initializations for static vars. const uint64_t SQLiteNode::RECV_TIMEOUT{STIME_US_PER_S * 30}; @@ -145,6 +147,7 @@ SQLiteNode::SQLiteNode(SQLiteServer& server, shared_ptr dbPool, cons _stateTimeout(STimeNow() + firstTimeout), _syncPeer(nullptr) { + KILLABLE_SQLITE_NODE = this; SASSERT(_originalPriority >= 0); onPrepareHandlerEnabled = false; @@ -2716,3 +2719,10 @@ SQLiteNodeState SQLiteNode::stateFromName(const string& name) { return it->second; } } + +void SQLiteNode::kill() { + for (SQLitePeer* peer : _peerList) { + SWARN("Killing peer: " << peer->name); + peer->reset(); + } +} diff --git a/sqlitecluster/SQLiteNode.h b/sqlitecluster/SQLiteNode.h index 7b2f3ba0d..1244f0cfa 100644 --- a/sqlitecluster/SQLiteNode.h +++ b/sqlitecluster/SQLiteNode.h @@ -76,6 +76,10 @@ class SQLiteNode : public STCPManager { NUM_CONSISTENCY_LEVELS }; + // This is a globally accessible pointer to some node instance. The intention here is to let signal handling code attempt to kill outstanding + // peer connections on this node before shutting down. + static SQLiteNode* KILLABLE_SQLITE_NODE; + // Receive timeout for cluster messages. static const uint64_t RECV_TIMEOUT; @@ -152,6 +156,9 @@ class SQLiteNode : public STCPManager { // Call this if you want to shut down the node. void beginShutdown(); + // kill all peer connections on this node. + void kill(); + // Handle any read/write events that occurred. void postPoll(fd_map& fdm, uint64_t& nextActivity); From 627c913bdf10d121ab4795ca84a67bdb0429ffa7 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Fri, 23 Feb 2024 11:56:18 -0800 Subject: [PATCH 3/3] Remove one redundant line --- BedrockServer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/BedrockServer.cpp b/BedrockServer.cpp index 0ae15f10a..8105dd9a7 100644 --- a/BedrockServer.cpp +++ b/BedrockServer.cpp @@ -712,7 +712,6 @@ void BedrockServer::runCommand(unique_ptr&& _command, bool isBlo // signals, like SIGSEGV, this function will be called. SSetSignalHandlerDieFunc([&](){ _clusterMessenger->runOnAll(_generateCrashMessage(command)); - _syncNode->kill(); }); // If we dequeue a status or control command, handle it immediately.