From ee48bdd4d9bce7518ac521e09d3d56d40d36171b Mon Sep 17 00:00:00 2001
From: Tim Gross <tgross@hashicorp.com>
Date: Thu, 20 Jun 2024 10:05:44 -0400
Subject: [PATCH] server: fix panic if heartbeat reset happens for GC'd node
 (#23383)

When setting up the timer for heartbeat invalidation, there's no control that
allows us to remove that timer when the node is GC'd. If the GC window is narrow
enough, it's possible to GC a node that still has a pending heartbeat timer. In
this case, querying for the node returns `nil`, and the disconnect/reconnect
state check dereferences it without a nil check, which panics the server. Fix
this bug by handling a `nil` node explicitly and allowing the `Node.Update` RPC
to fire normally (which then errors correctly).

Fixes: https://github.com/hashicorp/nomad/issues/23376
Ref: https://hashicorp.atlassian.net/browse/NET-10109
---
 .changelog/23383.txt | 3 +++
 nomad/heartbeat.go   | 4 ++++
 2 files changed, 7 insertions(+)
 create mode 100644 .changelog/23383.txt

diff --git a/.changelog/23383.txt b/.changelog/23383.txt
new file mode 100644
index 00000000000..6d48f92f869
--- /dev/null
+++ b/.changelog/23383.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+server: Fixed a bug where expiring heartbeats for garbage collected nodes could panic the server
+```
diff --git a/nomad/heartbeat.go b/nomad/heartbeat.go
index 2b207b6ed41..6b394811843 100644
--- a/nomad/heartbeat.go
+++ b/nomad/heartbeat.go
@@ -183,6 +183,10 @@ func (h *nodeHeartbeater) disconnectState(id string) (bool, bool) {
 		h.logger.Error("error retrieving node by id", "error", err)
 		return false, false
 	}
+	if node == nil {
+		h.logger.Error("node not found", "node_id", id)
+		return false, false
+	}
 
 	// Exit if the node is already down or just initializing.
 	if node.Status == structs.NodeStatusDown || node.Status == structs.NodeStatusInit {