-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #187 from mfreeman451/fix/node_recovery_issue
Fix/node recovery issue
- Loading branch information
Showing
6 changed files
with
280 additions
and
133 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
package cloud | ||
|
||
import "errors" | ||
|
||
// Sentinel errors shared across the cloud package. Compare against them with
// errors.Is rather than by message text.
var (
	// errEmptyPollerID carries the message "empty poller ID".
	errEmptyPollerID = errors.New("empty poller ID")

	// errDatabaseError carries the message "database error".
	errDatabaseError = errors.New("database error")

	// errInvalidSweepData carries the message "invalid sweep data".
	errInvalidSweepData = errors.New("invalid sweep data")

	// errFailedToSendAlerts carries the message "failed to send alerts".
	errFailedToSendAlerts = errors.New("failed to send alerts")
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
package cloud | ||
|
||
import ( | ||
"context" | ||
"errors" | ||
"fmt" | ||
"log" | ||
"os" | ||
"time" | ||
|
||
"github.com/mfreeman451/serviceradar/pkg/cloud/alerts" | ||
"github.com/mfreeman451/serviceradar/pkg/db" | ||
) | ||
|
||
// NodeRecoveryManager handles node recovery state transitions. | ||
type NodeRecoveryManager struct { | ||
db db.Service | ||
alerter alerts.AlertService | ||
getHostname func() string | ||
} | ||
|
||
func newNodeRecoveryManager(d db.Service, alerter alerts.AlertService) *NodeRecoveryManager { | ||
return &NodeRecoveryManager{ | ||
db: d, | ||
alerter: alerter, | ||
getHostname: func() string { | ||
hostname, err := os.Hostname() | ||
if err != nil { | ||
return statusUnknown | ||
} | ||
return hostname | ||
}, | ||
} | ||
} | ||
|
||
func (m *NodeRecoveryManager) processRecovery(ctx context.Context, nodeID string, lastSeen time.Time) error { | ||
tx, err := m.db.Begin() | ||
if err != nil { | ||
return fmt.Errorf("begin transaction: %w", err) | ||
} | ||
|
||
var committed bool | ||
defer func() { | ||
if !committed { | ||
if rbErr := tx.Rollback(); rbErr != nil { | ||
log.Printf("Error rolling back transaction: %v", rbErr) | ||
} | ||
} | ||
}() | ||
|
||
status, err := m.db.GetNodeStatus(nodeID) | ||
if err != nil { | ||
return fmt.Errorf("get node status: %w", err) | ||
} | ||
|
||
// Early return if the node is already healthy | ||
if status.IsHealthy { | ||
return nil | ||
} | ||
|
||
// Update node status | ||
status.IsHealthy = true | ||
status.LastSeen = lastSeen | ||
|
||
// Update the database BEFORE trying to send the alert | ||
if err = m.db.UpdateNodeStatus(status); err != nil { | ||
return fmt.Errorf("update node status: %w", err) | ||
} | ||
|
||
// Send alert | ||
if err = m.sendRecoveryAlert(ctx, nodeID, lastSeen); err != nil { | ||
// Only treat the cooldown as non-error | ||
if !errors.Is(err, alerts.ErrWebhookCooldown) { | ||
return fmt.Errorf("send recovery alert: %w", err) | ||
} | ||
|
||
// Log the cooldown but proceed with the recovery | ||
log.Printf("Recovery alert for node %s rate limited, but node marked as recovered", nodeID) | ||
} | ||
|
||
// Commit the transaction | ||
if err := tx.Commit(); err != nil { | ||
return fmt.Errorf("commit transaction: %w", err) | ||
} | ||
|
||
committed = true | ||
|
||
return nil | ||
} | ||
|
||
// sendRecoveryAlert handles alert creation and sending. | ||
func (m *NodeRecoveryManager) sendRecoveryAlert(ctx context.Context, nodeID string, lastSeen time.Time) error { | ||
alert := &alerts.WebhookAlert{ | ||
Level: alerts.Info, | ||
Title: "Node Recovered", | ||
Message: fmt.Sprintf("Node '%s' is back online", nodeID), | ||
NodeID: nodeID, | ||
Timestamp: lastSeen.UTC().Format(time.RFC3339), | ||
Details: map[string]any{ | ||
"hostname": m.getHostname(), | ||
"recovery_time": lastSeen.Format(time.RFC3339), | ||
}, | ||
} | ||
|
||
return m.alerter.Alert(ctx, alert) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.