diff --git a/cmd/main.go b/cmd/main.go index 9caee2a..4479217 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -31,6 +31,7 @@ type Config struct { OpenRouterKey string GrafanaBaseURL string PromDatasourceID string + AlertUnexplained bool } func main() { @@ -85,6 +86,7 @@ func main() { rootCmd.Flags().StringVar(&cfg.ExecutionNode, "ethereum-el", checks.ClientTypeAll.String(), "execution client to monitor") rootCmd.Flags().StringVar(&cfg.GrafanaBaseURL, "grafana-base-url", defaultGrafanaBaseURL, "grafana base URL") rootCmd.Flags().StringVar(&cfg.PromDatasourceID, "prometheus-datasource-id", defaultPromDatasourceID, "prometheus datasource ID") + rootCmd.Flags().BoolVar(&cfg.AlertUnexplained, "alert-unexplained", false, "whether to alert on unexplained issues") if err := rootCmd.MarkFlagRequired("network"); err != nil { fmt.Fprintf(os.Stderr, "Error: %v\n", err) @@ -152,7 +154,7 @@ func runChecks(cmd *cobra.Command, cfg Config) error { } // Send results to Discord. - if err := discordNotifier.SendResults(cfg.DiscordChannel, cfg.Network, targetClient, results, analysis); err != nil { + if err := discordNotifier.SendResults(cfg.DiscordChannel, cfg.Network, targetClient, results, analysis, cfg.AlertUnexplained); err != nil { return fmt.Errorf("failed to send discord notification: %w", err) } diff --git a/pkg/analyzer/analyzer.go b/pkg/analyzer/analyzer.go index ff3266f..98362b8 100644 --- a/pkg/analyzer/analyzer.go +++ b/pkg/analyzer/analyzer.go @@ -1,20 +1,3 @@ -/* -Package analyzer provides functionality for analyzing the health relationships between CL and EL clients. -It helps identify root causes of client failures by detecting patterns where: - -1. For CL clients: If an EL client is failing with multiple CL clients, it's likely the root cause -2. For EL clients: If the target EL client is failing with multiple CL clients, it's likely the root cause - -The analyzer tracks client relationships and their health status, distinguishing between explained issues -(traced back to a root cause) and unexplained issues that may need further investigation. - -Example usage: - - analyzer := NewAnalyzer("geth", ClientTypeEL) - analyzer.AddNodeStatus("lighthouse-geth-1", false) - analyzer.AddNodeStatus("prysm-geth-1", false) - result := analyzer.Analyze() -*/ package analyzer import ( @@ -24,37 +7,46 @@ import ( ) const ( - // MinFailuresForRootCause is the minimum number of failures needed to consider something a root cause. MinFailuresForRootCause = 2 ) -// Analyzer is a struct that analyzes the status of a client. +type ClientFailure struct { + Client string + Type ClientType + FailedWith []string +} + +type ClientPairWithNodes struct { + Pair ClientPair + Nodes []string +} + +type AnalysisState struct { + CLFailures map[string]*ClientFailure + ELFailures map[string]*ClientFailure + RootCauses map[string]string // key: client name, value: evidence + UnexplainedPairs []ClientPairWithNodes +} + type Analyzer struct { - // nodeStatusMap tracks the health status of all client pairs (CL-EL relationships). - // The map key is a ClientPair (e.g., lighthouse-geth), and the value is a list of NodeStatus - // for all instances of that pair. nodeStatusMap NodeStatusMap - // targetClient is the name of the client we're analyzing (e.g., "geth", "lighthouse"). - targetClient string - // clientType indicates whether we're analyzing a CL or EL client. - clientType ClientType + targetClient string + clientType ClientType } -// clientStatusTracker tracks the health status and relationships of a client. -type clientStatusTracker struct { - // totalPeers is the total number of peer relationships this client has. - // e.g., for an EL client, this would be the number of CL clients it pairs with. - totalPeers int - // failingPeers is the count of failing peer relationships. - // Used to track how many peers are having issues with this client. - failingPeers int - // failingList contains the names of peers that are failing with this client. - // e.g., for an EL client, this would be the list of CL client names that are failing. - // Used for logging and evidence gathering when determining root causes. - failingList []string +type Config struct { + Network string + ConsensusNode string + ExecutionNode string + DiscordChannel string + GrafanaToken string + DiscordToken string + OpenRouterKey string + GrafanaBaseURL string + PromDatasourceID string + AlertUnexplained bool } -// NewAnalyzer creates a new Analyzer. func NewAnalyzer(targetClient string, clientType ClientType) *Analyzer { return &Analyzer{ nodeStatusMap: make(NodeStatusMap), @@ -63,43 +55,53 @@ func NewAnalyzer(targetClient string, clientType ClientType) *Analyzer { } } -// Analyze analyzes the status of a client. func (a *Analyzer) Analyze() *AnalysisResult { - log.Printf("\n=== Analyzing %s (%s)", a.targetClient, a.clientType) + log.Print("\n=== Analyzing check results") + + state := &AnalysisState{ + CLFailures: make(map[string]*ClientFailure), + ELFailures: make(map[string]*ClientFailure), + RootCauses: make(map[string]string), + } + + // Step 1: Collect all failures. + a.collectFailures(state) + + // Step 2: Find primary root causes (clients failing with many peers). + a.findPrimaryRootCauses(state) + + // Step 3: Find secondary root causes (clients failing with non-root-cause peers). + a.findSecondaryRootCauses(state) + + // Step 4: Remove false positives (clients only failing with root causes). + a.removeFalsePositives(state) + + // Step 5: Identify unexplained issues. + a.findUnexplainedIssues(state) + // Convert state to result. result := &AnalysisResult{ RootCause: make([]string, 0), UnexplainedIssues: make([]string, 0), AffectedNodes: make(map[string][]string), - RootCauseEvidence: make(map[string]string), + RootCauseEvidence: state.RootCauses, } - var ( - rootCauses []string - evidence map[string]string - ) - - switch a.clientType { - case ClientTypeCL: - status := a.analyzeClientRelationships(ClientTypeCL) - rootCauses, evidence = a.findRootCausesForCL(status) - case ClientTypeEL: - rootCauses, evidence = a.findRootCausesForEL() - default: - // If we made it here, we have bigger problems. - log.Printf(" - Unknown client type: %s", a.clientType) + // Add root causes to result. + for client := range state.RootCauses { + result.RootCause = append(result.RootCause, client) } - result.RootCause = rootCauses - result.RootCauseEvidence = evidence - result.UnexplainedIssues = a.findUnexplainedIssues(rootCauses) + // Add unexplained issues to result. + for _, pairWithNodes := range state.UnexplainedPairs { + result.UnexplainedIssues = append(result.UnexplainedIssues, pairWithNodes.Nodes...) + } a.logAnalysisResults(result) return result } -// AddNodeStatus adds a node status to the analyzer. func (a *Analyzer) AddNodeStatus(nodeName string, isHealthy bool) { pair := parseClientPair(nodeName) @@ -113,183 +115,268 @@ func (a *Analyzer) AddNodeStatus(nodeName string, isHealthy bool) { }) } -// getClientAndPeerNames gets the client and peer names. -func (a *Analyzer) getClientAndPeerNames(pair ClientPair, targetType ClientType) (client, peer string) { - if targetType == ClientTypeCL { - return pair.ELClient, pair.CLClient - } +func (a *Analyzer) collectFailures(state *AnalysisState) { + // For each client pair and their statuses. + for pair, statuses := range a.nodeStatusMap { + // Skip if no failures. + hasFailure := false - return pair.CLClient, pair.ELClient -} + for _, s := range statuses { + if !s.IsHealthy { + hasFailure = true -// isTargetClientIssue checks if the issue is related to the target client. -func (a *Analyzer) isTargetClientIssue(pair ClientPair) bool { - switch a.clientType { - case ClientTypeCL: - return pair.CLClient == a.targetClient - case ClientTypeEL: - return pair.ELClient == a.targetClient - default: - return false + break + } + } + + if !hasFailure { + continue + } + + // Add to CL failures. + if _, exists := state.CLFailures[pair.CLClient]; !exists { + state.CLFailures[pair.CLClient] = &ClientFailure{ + Client: pair.CLClient, + Type: ClientTypeCL, + FailedWith: make([]string, 0), + } + } + + if !contains(state.CLFailures[pair.CLClient].FailedWith, pair.ELClient) { + state.CLFailures[pair.CLClient].FailedWith = append( + state.CLFailures[pair.CLClient].FailedWith, + pair.ELClient, + ) + } + + // Add to EL failures. + if _, exists := state.ELFailures[pair.ELClient]; !exists { + state.ELFailures[pair.ELClient] = &ClientFailure{ + Client: pair.ELClient, + Type: ClientTypeEL, + FailedWith: make([]string, 0), + } + } + + if !contains(state.ELFailures[pair.ELClient].FailedWith, pair.CLClient) { + state.ELFailures[pair.ELClient].FailedWith = append( + state.ELFailures[pair.ELClient].FailedWith, + pair.CLClient, + ) + } + + log.Printf(" - %s is failing with %s", pair.CLClient, pair.ELClient) } } -// hasHealthIssue checks if the node has a health issue. -func (a *Analyzer) hasHealthIssue(statuses []NodeStatus) bool { - for _, status := range statuses { - if !status.IsHealthy { - return true +func (a *Analyzer) findPrimaryRootCauses(state *AnalysisState) { + // Find CL clients failing with many EL clients. + for client, failure := range state.CLFailures { + if len(failure.FailedWith) >= MinFailuresForRootCause { + state.RootCauses[client] = fmt.Sprintf( + "CL client failing with %d EL clients: %s", + len(failure.FailedWith), + strings.Join(failure.FailedWith, ", "), + ) + + log.Printf(" - Primary root cause: %s (%s)", client, state.RootCauses[client]) } } - return false + // Find EL clients failing with many CL clients. + for client, failure := range state.ELFailures { + if len(failure.FailedWith) >= MinFailuresForRootCause { + state.RootCauses[client] = fmt.Sprintf( + "EL client failing with %d CL clients: %s", + len(failure.FailedWith), + strings.Join(failure.FailedWith, ", "), + ) + + log.Printf(" - Primary root cause: %s (%s)", client, state.RootCauses[client]) + } + } } -// analyzeClientRelationships analyzes the relationships between CL and EL clients counting total peers -// and any failing relationships we may have. -func (a *Analyzer) analyzeClientRelationships(targetType ClientType) map[string]*clientStatusTracker { - status := make(map[string]*clientStatusTracker) +func (a *Analyzer) findSecondaryRootCauses(state *AnalysisState) { + // Find clients failing with multiple non-root-cause peers. + for client, failure := range state.CLFailures { + if _, exists := state.RootCauses[client]; exists { + continue // Skip existing root causes. + } + + var ( + nonRootCauseFailures = 0 + nonRootCauseList = make([]string, 0) + ) + + for _, peer := range failure.FailedWith { + if _, isRootCause := state.RootCauses[peer]; !isRootCause { + nonRootCauseFailures++ - // Count total peers. - for pair := range a.nodeStatusMap { - clientName, _ := a.getClientAndPeerNames(pair, targetType) - if _, exists := status[clientName]; !exists { - status[clientName] = &clientStatusTracker{} + nonRootCauseList = append(nonRootCauseList, peer) + } } - status[clientName].totalPeers++ + if nonRootCauseFailures >= MinFailuresForRootCause { + state.RootCauses[client] = fmt.Sprintf( + "CL client failing with %d non-root-cause EL clients: %s", + nonRootCauseFailures, + strings.Join(nonRootCauseList, ", "), + ) + + log.Printf(" - Secondary root cause: %s (%s)", client, state.RootCauses[client]) + } } - // Count failing relationships. - for pair, statuses := range a.nodeStatusMap { - if !a.hasHealthIssue(statuses) { + // Same for EL clients + for client, failure := range state.ELFailures { + if _, exists := state.RootCauses[client]; exists { continue } - clientName, peerName := a.getClientAndPeerNames(pair, targetType) - if status[clientName] != nil { - status[clientName].addFailure(peerName) + var ( + nonRootCauseFailures = 0 + nonRootCauseList = make([]string, 0) + ) + + for _, peer := range failure.FailedWith { + if _, isRootCause := state.RootCauses[peer]; !isRootCause { + nonRootCauseFailures++ + + nonRootCauseList = append(nonRootCauseList, peer) + } } - } - return status + if nonRootCauseFailures >= MinFailuresForRootCause { + state.RootCauses[client] = fmt.Sprintf( + "EL client failing with %d non-root-cause CL clients: %s", + nonRootCauseFailures, + strings.Join(nonRootCauseList, ", "), + ) + + log.Printf(" - Secondary root cause: %s (%s)", client, state.RootCauses[client]) + } + } } -// findTargetFailures finds the nodes that are failing for our target client. -func (a *Analyzer) findTargetFailures() []string { - failures := make([]string, 0) +func (a *Analyzer) removeFalsePositives(state *AnalysisState) { + toRemove := make([]string, 0) - for pair, statuses := range a.nodeStatusMap { - if !a.isTargetClientIssue(pair) { - continue + for client := range state.RootCauses { + var failure *ClientFailure + + if f, exists := state.CLFailures[client]; exists { + failure = f + } else if f, exists := state.ELFailures[client]; exists { + failure = f } - for _, status := range statuses { - if !status.IsHealthy { - failures = append(failures, status.Name) - } + if failure == nil { + continue } - } - return failures -} + // Keep clients failing with many peers (more than 4). + if len(failure.FailedWith) > 4 { + continue + } -// findUnexplainedIssues finds the nodes that are failing for our target client but are not explained by the root causes. -func (a *Analyzer) findUnexplainedIssues(rootCauses []string) []string { - unexplained := make([]string, 0) + // For clients with 2-4 failures, check if they're only failing with major root causes + // or if they're not failing with enough non-major-root-cause peers. + majorRootCauses := make(map[string]bool) - for pair, statuses := range a.nodeStatusMap { - if !a.isTargetClientIssue(pair) { - continue + for c, f := range state.CLFailures { + if len(f.FailedWith) > 4 { + majorRootCauses[c] = true + } } - for _, status := range statuses { - if !status.IsHealthy && !a.isIssueExplained(pair, rootCauses) { - unexplained = append(unexplained, status.Name) + for c, f := range state.ELFailures { + if len(f.FailedWith) > 4 { + majorRootCauses[c] = true } } - } - return unique(unexplained) -} + // Count failures with non-major-root-cause peers. + nonMajorRootCauseFailures := 0 -// isIssueExplained checks if the issue is explained by being classified as a root cause. -func (a *Analyzer) isIssueExplained(pair ClientPair, rootCauses []string) bool { - for _, rootCause := range rootCauses { - // If we're analyzing an EL client and it's the root cause, all its "issues" are explained. - if a.clientType == ClientTypeEL && rootCause == a.targetClient { - return true + for _, peer := range failure.FailedWith { + if !majorRootCauses[peer] { + nonMajorRootCauseFailures++ + } } - // Now check if the client is failing due to it being a root cause. - switch a.clientType { - case ClientTypeCL: - if pair.ELClient == rootCause { - return true - } - case ClientTypeEL: - if pair.CLClient == rootCause { - return true + // Remove if: + // 1. Only failing with major root causes, OR + // 2. Not failing with enough non-major-root-cause peers. + if nonMajorRootCauseFailures < MinFailuresForRootCause { + toRemove = append(toRemove, client) + + if nonMajorRootCauseFailures == 0 { + log.Printf( + " - Removing false positive: %s (only failing with major root causes)", + client, + ) + } else { + log.Printf( + " - Removing false positive: %s (only failing with %d non-major-root-cause peers)", + client, + nonMajorRootCauseFailures, + ) } } } - return false + for _, client := range toRemove { + delete(state.RootCauses, client) + } } -// findRootCausesForCL finds the root causes for the CL client. -func (a *Analyzer) findRootCausesForCL(status map[string]*clientStatusTracker) ([]string, map[string]string) { - var ( - rootCauses = make([]string, 0) - evidence = make(map[string]string) - ) +func (a *Analyzer) findUnexplainedIssues(state *AnalysisState) { + // For each client pair in nodeStatusMap. + for pair, statuses := range a.nodeStatusMap { + // Skip if no failures or not related to target client. + if !a.isTargetClientIssue(pair) { + continue + } + + // Find failing nodes. + failingNodes := make([]string, 0) - for el, stat := range status { - if el == "" { + for _, s := range statuses { + if !s.IsHealthy { + failingNodes = append(failingNodes, s.Name) + } + } + + if len(failingNodes) == 0 { continue } - log.Printf( - " - %s is failing with CL clients: %s", - el, - strings.Join(stat.failingList, ", "), - ) + // If neither client is a root cause, this is unexplained. + if _, clIsRoot := state.RootCauses[pair.CLClient]; !clIsRoot { + if _, elIsRoot := state.RootCauses[pair.ELClient]; !elIsRoot { + state.UnexplainedPairs = append(state.UnexplainedPairs, ClientPairWithNodes{ + Pair: pair, + Nodes: failingNodes, + }) - if len(stat.failingList) > MinFailuresForRootCause { - rootCauses = append(rootCauses, el) - evidence[el] = fmt.Sprintf( - "Failing with %d CL clients: %s", - len(stat.failingList), - strings.Join(stat.failingList, ", "), - ) + log.Printf(" - Unexplained issue: %s-%s", pair.CLClient, pair.ELClient) + } } } - - return rootCauses, evidence } -// findRootCausesForEL finds any root causes for the EL client. -func (a *Analyzer) findRootCausesForEL() ([]string, map[string]string) { - var ( - rootCauses = make([]string, 0) - evidence = make(map[string]string) - ) - - targetFailures := a.findTargetFailures() - - if len(targetFailures) > MinFailuresForRootCause { - rootCauses = append(rootCauses, a.targetClient) - evidence[a.targetClient] = fmt.Sprintf( - "Failing with %d nodes: %s", - len(targetFailures), - strings.Join(targetFailures, ", "), - ) +func (a *Analyzer) isTargetClientIssue(pair ClientPair) bool { + switch a.clientType { + case ClientTypeCL: + return pair.CLClient == a.targetClient + case ClientTypeEL: + return pair.ELClient == a.targetClient + default: + return false } - - return rootCauses, evidence } -// logAnalysisResults logs the analysis results. func (a *Analyzer) logAnalysisResults(result *AnalysisResult) { if len(result.UnexplainedIssues) == 0 && len(result.RootCause) == 0 { log.Printf(" - No issues to analyze") @@ -306,16 +393,6 @@ func (a *Analyzer) logAnalysisResults(result *AnalysisResult) { } } -// addFailure adds a failure to the client status tracker. -func (c *clientStatusTracker) addFailure(peerName string) { - c.failingPeers++ - - if !contains(c.failingList, peerName) { - c.failingList = append(c.failingList, peerName) - } -} - -// contains checks if a string slice contains a value. func contains(slice []string, str string) bool { for _, v := range slice { if v == str { @@ -325,21 +402,3 @@ func contains(slice []string, str string) bool { return false } - -// unique deduplicates a string slice. -func unique(slice []string) []string { - var ( - seen = make(map[string]bool) - result = make([]string, 0) - ) - - for _, str := range slice { - if !seen[str] { - seen[str] = true - - result = append(result, str) - } - } - - return result -} diff --git a/pkg/analyzer/analyzer_test.go b/pkg/analyzer/analyzer_test.go index f65fe1a..f4cc7f2 100644 --- a/pkg/analyzer/analyzer_test.go +++ b/pkg/analyzer/analyzer_test.go @@ -8,79 +8,256 @@ import ( func TestAnalyzer_RootCauseDetection(t *testing.T) { tests := []struct { - name string - targetClient string - clientType ClientType - nodes map[string]bool // map[nodeName]isHealthy - wantRootCause []string - wantUnexplained []string - wantNotification bool + name string + targetClient string + clientType ClientType + nodes map[string]bool // map[nodeName]isHealthy + wantRootCause []string + wantUnexplained []string }{ { - name: "EL client failing with multiple CL clients should be classed as root cause", - targetClient: "nimbusel", + name: "all healthy nodes", + targetClient: "lighthouse", + clientType: ClientTypeCL, + nodes: map[string]bool{ + "lighthouse-geth-1": true, + "lighthouse-besu-1": true, + "lighthouse-nethermind-1": true, + }, + wantRootCause: []string{}, + wantUnexplained: []string{}, + }, + { + name: "unexplained issue - single failure pair", + targetClient: "lighthouse", + clientType: ClientTypeCL, + nodes: map[string]bool{ + "lighthouse-erigon-1": false, // Only this lighthouse+erigon pair is failing. + "lighthouse-geth-1": true, // Other lighthouse pairs are healthy. + "lighthouse-besu-1": true, + "prysm-erigon-1": true, // Other clients with erigon are healthy. + "teku-erigon-1": true, + }, + wantRootCause: []string{}, + wantUnexplained: []string{"lighthouse-erigon-1"}, + }, + { + name: "clear root cause - EL client failing with many CL clients", + targetClient: "ethereumjs", clientType: ClientTypeEL, nodes: map[string]bool{ - "grandine-nimbusel-1": false, - "lighthouse-nimbusel-1": false, - "lodestar-nimbusel-1": false, - "nimbus-nimbusel-1": false, - "prysm-nimbusel-1": false, - "teku-nimbusel-1": false, - // Add some healthy nodes with other EL clients to spice things up. + "lighthouse-ethereumjs-1": false, + "teku-ethereumjs-1": false, + "lodestar-ethereumjs-1": false, + "grandine-ethereumjs-1": false, + "nimbus-ethereumjs-1": false, + // Some healthy nodes "lighthouse-geth-1": true, "prysm-geth-1": true, - "teku-nethermind-1": true, }, - wantRootCause: []string{"nimbusel"}, - wantUnexplained: []string{}, // These should be explained by nimbusel being root cause. - wantNotification: true, // Should notify as the targetClient itself is the root cause. + wantRootCause: []string{"ethereumjs"}, + wantUnexplained: []string{}, }, + // This tests when we have multiple instances of the same client pair (prysm-geth-N). + // Some failing, some healthy, each failing instance should be listed as unexplained. { - name: "CL client with single EL issue should not be classed as root cause", - targetClient: "grandine", + name: "multiple node instances - same client pair", + targetClient: "prysm", clientType: ClientTypeCL, nodes: map[string]bool{ - "grandine-erigon-1": false, - "grandine-geth-1": true, - "grandine-nethermind-1": true, - "lighthouse-erigon-1": true, - "prysm-erigon-1": true, + "prysm-geth-1": false, + "prysm-geth-2": false, + "prysm-geth-3": true, + "prysm-geth-4": false, + "prysm-geth-5": true, + "prysm-geth-6": false, + }, + wantRootCause: []string{}, + wantUnexplained: []string{ + "prysm-geth-1", + "prysm-geth-2", + "prysm-geth-4", + "prysm-geth-6", }, - wantRootCause: []string{}, - wantUnexplained: []string{"grandine-erigon-1"}, - wantNotification: true, // Should notify as it has unexplained issue (grandine-erigon-1). }, { - name: "Multiple CL clients failing with same EL should classify EL as root cause", + name: "clear root cause - CL client failing with many EL clients", + targetClient: "prysm", + clientType: ClientTypeCL, + nodes: map[string]bool{ + "prysm-erigon-1": false, + "prysm-geth-1": false, + "prysm-ethereumjs-1": false, + "prysm-reth-1": false, + "prysm-besu-1": false, + // Some healthy nodes + "lighthouse-geth-1": true, + "teku-geth-1": true, + }, + wantRootCause: []string{"prysm"}, + wantUnexplained: []string{}, + }, + { + name: "false positive - client only failing with known root causes", targetClient: "lighthouse", clientType: ClientTypeCL, nodes: map[string]bool{ + // ethereumjs + nethermind are root causes (failing with many CL clients). + "lighthouse-ethereumjs-1": false, + "teku-ethereumjs-1": false, + "lodestar-ethereumjs-1": false, "grandine-ethereumjs-1": false, + "nimbus-ethereumjs-1": false, + "lighthouse-nethermind-1": false, + "teku-nethermind-1": false, + "grandine-nethermind-1": false, + // lighthouse's other pairs are healthy + "lighthouse-geth-1": true, + "lighthouse-besu-1": true, + }, + wantRootCause: []string{"ethereumjs", "nethermind"}, + wantUnexplained: []string{}, + }, + { + name: "mixed health status - some nodes healthy, some failing", + targetClient: "ethereumjs", + clientType: ClientTypeEL, + nodes: map[string]bool{ "lighthouse-ethereumjs-1": false, + "teku-ethereumjs-1": false, "lodestar-ethereumjs-1": false, + "grandine-ethereumjs-1": false, "nimbus-ethereumjs-1": false, - "prysm-ethereumjs-1": false, - // Add some healthy nodes with other CL clients to ensure we filter them out nicely. - "lighthouse-geth-1": true, - "lighthouse-nethermind-1": true, + // Some healthy nodes with same client + "prysm-ethereumjs-1": true, + "lighthouse-ethereumjs-2": true, }, - wantRootCause: []string{"ethereumjs"}, - wantUnexplained: []string{}, - wantNotification: false, // Should not notify as issues are explained (root cause is ethereumjs). + wantRootCause: []string{"ethereumjs"}, + wantUnexplained: []string{}, }, { - name: "No notification needed when all nodes are healthy", + name: "borderline case - client failing with exactly MinFailuresForRootCause peers", + targetClient: "reth", + clientType: ClientTypeEL, + nodes: map[string]bool{ + // Exactly MinFailuresForRootCause (2) failures. + "lighthouse-reth-1": false, + "teku-reth-1": false, + // Some healthy nodes + "prysm-reth-1": true, + "nimbus-reth-1": true, + }, + wantRootCause: []string{"reth"}, + wantUnexplained: []string{}, + }, + { + name: "below threshold - client failing with less than MinFailuresForRootCause peers", + targetClient: "reth", + clientType: ClientTypeEL, + nodes: map[string]bool{ + // Only one failure, below MinFailuresForRootCause (2). + "lighthouse-reth-1": false, + // Some healthy nodes + "prysm-reth-1": true, + "teku-reth-1": true, + "nimbus-reth-1": true, + }, + wantRootCause: []string{}, + wantUnexplained: []string{"lighthouse-reth-1"}, + }, + { + name: "secondary root cause - client failing with non-root-cause peers", targetClient: "lighthouse", clientType: ClientTypeCL, nodes: map[string]bool{ - "lighthouse-geth-1": true, - "lighthouse-nethermind-1": true, - "lighthouse-besu-1": true, + // lighthouse failing with multiple non-root-cause EL clients. + "lighthouse-geth-1": false, + "lighthouse-besu-1": false, + "lighthouse-nethermind-1": false, + // Other CL clients healthy with these EL clients. + "prysm-geth-1": true, + "teku-besu-1": true, + "nimbus-nethermind-1": true, + // Some other failures that aren't root causes. + "lighthouse-erigon-1": false, + "lighthouse-ethereumjs-1": false, + }, + wantRootCause: []string{"lighthouse"}, + wantUnexplained: []string{}, + }, + { + name: "major root cause overrides - client failing with many peers including root causes", + targetClient: "lighthouse", + clientType: ClientTypeCL, + nodes: map[string]bool{ + // lighthouse failing with many peers (>4). + "lighthouse-geth-1": false, + "lighthouse-besu-1": false, + "lighthouse-nethermind-1": false, + "lighthouse-erigon-1": false, + "lighthouse-ethereumjs-1": false, + // Some of these peers are root causes themselves. + "teku-ethereumjs-1": false, + "prysm-ethereumjs-1": false, + "nimbus-ethereumjs-1": false, + // But lighthouse should still be a root cause due to failing with >4 peers. + }, + wantRootCause: []string{"lighthouse", "ethereumjs"}, + wantUnexplained: []string{}, + }, + { + name: "secondary root cause - EL client failing with non-root-cause peers", + targetClient: "besu", + clientType: ClientTypeEL, + nodes: map[string]bool{ + // besu failing with multiple non-root-cause CL clients + "lighthouse-besu-1": false, + "teku-besu-1": false, + "prysm-besu-1": false, + // Other EL clients healthy with these CL clients + "lighthouse-geth-1": true, + "teku-nethermind-1": true, + "prysm-erigon-1": true, + // Single failures that should be unexplained when viewed from CL perspective. + "grandine-besu-1": false, + "lodestar-besu-1": false, }, - wantRootCause: []string{}, - wantUnexplained: []string{}, - wantNotification: false, + wantRootCause: []string{"besu"}, + wantUnexplained: []string{}, // These won't show up as unexplained from besu's perspective. + }, + { + name: "secondary root cause - CL client failing with non-root-cause peers", + targetClient: "teku", + clientType: ClientTypeCL, + nodes: map[string]bool{ + // teku failing with multiple non-root-cause EL clients. + "teku-geth-1": false, + "teku-besu-1": false, + "teku-nethermind-1": false, + // Other CL clients healthy with these EL clients. + "lighthouse-geth-1": true, + "prysm-besu-1": true, + "nimbus-nethermind-1": true, + // Additional failures that don't affect root cause status. + "teku-ethereumjs-1": false, + "teku-reth-1": false, + }, + wantRootCause: []string{"teku"}, // Only teku is a root cause. + wantUnexplained: []string{}, + }, + { + name: "unexplained issues - CL clients with single failures", + targetClient: "grandine", + clientType: ClientTypeCL, + nodes: map[string]bool{ + // Single failure with besu + "grandine-besu-1": false, + // Other pairs healthy + "grandine-geth-1": true, + "grandine-reth-1": true, + }, + wantRootCause: []string{}, + wantUnexplained: []string{"grandine-besu-1"}, }, } @@ -88,35 +265,14 @@ func TestAnalyzer_RootCauseDetection(t *testing.T) { t.Run(tt.name, func(t *testing.T) { a := NewAnalyzer(tt.targetClient, tt.clientType) - // Add all node statuses. for nodeName, isHealthy := range tt.nodes { a.AddNodeStatus(nodeName, isHealthy) } - // Run analysis. result := a.Analyze() - // Check root causes. assert.ElementsMatch(t, tt.wantRootCause, result.RootCause, "root causes don't match") - - // Check unexplained issues. assert.ElementsMatch(t, tt.wantUnexplained, result.UnexplainedIssues, "unexplained issues don't match") - - // Check if notification would be sent. - shouldNotify := len(result.UnexplainedIssues) > 0 - for _, rc := range result.RootCause { - if rc == tt.targetClient { - shouldNotify = true - - break - } - } - assert.Equal(t, tt.wantNotification, shouldNotify, "notification decision incorrect") - - // If root causes were found, verify we have evidence. - for _, rc := range result.RootCause { - assert.NotEmpty(t, result.RootCauseEvidence[rc], "missing evidence for root cause %s", rc) - } }) } } diff --git a/pkg/discord/notifier.go b/pkg/discord/notifier.go index b3972a3..14dd7a3 100644 --- a/pkg/discord/notifier.go +++ b/pkg/discord/notifier.go @@ -62,7 +62,7 @@ func NewNotifier(token string, openRouterKey string) (*Notifier, error) { } // SendResults sends the analysis results to Discord. -func (n *Notifier) SendResults(channelID string, network string, targetClient string, results []*checks.Result, analysis *analyzer.AnalysisResult) error { +func (n *Notifier) SendResults(channelID string, network string, targetClient string, results []*checks.Result, analysis *analyzer.AnalysisResult, alertUnexplained bool) error { var ( hasFailures bool isRootCause bool @@ -87,8 +87,8 @@ func (n *Notifier) SendResults(channelID string, network string, targetClient st } } - // If they are neither, we're done. - if !hasUnexplainedIssues && !isRootCause { + // If they are neither, or if unexplained alerts are disabled, we're done. + if !isRootCause && (!hasUnexplainedIssues || !alertUnexplained) { return nil } @@ -295,7 +295,7 @@ func (n *Notifier) sendCategoryIssues( } // Extract instances from this category's checks. - instances := n.extractInstances(cat.failedChecks) + instances := n.extractInstances(cat.failedChecks, targetClient) if len(instances) == 0 { return nil } @@ -316,7 +316,7 @@ func (n *Notifier) sendCategoryIssues( } // extractInstances extracts instance names from check results. -func (n *Notifier) extractInstances(checks []*checks.Result) map[string]bool { +func (n *Notifier) extractInstances(checks []*checks.Result, targetClient string) map[string]bool { instances := make(map[string]bool) for _, check := range checks { @@ -333,7 +333,18 @@ func (n *Notifier) extractInstances(checks []*checks.Result) map[string]bool { } instance = strings.Split(instance, " (")[0] - instances[instance] = true + + // Split the instance name into parts + nodeParts := strings.Split(instance, "-") + if len(nodeParts) < 2 { + continue + } + + // Match exactly the CL or EL client name + if nodeParts[0] == targetClient || // CL client + (len(nodeParts) > 1 && nodeParts[1] == targetClient) { // EL client + instances[instance] = true + } } } }