Skip to content

Commit

Permalink
refactor(analyzer): introduce concept of root-cause and an analyser t…
Browse files Browse the repository at this point in the history
…o handle alert determination
  • Loading branch information
mattevans committed Feb 5, 2025
1 parent bb5d8ba commit 1361233
Show file tree
Hide file tree
Showing 13 changed files with 581 additions and 135 deletions.
14 changes: 9 additions & 5 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package main
import (
"context"
"fmt"
"log"
"net/http"
"os"
"strings"
Expand Down Expand Up @@ -33,6 +34,9 @@ type Config struct {
}

func main() {
// Remove timestamps from log output; they make it harder to grok.
log.SetFlags(0)

var cfg Config

rootCmd := &cobra.Command{
Expand Down Expand Up @@ -114,12 +118,12 @@ func runChecks(cmd *cobra.Command, cfg Config) error {
runner := checks.NewDefaultRunner()

// Register checks.
runner.RegisterCheck(checks.NewHeadSlotCheck(grafanaClient))
runner.RegisterCheck(checks.NewCLSyncCheck(grafanaClient))
runner.RegisterCheck(checks.NewELSyncCheck(grafanaClient))
runner.RegisterCheck(checks.NewHeadSlotCheck(grafanaClient))
runner.RegisterCheck(checks.NewCLFinalizedEpochCheck(grafanaClient))
runner.RegisterCheck(checks.NewCLPeerCountCheck(grafanaClient))
runner.RegisterCheck(checks.NewELSyncCheck(grafanaClient))
runner.RegisterCheck(checks.NewELPeerCountCheck(grafanaClient))
runner.RegisterCheck(checks.NewCLFinalizedEpochCheck(grafanaClient))
runner.RegisterCheck(checks.NewELBlockHeightCheck(grafanaClient))

// Determine if we're running checks for a specific client.
Expand All @@ -131,7 +135,7 @@ func runChecks(cmd *cobra.Command, cfg Config) error {
}

// Execute the checks.
results, err := runner.RunChecks(context.Background(), checks.Config{
results, analysis, err := runner.RunChecks(context.Background(), checks.Config{
Network: cfg.Network,
ConsensusNode: cfg.ConsensusNode,
ExecutionNode: cfg.ExecutionNode,
Expand All @@ -142,7 +146,7 @@ func runChecks(cmd *cobra.Command, cfg Config) error {
}

// Send results to Discord.
if err := discordNotifier.SendResults(cfg.DiscordChannel, cfg.Network, targetClient, results); err != nil {
if err := discordNotifier.SendResults(cfg.DiscordChannel, cfg.Network, targetClient, results, analysis); err != nil {
return fmt.Errorf("failed to send discord notification: %w", err)
}

Expand Down
188 changes: 188 additions & 0 deletions pkg/analyzer/analyzer.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
package analyzer

import (
	"fmt"
	"log"
	"sort"
	"strings"
)

// Analyzer collects per-node health statuses for a single target client and
// derives root causes and unexplained issues from them via Analyze.
type Analyzer struct {
// nodeStatusMap groups every recorded NodeStatus by the CL/EL client pair
// parsed from its node name.
nodeStatusMap NodeStatusMap
// targetClient is the client name whose unhealthy nodes Analyze reports on.
targetClient string
// clientType selects whether targetClient is compared against the CL or the
// EL side of each client pair.
clientType ClientType
}

// NewAnalyzer returns an Analyzer for the given target client and client
// type, ready to accept node statuses through AddNodeStatus.
func NewAnalyzer(targetClient string, clientType ClientType) *Analyzer {
	a := new(Analyzer)

	// The map must be non-nil because AddNodeStatus writes into it directly.
	a.nodeStatusMap = NodeStatusMap{}
	a.targetClient = targetClient
	a.clientType = clientType

	return a
}

// AddNodeStatus records the health of one node, filing it under the CL/EL
// client pair parsed from nodeName.
func (a *Analyzer) AddNodeStatus(nodeName string, isHealthy bool) {
	key := ParseClientPair(nodeName)
	entry := NodeStatus{Name: nodeName, IsHealthy: isHealthy}

	// Looking up a missing key yields a nil slice, which append grows as
	// needed, so a first-time pair needs no separate initialization.
	a.nodeStatusMap[key] = append(a.nodeStatusMap[key], entry)
}

// Analyze derives root causes and unexplained issues from the recorded node
// statuses and logs a summary of what it found.
//
// When the analyzer targets a CL client, an EL client is reported as a root
// cause once it has unhealthy nodes paired with more than two distinct CL
// clients; a human-readable justification is stored in RootCauseEvidence
// under the EL client's name. No root causes are derived when the analyzer
// targets an EL client.
//
// Every unhealthy node that belongs to the target client and whose
// counterpart client is not a root cause is listed, by node name, in
// UnexplainedIssues.
//
// RootCause, UnexplainedIssues and the logged lines are sorted so the outcome
// does not depend on Go's randomized map iteration order. The returned result
// is never nil, and all of its slices and maps are non-nil.
func (a *Analyzer) Analyze() *AnalysisResult {
	// An EL client failing with more than this many distinct CL clients is
	// treated as the root cause rather than any of the individual CL clients.
	const rootCauseThreshold = 2

	result := &AnalysisResult{
		RootCause:         make([]string, 0),
		UnexplainedIssues: make([]string, 0),
		AffectedNodes:     make(map[string][]string),
		RootCauseEvidence: make(map[string]string),
	}

	log.Printf("\n=== Analyzing %s (%s)", a.targetClient, a.clientType)

	// reported tracks whether any finding was logged, so that the
	// "No issues to analyze" line is only emitted when nothing else was.
	reported := false

	// For CL clients, check whether any EL client is failing across several CL
	// clients. If the same EL client fails no matter which CL client it is
	// paired with, the EL client is the likely culprit.
	if a.clientType == ClientTypeCL {
		failing := a.failingCLsByEL()

		elClients := make([]string, 0, len(failing))
		for el := range failing {
			elClients = append(elClients, el)
		}

		sort.Strings(elClients)

		for _, el := range elClients {
			failingCLs := failing[el]
			reported = true

			log.Printf(
				" - %s is failing with CL clients: %s",
				el,
				strings.Join(failingCLs, ", "),
			)

			if len(failingCLs) > rootCauseThreshold {
				result.RootCause = append(result.RootCause, el)
				result.RootCauseEvidence[el] = fmt.Sprintf(
					"Failing with %d CL clients: %s",
					len(failingCLs),
					strings.Join(failingCLs, ", "),
				)
			}
		}
	}

	// Collect unhealthy nodes of the target client that no root cause explains.
	for pair, statuses := range a.nodeStatusMap {
		own, counterpart, ok := a.splitPair(pair)

		// Only consider issues with our target client. We don't want to include
		// noise about other clients in the individual client notifications.
		if !ok || own != a.targetClient {
			continue
		}

		if contains(result.RootCause, counterpart) {
			continue
		}

		for _, status := range statuses {
			if !status.IsHealthy {
				result.UnexplainedIssues = append(result.UnexplainedIssues, status.Name)
			}
		}
	}

	result.UnexplainedIssues = unique(result.UnexplainedIssues)
	sort.Strings(result.UnexplainedIssues)

	for _, issue := range result.UnexplainedIssues {
		reported = true

		log.Printf(" - %s (unexplained issue)", issue)
	}

	if !reported {
		log.Printf(" - No issues to analyze")
	}

	return result
}

// failingCLsByEL maps each non-empty EL client name to the sorted, distinct
// CL clients it is paired with on at least one unhealthy node. EL clients
// without any unhealthy node are absent from the map.
func (a *Analyzer) failingCLsByEL() map[string][]string {
	failing := make(map[string][]string)

	for pair, statuses := range a.nodeStatusMap {
		// Node names that could not be parsed end up under an empty client name.
		if pair.ELClient == "" || !anyUnhealthy(statuses) {
			continue
		}

		if !contains(failing[pair.ELClient], pair.CLClient) {
			failing[pair.ELClient] = append(failing[pair.ELClient], pair.CLClient)
		}
	}

	for el := range failing {
		sort.Strings(failing[el])
	}

	return failing
}

// splitPair returns the side of pair that matches the analyzer's client type
// (own) and the opposite side (counterpart). ok is false when the client type
// is neither ClientTypeCL nor ClientTypeEL.
func (a *Analyzer) splitPair(pair ClientPair) (own, counterpart string, ok bool) {
	switch a.clientType {
	case ClientTypeCL:
		return pair.CLClient, pair.ELClient, true
	case ClientTypeEL:
		return pair.ELClient, pair.CLClient, true
	default:
		return "", "", false
	}
}

// anyUnhealthy reports whether at least one of the statuses is unhealthy.
func anyUnhealthy(statuses []NodeStatus) bool {
	for _, status := range statuses {
		if !status.IsHealthy {
			return true
		}
	}

	return false
}

// contains reports whether str is an element of slice. A nil or empty slice
// contains nothing.
func contains(slice []string, str string) bool {
	for i := 0; i < len(slice); i++ {
		if slice[i] != str {
			continue
		}

		return true
	}

	return false
}

// unique returns the distinct elements of slice in order of first
// appearance. The result is always a non-nil slice, even for empty input.
func unique(slice []string) []string {
	deduped := make([]string, 0)
	known := make(map[string]struct{})

	for _, item := range slice {
		if _, dup := known[item]; dup {
			continue
		}

		known[item] = struct{}{}
		deduped = append(deduped, item)
	}

	return deduped
}
71 changes: 71 additions & 0 deletions pkg/analyzer/types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package analyzer

import (
"fmt"
"strings"
)

// ClientType identifies which side of a CL/EL client pair a client belongs
// to.
type ClientType string

const (
// ClientTypeEL marks an EL (execution-layer) client.
ClientTypeEL ClientType = "EL"
// ClientTypeCL marks a CL (consensus-layer) client.
ClientTypeCL ClientType = "CL"
)

// NodeStatus records the health of a single node.
type NodeStatus struct {
// Name is the full node name the status was recorded for.
Name string
// IsHealthy is false when the node has an issue that needs analyzing.
IsHealthy bool
}

// AnalysisResult is the outcome of an analysis run.
type AnalysisResult struct {
// RootCause lists the clients determined to be a root cause.
RootCause []string
// UnexplainedIssues lists the issues that no root cause accounts for.
UnexplainedIssues []string
// AffectedNodes maps an issue type to the nodes affected by it.
// NOTE(review): the analyzer in this package allocates this map but does not
// appear to populate it — confirm whether another caller fills it in.
AffectedNodes map[string][]string
// RootCauseEvidence maps each root cause to the evidence for choosing it.
RootCauseEvidence map[string]string
}

// ClientPair represents a CL-EL client combination.
type ClientPair struct {
	CLClient string
	ELClient string
}

// String returns the pair formatted as "<CLClient>-<ELClient>".
func (cp ClientPair) String() string {
	return fmt.Sprintf("%s-%s", cp.CLClient, cp.ELClient)
}

// ParseClientPair extracts the CL and EL client names from a node name.
//
// Two dash-separated layouts are recognized:
//
//	pectra-devnet-6-<cl>-<el>-<number>
//	<cl>-<el>-<number>
//
// A name that has too few segments for its layout yields the zero ClientPair.
// This includes prefixed names that are not CL/EL nodes, such as
// "pectra-devnet-6-bootnode-1", which would otherwise be misread as the pair
// CL="6", EL="bootnode".
func ParseClientPair(nodeName string) ClientPair {
	const (
		// networkPrefix is the only network prefix currently recognized.
		networkPrefix = "pectra-devnet-6-"
		// prefixedMinParts is three prefix segments plus cl, el and number.
		prefixedMinParts = 6
		// bareMinParts is cl, el and number.
		bareMinParts = 3
	)

	parts := strings.Split(nodeName, "-")

	if strings.HasPrefix(nodeName, networkPrefix) {
		if len(parts) < prefixedMinParts {
			return ClientPair{}
		}

		// Count from the end so the prefix segments are never mistaken for
		// client names.
		return ClientPair{
			CLClient: parts[len(parts)-3],
			ELClient: parts[len(parts)-2],
		}
	}

	if len(parts) < bareMinParts {
		return ClientPair{}
	}

	return ClientPair{
		CLClient: parts[0],
		ELClient: parts[1],
	}
}

// NodeStatusMap holds the recorded node statuses, keyed by the CL/EL client
// pair each node belongs to. A pair maps to one entry per recorded node.
type NodeStatusMap map[ClientPair][]NodeStatus
Loading

0 comments on commit 1361233

Please sign in to comment.