Skip to content
This repository has been archived by the owner on Jan 7, 2025. It is now read-only.

Commit

Permalink
[PROTO-1842] Robust status report (#109)
Browse files Browse the repository at this point in the history
  • Loading branch information
phelpsdb authored May 28, 2024
1 parent bca8e1e commit 98d1142
Show file tree
Hide file tree
Showing 9 changed files with 654 additions and 215 deletions.
4 changes: 2 additions & 2 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,14 +104,14 @@ jobs:
name: Test network status
command: |
cd ~/audius-d
audius-ctl status
audius-ctl status --ignore-health
- run:
name: Test restart node
command: |
cd ~/audius-d
IMAGE_TAG="$(sha1sum ./Dockerfile | awk '{print $1}')"
audius-ctl --debug restart discovery-1.devnet.audius-d -w -f --audius-d-version $IMAGE_TAG
audius-ctl status
audius-ctl status --ignore-health
- run:
name: Teardown
command: |
Expand Down
2 changes: 1 addition & 1 deletion cmd/audius-ctl/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ func main() {

rootCmd.Flags().BoolVarP(&displayVersion, "version", "v", false, "Display version info")
rootCmd.PersistentFlags().BoolVar(&debugLogging, "debug", false, "Print debug logs in console")
rootCmd.AddCommand(configCmd, devnetCmd, downCmd, infraCmd, jumpCmd, registerCmd, restartCmd, sbCmd, testCmd, upCmd)
rootCmd.AddCommand(configCmd, devnetCmd, downCmd, infraCmd, jumpCmd, registerCmd, restartCmd, sbCmd, statusCmd, upCmd)
registerCmd.Hidden = true // Hidden as the command is currently only for local devnet registration

// Handle interrupt/sigterm to mention logfile
Expand Down
303 changes: 283 additions & 20 deletions cmd/audius-ctl/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,45 +2,308 @@ package main

import (
"fmt"
"os"
"sort"
"sync"
"time"

"github.com/AudiusProject/audius-d/pkg/conf"
"github.com/AudiusProject/audius-d/pkg/health"
"github.com/AudiusProject/audius-d/pkg/logger"
"github.com/AudiusProject/audius-d/pkg/test"
"github.com/jedib0t/go-pretty/v6/table"
"github.com/jedib0t/go-pretty/v6/text"
"github.com/spf13/cobra"
)

// hcResult pairs one node's host name with the outcome of its health check.
type hcResult struct {
// Host is the key under which the node appears in the set of nodes being checked.
Host string
// HealthSummary is the summary returned by health.CheckNodeHealth for this host.
HealthSummary health.NodeHealthSummary
// Error is the error returned by health.CheckNodeHealth, or nil.
Error error
}

// diskUsage carries a disk's total capacity and the amount currently in use,
// both expressed in bytes. It is the cell value of the "Disk" table column.
type diskUsage struct {
	sizeBytes, usedBytes uint64
}

// Column indexes into the table.Row built by writeResultsToTable. The order
// matches the header row appended there: "Node", "Type", "Up", "Healthy",
// "Chain", "Websocket", "Client IP", "DB", "Disk", "Uptime", "Comment".
const (
nodeCol int = iota
typeCol
upCol
healthyCol
chainCol
websocketCol
ipCol
dbCol
diskCol
uptimeCol
commentCol
)

// Thresholds that drive the colouring of the "Disk" and "DB" cells in
// writeResultsToTable.
const (
// Used/size ratio below which disk usage is shown in white; at or above it
// (and below the error threshold) the cell is yellow.
diskUsageWarningThreshold float64 = 0.80
// Used/size ratio at or above which disk usage is shown in red.
diskUsageErrorThreshold float64 = 0.90
// Database size in whole GB (bytes / 1024 / 1024 / 1024). Sizes strictly
// above this are shown in white; anything else is shown in red.
dbSizeWarningThreshold uint64 = 2
)

// NOTE(review): this span comes from a diff rendered without +/- markers, so
// lines of the removed testCmd definition and of the added statusCmd
// definition appear interleaved. Confirm against the repository before
// treating it as compilable Go.
var (
testCmd = &cobra.Command{
Use: "status",
Short: "test audius-d connectivity",
// noStatus is the placeholder written into table cells that have no data.
noStatus = "n/a"
// ignoreHealth is bound to the --ignore-health flag in init.
ignoreHealth bool
statusCmd = &cobra.Command{
Use: "status [host ...]",
Short: "Check health of configured nodes",
SilenceUsage: true, // do not print --help text on failed node health
// NOTE(review): Use advertises "[host ...]" and RunE branches on len(args),
// but ExactArgs(0) looks like it would reject any host argument — confirm
// which behaviour is intended.
Args: cobra.ExactArgs(0),
RunE: func(cmd *cobra.Command, args []string) error {
ctxConfig, err := conf.ReadOrCreateContextConfig()
if err != nil {
return logger.Error("Failed to retrieve context. ", err)
}

responses, err := test.CheckNodeHealth(ctxConfig)
if err != nil {
return err
}

var encounteredError bool
for _, response := range responses {
if response.Error != nil {
fmt.Printf("%-50s Error: %v\n", response.Host, response.Error)
encounteredError = true
} else {
fmt.Printf("%-50s %t\n", response.Host, response.Result)
var nodesToCheck map[string]conf.NodeConfig
if len(args) == 0 {
nodesToCheck = ctxConfig.Nodes
} else {
nodesToCheck, err = filterNodesFromContext(args, ctxConfig)
if err != nil {
return err
}
}

if encounteredError {
return fmt.Errorf("\none or more health checks failed")
// Check every selected node concurrently. The channel is buffered to
// len(nodesToCheck), i.e. one slot per goroutine started below.
var wg sync.WaitGroup
resultsChan := make(chan hcResult, len(nodesToCheck))
for host, config := range nodesToCheck {
wg.Add(1)
go func(h string, c conf.NodeConfig) {
defer wg.Done()
result, err := health.CheckNodeHealth(h, c)
resultsChan <- hcResult{
Host: h,
HealthSummary: result,
Error: err,
}
}(host, config)
}

return nil
// Close the channel once every check has reported so the range loop
// below terminates.
go func() {
wg.Wait()
close(resultsChan)
}()

var results []hcResult
for r := range resultsChan {
results = append(results, r)
}
// Sort by host so the table order does not depend on the order in which
// the goroutines finished.
sort.Slice(results, func(i, j int) bool {
return results[i].Host < results[j].Host
})

err = writeResultsToTable(results)
// On devnet, additionally print each devnet host's health. An unhealthy
// host only sets the returned error when the node table reported none.
if ctxConfig.Network.DeployOn == conf.Devnet {
devnetHealth := health.CheckDevnetHealth()
foundUnhealthy := false
for _, h := range devnetHealth.Hosts {
if !h.Healthy {
foundUnhealthy = true
}
fmt.Printf("%s %t\n", h.Host, h.Healthy)
}
if err == nil && foundUnhealthy {
err = logger.Error("Unhealthy devnet")
}
}
return err
},
}
)

// init registers the flags that belong to the status command itself.
func init() {
	flags := statusCmd.Flags()
	flags.BoolVarP(
		&ignoreHealth,
		"ignore-health",
		"i",
		false,
		"Return non-zero only if nodes aren't up, ignoring health",
	)
}

// writeResultsToTable renders one table row per health-check result to stdout
// and reports whether any node failed its check.
//
// Every row starts with all status columns set to noStatus; columns are then
// filled in according to what the node reported:
//   - nodes that are not up keep the placeholders and always count as failed;
//   - identity nodes only get the "Healthy" column;
//   - all other nodes also get DB size, disk usage, uptime, client-IP and
//     comment data;
//   - discovery nodes additionally get chain and websocket status.
//
// A node that is up but unhealthy counts as failed unless ignoreHealth is set.
// The table is rendered in every case; the returned error is non-nil when at
// least one node counted as failed.
func writeResultsToTable(results []hcResult) error {
	const bytesPerGB = 1024 * 1024 * 1024

	t := table.NewWriter()
	t.SetStyle(table.StyleColoredMagentaWhiteOnBlack)
	t.SetOutputMirror(os.Stdout)
	t.AppendHeader(table.Row{
		"Node",
		"Type",
		"Up",
		"Healthy",
		"Chain",
		"Websocket",
		"Client IP",
		"DB",
		"Disk",
		"Uptime",
		"Comment",
	})

	// isPlaceholder reports whether a cell carries no data: either the
	// noStatus marker or a nil value (e.g. a nil error in the Comment column).
	isPlaceholder := func(val interface{}) bool {
		s := fmt.Sprint(val)
		return s == noStatus || s == "<nil>"
	}

	// healthTransformer: green for known-good values, grey for placeholders,
	// red for everything else.
	healthTransformer := text.Transformer(func(val interface{}) string {
		if isPlaceholder(val) {
			return text.FgHiBlack.Sprint(val)
		}
		switch fmt.Sprint(val) {
		case "true", "healthy", "matched":
			return text.FgGreen.Sprint(val)
		default:
			return text.FgRed.Sprint(val)
		}
	})

	// dbSizeTransformer prints the database size in whole GB; sizes that do
	// not exceed dbSizeWarningThreshold are shown in red.
	dbSizeTransformer := text.Transformer(func(val interface{}) string {
		if isPlaceholder(val) {
			return text.FgHiBlack.Sprint(val)
		}
		sizeBytes, ok := val.(uint64)
		if !ok {
			return text.FgRed.Sprint("NaN")
		}
		gb := sizeBytes / bytesPerGB
		label := fmt.Sprintf("%d GB", gb)
		if gb > dbSizeWarningThreshold {
			return text.FgWhite.Sprint(label)
		}
		return text.FgRed.Sprint(label)
	})

	// diskSizeTransformer prints "used/size GB" and colours it by fill ratio.
	diskSizeTransformer := text.Transformer(func(val interface{}) string {
		if isPlaceholder(val) {
			return text.FgHiBlack.Sprint(val)
		}
		du, ok := val.(diskUsage)
		if !ok {
			return text.FgRed.Sprint("NaN")
		}
		label := fmt.Sprintf("%d/%d GB", du.usedBytes/bytesPerGB, du.sizeBytes/bytesPerGB)
		if du.sizeBytes == 0 {
			// No capacity reported: the ratio is undefined, so flag the cell
			// instead of dividing by zero.
			return text.FgRed.Sprint(label)
		}
		// Use the byte counts, not the truncated GB figures, so that volumes
		// under 1 GiB and values near a threshold are classified correctly.
		ratio := float64(du.usedBytes) / float64(du.sizeBytes)
		switch {
		case ratio < diskUsageWarningThreshold:
			return text.FgWhite.Sprint(label)
		case ratio < diskUsageErrorThreshold:
			return text.FgYellow.Sprint(label)
		default:
			return text.FgRed.Sprint(label)
		}
	})

	// uptimeTransformer prints the uptime rounded to whole seconds.
	uptimeTransformer := text.Transformer(func(val interface{}) string {
		if isPlaceholder(val) {
			return text.FgHiBlack.Sprint(val)
		}
		dur, ok := val.(time.Duration)
		if !ok {
			return text.FgRed.Sprint("NaN")
		}
		return text.FgWhite.Sprint(dur.Round(time.Second))
	})

	t.SetColumnConfigs([]table.ColumnConfig{
		{Name: "Up", Transformer: healthTransformer},
		{Name: "Healthy", Transformer: healthTransformer},
		{Name: "Chain", Transformer: healthTransformer},
		{Name: "Websocket", Transformer: healthTransformer},
		{Name: "Client IP", Transformer: healthTransformer},
		{Name: "DB", Transformer: dbSizeTransformer},
		{Name: "Disk", Transformer: diskSizeTransformer},
		{Name: "Uptime", Transformer: uptimeTransformer},
		{Name: "Comment", Transformer: healthTransformer},
	})

	var unhealthyNode bool
	for _, res := range results {
		summary := res.HealthSummary
		row := table.Row{
			res.Host,
			summary.Type,
			summary.Up,
			noStatus,
			noStatus,
			noStatus,
			noStatus,
			noStatus,
			noStatus,
			noStatus,
			res.Error,
		}

		// A node that is down has nothing more to report.
		if !summary.Up {
			unhealthyNode = true
			t.AppendRow(row)
			continue
		}
		if !summary.Healthy && !ignoreHealth {
			unhealthyNode = true
		}
		row[healthyCol] = summary.Healthy

		// Identity nodes only get the "Healthy" column.
		if summary.Type == conf.Identity {
			t.AppendRow(row)
			continue
		}

		row[dbCol] = summary.DatabaseSizeBytes
		row[diskCol] = diskUsage{
			usedBytes: summary.DiskSpaceUsedBytes,
			sizeBytes: summary.DiskSpaceSizeBytes,
		}
		row[uptimeCol] = time.Since(summary.BootTime)
		if summary.IPCheck {
			row[ipCol] = "matched"
		} else {
			row[ipCol] = "unmatched/error"
		}
		// Only show the summary's own errors when the check itself succeeded;
		// otherwise the Comment column keeps res.Error.
		if res.Error == nil && len(summary.Errors) != 0 {
			row[commentCol] = summary.Errors
		}

		if summary.Type == conf.Discovery {
			switch {
			case !summary.ChainHealthy:
				row[chainCol] = "unhealthy"
			case !summary.ChainPortOpen:
				row[chainCol] = "Port 30300 unreachable"
			default:
				row[chainCol] = "healthy"
			}

			if summary.WebsocketHealthy {
				row[websocketCol] = "healthy"
			} else {
				row[websocketCol] = "unreachable"
			}
		}

		t.AppendRow(row)
	}

	t.Render()
	if unhealthyNode {
		return fmt.Errorf("One or more health checks failed")
	}

	return nil
}
Loading

0 comments on commit 98d1142

Please sign in to comment.