diff --git a/cmds/modules/zui/header.go b/cmds/modules/zui/header.go index c165109e5..3d8d6edd4 100644 --- a/cmds/modules/zui/header.go +++ b/cmds/modules/zui/header.go @@ -76,7 +76,7 @@ func headerRenderer(ctx context.Context, c zbus.Client, h *widgets.Paragraph, r } cache := green("OK") - if app.CheckFlag(app.LimitedCache) { + if app.CheckFlag(app.LimitedCache) || app.CheckFlag(app.ReadonlyCache) { cache = red("no ssd disks detected") } diff --git a/cmds/modules/zui/main.go b/cmds/modules/zui/main.go index 37c7cfe3b..e3138a11a 100644 --- a/cmds/modules/zui/main.go +++ b/cmds/modules/zui/main.go @@ -93,9 +93,11 @@ func action(ctx *cli.Context) error { resources.SetRect(0, 14, width, 22) resources.Border = false - errorsGrid := ui.NewGrid() - errorsGrid.Title = "Errors" - errorsGrid.SetRect(0, 22, width, 26) + errorsParagraph := widgets.NewParagraph() + errorsParagraph.Title = "Errors" + errorsParagraph.SetRect(0, 22, width, 26) + errorsParagraph.Border = true + errorsParagraph.WrapText = true var flag signalFlag @@ -111,7 +113,7 @@ func action(ctx *cli.Context) error { log.Error().Err(err).Msg("failed to start resources renderer") } - mod := zui.New(ctx.Context, errorsGrid, &flag) + mod := zui.New(ctx.Context, errorsParagraph, &flag) server.Register(zbus.ObjectID{Name: module, Version: "0.0.1"}, mod) @@ -123,7 +125,7 @@ func action(ctx *cli.Context) error { }() render := func() { - ui.Render(header, netgrid, resources, errorsGrid) + ui.Render(header, netgrid, resources, errorsParagraph) } render() @@ -138,6 +140,7 @@ func action(ctx *cli.Context) error { case "": payload := e.Payload.(ui.Resize) header.SetRect(0, 0, payload.Width, 3) + errorsParagraph.SetRect(0, 22, payload.Width, 26) // grid.SetRect(0, 3, payload.Width, payload.Height) ui.Clear() render() diff --git a/docs/tasks/healthcheck.md b/docs/tasks/healthcheck.md new file mode 100644 index 000000000..18694f2bf --- /dev/null +++ b/docs/tasks/healthcheck.md @@ -0,0 +1,29 @@ +# HealthCheck + +## Overview + 
+The health check task runs a set of checks over ZOS components to determine whether the node is in a usable state, and sets flags that tell the Power Daemon to stop uptime reports if the node is unusable. + +## Configuration + +- Name: `healthcheck` +- Schedule: Every 20 mins. + +## Details + +- Checks whether the node cache disk is usable by trying to write some data to it. If the write fails, it sets the Readonly flag. + +## Result Sample + +```json +{ + "description": "health check task runs multiple checks to ensure the node is in a usable state and set flags for the power daemon to stop reporting uptime if it is not usable", + "name": "healthcheck", + "result": { + "cache": [ + "failed to write to cache: open /var/cache/healthcheck: operation not permitted" + ] + }, + "timestamp": 1701599580 +} +``` diff --git a/pkg/perf/healthcheck/healthcheck.go b/pkg/perf/healthcheck/healthcheck.go index 1f081a4ef..f1c1e786c 100644 --- a/pkg/perf/healthcheck/healthcheck.go +++ b/pkg/perf/healthcheck/healthcheck.go @@ -79,14 +79,16 @@ func (h *healthcheckTask) Run(ctx context.Context) (interface{}, error) { func cacheCheck(ctx context.Context) (string, error) { const label = "cache" - _, err := os.Create("/var/cache/healthcheck") + const checkFile = "/var/cache/healthcheck" + + _, err := os.Create(checkFile) if err != nil { if err := app.SetFlag(app.ReadonlyCache); err != nil { log.Error().Err(err).Msg("failed to set readonly flag") } return label, fmt.Errorf("failed to write to cache: %w", err) } - defer os.Remove("/var/cache/healthcheck") + defer os.Remove(checkFile) if err := app.DeleteFlag(app.ReadonlyCache); err != nil { log.Error().Err(err).Msg("failed to delete readonly flag") diff --git a/pkg/power/uptime.go b/pkg/power/uptime.go index c442d6f82..f71bedd15 100644 --- a/pkg/power/uptime.go +++ b/pkg/power/uptime.go @@ -12,6 +12,7 @@ import ( "github.com/rs/zerolog/log" "github.com/shirou/gopsutil/host" substrate "github.com/threefoldtech/tfchain/clients/tfchain-client-go" + 
"github.com/threefoldtech/zos/pkg/app" "github.com/threefoldtech/zos/pkg/utils" ) @@ -61,7 +62,7 @@ func (u *Uptime) SendNow() (types.Hash, error) { } func (u *Uptime) uptime(ctx context.Context) error { - for { + report := func() error { log.Debug().Msg("updating node uptime") hash, err := u.SendNow() if err != nil { @@ -71,6 +72,23 @@ func (u *Uptime) uptime(ctx context.Context) error { u.Mark.Signal() log.Info().Str("hash", hash.Hex()).Msg("node uptime hash") + return nil + } + for { + unusable := false + if app.CheckFlag(app.ReadonlyCache) { + log.Error().Msg("node cache is read only") + unusable = true + } + if app.CheckFlag(app.LimitedCache) { + log.Error().Msg("node is running on limited cache") + unusable = true + } + if unusable { + log.Error().Msg("node is not usable skipping uptime reports") + } else if err := report(); err != nil { + return err + } select { case <-ctx.Done(): diff --git a/pkg/zui/zui.go b/pkg/zui/zui.go index 5adb3de07..4fc8af988 100644 --- a/pkg/zui/zui.go +++ b/pkg/zui/zui.go @@ -6,19 +6,18 @@ import ( "sync" "time" - ui "github.com/gizak/termui/v3" "github.com/gizak/termui/v3/widgets" "github.com/threefoldtech/zos/pkg" ) type module struct { - grid *ui.Grid - render Signaler - labels []labelData - table *widgets.Table - mu *sync.Mutex + render Signaler + labels []labelData + paragraph *widgets.Paragraph + mu *sync.Mutex } +// Signaler interface to signal ZUI to render some element. type Signaler interface { Signal() } @@ -28,25 +27,13 @@ type labelData struct { errors []string } -func New(ctx context.Context, grid *ui.Grid, render Signaler) pkg.ZUI { - table := widgets.NewTable() - grid.Set( - ui.NewRow(1.0, table), - ) - table.Title = "Errors" - table.FillRow = true - table.RowSeparator = false - - table.Rows = [][]string{ - {"[No Errors!](fg:green)"}, - } - +// New returns a new ZUI module. 
+func New(ctx context.Context, p *widgets.Paragraph, render Signaler) pkg.ZUI { zuiModule := &module{ - grid: grid, - render: render, - table: table, - labels: make([]labelData, 0), - mu: &sync.Mutex{}, + render: render, + labels: make([]labelData, 0), + paragraph: p, + mu: &sync.Mutex{}, } go zuiModule.renderErrors(ctx) return zuiModule @@ -54,6 +41,8 @@ func New(ctx context.Context, grid *ui.Grid, render Signaler) pkg.ZUI { var _ pkg.ZUI = (*module)(nil) +// PushErrors pushes the given errors to the ZUI module to be displayed. +// It can also stop displaying a certain label by sending an empty errors slice. func (m *module) PushErrors(label string, errors []string) { m.mu.Lock() defer m.mu.Unlock() @@ -76,22 +65,19 @@ func (m *module) renderErrors(ctx context.Context) { labels := make([]labelData, len(m.labels)) copy(labels, m.labels) m.mu.Unlock() - display(labels, m.table, m.render) + display(labels, m.paragraph, m.render) // in case nothing got displayed <-time.After(2 * time.Second) } } } -func display(labels []labelData, table *widgets.Table, render Signaler) { - table.Rows = [][]string{ - {"[No Errors!](fg:green)"}, - } +func display(labels []labelData, p *widgets.Paragraph, render Signaler) { + p.Text = "[No Errors!](fg:green)" + for _, label := range labels { for _, e := range label.errors { - table.Rows = [][]string{ - {fmt.Sprintf("%s: [%s](fg:red)", label.label, e)}, - } + p.Text = fmt.Sprintf("%s: [%s](fg:red)", label.label, e) render.Signal() <-time.After(2 * time.Second) }