Skip to content

Commit

Permalink
dashboard: add panels for Tarantool 3 configuration
Browse files Browse the repository at this point in the history
Closes #224
  • Loading branch information
DifferentialOrange committed Jul 9, 2024
1 parent 32480a7 commit abf0ad0
Show file tree
Hide file tree
Showing 12 changed files with 5,114 additions and 2,180 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added
- Panels for Tarantool 3 configuration status and alerts (#224)

## [3.0.0] - 2024-07-09
Grafana revisions:
- Tarantool 3:
Expand Down
173 changes: 173 additions & 0 deletions dashboard/panels/cluster.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,179 @@ local prometheus = grafana.prometheus;
level='critical',
),

// Appends a metrics/Tarantool minimum-version note to a panel description.
// Used by all Tarantool 3 configuration panels below so the version
// requirement is stated in one place.
// NOTE(review): "TODO: actual release" is a leftover placeholder in the
// user-visible text — confirm the metrics 1.2.0 / Tarantool 3.1.1 numbers
// and remove the TODO before release.
local tarantool3_config_description_note(description) = std.join('\n', [description, |||
Panel works with metrics 1.2.0 or newer (or Tarantool 3.1.1 or newer). TODO: actual release
|||]),

// Timeseries panel with the Tarantool 3 configuration apply status of each
// cluster instance. Each status is encoded as a number 1..6 and rendered
// back to a colored label through value mappings; the in-between ranges are
// mapped to '-' so interpolated points between two statuses do not get a
// misleading label.
tarantool3_config_status(
  cfg,
  title='Tarantool configuration status',
  description=tarantool3_config_description_note(|||
    Current Tarantool 3 configuration apply status for a cluster instance.
    `uninitialized` describes uninitialized instance,
    `check_errors` describes instance with at least one apply error,
    `check_warnings` describes instance with at least one apply warning,
    `startup_in_progress` describes instance doing initial configuration apply,
    `reload_in_progress` describes instance doing configuration apply over existing configuration,
    `ready` describes a healthy instance.
    Panel works with Grafana 8.x.
  |||),
):: timeseries.new(
  title=title,
  description=description,
  datasource=cfg.datasource,
  panel_width=12,
  max=6,
  min=1,
).addValueMapping(
  1, 'dark-red', 'uninitialized'
).addRangeMapping(
  1.001, 1.999, '-'
).addValueMapping(
  2, 'red', 'check_errors'
).addRangeMapping(
  2.001, 2.999, '-'
).addValueMapping(
  3, 'yellow', 'startup_in_progress'
).addRangeMapping(
  3.001, 3.999, '-'
).addValueMapping(
  4, 'dark-yellow', 'reload_in_progress'
).addRangeMapping(
  4.001, 4.999, '-'
).addValueMapping(
  5, 'dark-orange', 'check_warnings'
).addRangeMapping(
  5.001, 5.999, '-'
).addValueMapping(
  6, 'green', 'ready'
).addTarget(
  if cfg.type == variable.datasource_type.prometheus then
    // tnt_config_status is exported as one 0/1 series per status label;
    // multiplying each by its code and joining on(alias) collapses the six
    // series into a single numeric status per instance alias.
    local expr = std.format(
      |||
        1 * %(metric_full_name)s{%(uninitialized_filters)s} + on(alias)
        2 * %(metric_full_name)s{%(check_errors_filters)s} + on(alias)
        3 * %(metric_full_name)s{%(startup_in_progress_filters)s} + on(alias)
        4 * %(metric_full_name)s{%(reload_in_progress_filters)s} + on(alias)
        5 * %(metric_full_name)s{%(check_warnings_filters)s} + on(alias)
        6 * %(metric_full_name)s{%(ready_filters)s}
      |||, {
        metric_full_name: cfg.metrics_prefix + 'tnt_config_status',
        uninitialized_filters: common.prometheus_query_filters(cfg.filters { status: ['=', 'uninitialized'] }),
        check_errors_filters: common.prometheus_query_filters(cfg.filters { status: ['=', 'check_errors'] }),
        startup_in_progress_filters: common.prometheus_query_filters(cfg.filters { status: ['=', 'startup_in_progress'] }),
        reload_in_progress_filters: common.prometheus_query_filters(cfg.filters { status: ['=', 'reload_in_progress'] }),
        check_warnings_filters: common.prometheus_query_filters(cfg.filters { status: ['=', 'check_warnings'] }),
        ready_filters: common.prometheus_query_filters(cfg.filters { status: ['=', 'ready'] }),
      }
    );
    prometheus.target(expr=expr, legendFormat='{{alias}}')
  else if cfg.type == variable.datasource_type.influxdb then
    // Same numeric encoding for InfluxDB: six subqueries (one per status
    // label value) combined into a single "status" value per alias.
    local query = std.format(|||
      SELECT (1 * last("uninitialized") + 2 * last("check_errors") + 3 * last("startup_in_progress") +
      4 * last("reload_in_progress") + 5 * last("check_warnings") + 6 * last("ready")) as "status" FROM
      (
      SELECT "value" as "uninitialized" FROM %(measurement_with_policy)s
      WHERE ("metric_name" = '%(metric_full_name)s' AND %(uninitialized_filters)s) AND $timeFilter
      ),
      (
      SELECT "value" as "check_errors" FROM %(measurement_with_policy)s
      WHERE ("metric_name" = '%(metric_full_name)s' AND %(check_errors_filters)s) AND $timeFilter
      ),
      (
      SELECT "value" as "startup_in_progress" FROM %(measurement_with_policy)s
      WHERE ("metric_name" = '%(metric_full_name)s' AND %(startup_in_progress_filters)s) AND $timeFilter
      ),
      (
      SELECT "value" as "reload_in_progress" FROM %(measurement_with_policy)s
      WHERE ("metric_name" = '%(metric_full_name)s' AND %(reload_in_progress_filters)s) AND $timeFilter
      ),
      (
      SELECT "value" as "check_warnings" FROM %(measurement_with_policy)s
      WHERE ("metric_name" = '%(metric_full_name)s' AND %(check_warnings_filters)s) AND $timeFilter
      ),
      (
      SELECT "value" as "ready" FROM %(measurement_with_policy)s
      WHERE ("metric_name" = '%(metric_full_name)s' AND %(ready_filters)s) AND $timeFilter
      )
      GROUP BY time($__interval), "label_pairs_alias" fill(0)
    |||, {
      metric_full_name: cfg.metrics_prefix + 'tnt_config_status',
      measurement_with_policy: std.format('%(policy_prefix)s"%(measurement)s"', {
        // FIX: std.format with a named placeholder (%(policy)s) requires an
        // object argument; passing cfg.policy directly raised a runtime
        // error for any non-default retention policy.
        policy_prefix: if cfg.policy == 'default' then '' else std.format('"%(policy)s".', { policy: cfg.policy }),
        measurement: cfg.measurement,
      }),
      uninitialized_filters: common.influxdb_query_filters(cfg.filters { label_pairs_status: ['=', 'uninitialized'] }),
      check_errors_filters: common.influxdb_query_filters(cfg.filters { label_pairs_status: ['=', 'check_errors'] }),
      startup_in_progress_filters: common.influxdb_query_filters(cfg.filters { label_pairs_status: ['=', 'startup_in_progress'] }),
      reload_in_progress_filters: common.influxdb_query_filters(cfg.filters { label_pairs_status: ['=', 'reload_in_progress'] }),
      check_warnings_filters: common.influxdb_query_filters(cfg.filters { label_pairs_status: ['=', 'check_warnings'] }),
      ready_filters: common.influxdb_query_filters(cfg.filters { label_pairs_status: ['=', 'ready'] }),
    });
    influxdb.target(
      rawQuery=true,
      query=query,
      alias='$tag_label_pairs_alias',
    )
),

// Shared builder for the Tarantool 3 configuration alert-count panels.
// Plots the last value of the tnt_config_alerts metric, filtered to a single
// alert level, for each cluster instance.
// `level` is the alert severity label value to filter on ('warn' or 'error').
local tarantool3_config_alerts(
cfg,
title,
description,
level,
) = common.default_graph(
cfg,
title=title,
// The shared minimum-version note is appended to every config panel.
description=tarantool3_config_description_note(description),
min=0,
legend_avg=false,
legend_max=false,
panel_height=8,
panel_width=6,
).addTarget(
common.target(
cfg,
'tnt_config_alerts',
// The severity label is named 'level' in Prometheus output and
// 'label_pairs_level' in InfluxDB output, hence per-datasource filters.
additional_filters={
[variable.datasource_type.prometheus]: { level: ['=', level] },
[variable.datasource_type.influxdb]: { label_pairs_level: ['=', level] },
},
converter='last',
),
),

// Panel: number of 'warn'-level Tarantool 3 configuration alerts per
// instance. Thin wrapper over the shared config-alerts builder with
// warning-specific title and description.
tarantool3_config_warning_alerts(
cfg,
title='Tarantool configuration warnings',
description=|||
Number of "warn" alerts on Tarantool 3 configuration apply on a cluster instance.
"warn" alerts cover non-critical issues which do not result in apply failure,
like missing a role to grant for a user.
|||,
):: tarantool3_config_alerts(
cfg,
title=title,
description=description,
level='warn',
),

// Panel: number of 'error'-level Tarantool 3 configuration alerts per
// instance. Thin wrapper over the shared config-alerts builder with
// error-specific title and description.
tarantool3_config_error_alerts(
cfg,
title='Tarantool configuration errors',
description=|||
Number of "error" alerts on Tarantool 3 configuration apply on a cluster instance.
"error" alerts cover critical issues which results in apply failure,
like instance missing itself in configuration.
|||,
):: tarantool3_config_alerts(
cfg,
title=title,
description=description,
level='error',
),

failovers_per_second(
cfg,
title='Failovers triggered',
Expand Down
6 changes: 6 additions & 0 deletions dashboard/section.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,19 @@ local vinyl = import 'dashboard/panels/vinyl.libsonnet';
cluster.http_rps_stat(cfg) { gridPos: { w: 4, h: 5, x: 12, y: 4 } },
cluster.net_rps_stat(cfg) { gridPos: { w: 4, h: 5, x: 16, y: 4 } },
cluster.space_ops_stat(cfg) { gridPos: { w: 4, h: 5, x: 20, y: 4 } },
cluster.tarantool3_config_status(cfg),
cluster.tarantool3_config_warning_alerts(cfg),
cluster.tarantool3_config_error_alerts(cfg),
cluster.read_only_status(cfg, panel_width=24),
cluster.election_state(cfg),
cluster.election_vote(cfg),
cluster.election_leader(cfg),
cluster.election_term(cfg),
] else if cfg.type == variable.datasource_type.influxdb then [
cluster.row,
cluster.tarantool3_config_status(cfg),
cluster.tarantool3_config_warning_alerts(cfg),
cluster.tarantool3_config_error_alerts(cfg),
cluster.read_only_status(cfg, panel_width=24),
cluster.election_state(cfg),
cluster.election_vote(cfg),
Expand Down
42 changes: 42 additions & 0 deletions doc/monitoring/alerting.rst
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,48 @@ sleeps.
Some high loaded fiber has too little yields. It may be the reason of 'Too long WAL write' warnings."
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Configuration status
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

:ref:`Configuration status <config_api_reference_info>` displays
Tarantool 3 configuration apply state. Additional metrics display the count
of apply warnings and errors.

.. code-block:: yaml
- alert: ConfigWarningAlerts
expr: tnt_config_alerts{level="warn"} > 0
for: 1m
labels:
severity: warning
annotations:
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has configuration 'warn' alerts"
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has configuration 'warn' alerts.
Please, check config:info() for detailed info."
- alert: ConfigErrorAlerts
expr: tnt_config_alerts{level="error"} > 0
for: 1m
labels:
severity: page
annotations:
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has configuration 'error' alerts"
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has configuration 'error' alerts.
Latest configuration has not been applied.
Please, check config:info() for detailed info."
- alert: ConfigStatusNotReady
expr: tnt_config_status{status="ready"} == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') configuration is not ready"
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' configuration is not ready.
Please, check config:info() for detailed info."
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Cartridge issues
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
Expand Down
34 changes: 34 additions & 0 deletions example_cluster/prometheus/alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,40 @@ groups:
You are likely to hit limit soon.
It is strongly recommended to increase memtx_memory or number of storages in case of sharded data."

# Warning for configuration warning alerts.
- alert: ConfigWarningAlerts
expr: tnt_config_alerts{level="warn"} > 0
for: 1m
labels:
severity: warning
annotations:
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has configuration 'warn' alerts"
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has configuration 'warn' alerts.
Please, check config:info() for detailed info."

# Alert for configuration error alerts.
- alert: ConfigErrorAlerts
expr: tnt_config_alerts{level="error"} > 0
for: 1m
labels:
severity: page
annotations:
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has configuration 'error' alerts"
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has configuration 'error' alerts.
Latest configuration has not been applied.
Please, check config:info() for detailed info."

# Warning for configuration status.
- alert: ConfigStatusNotReady
expr: tnt_config_status{status="ready"} == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') configuration is not ready"
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' configuration is not ready.
Please, check config:info() for detailed info."

# Alert for Tarantool replication high lag (both for masters and replicas).
- alert: HighReplicationLag
expr: tnt_replication_lag > 1
Expand Down
91 changes: 91 additions & 0 deletions example_cluster/prometheus/test_alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,97 @@ tests:
exp_alerts: # no alert firing


- interval: 15s
input_series:
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="uninitialized"}'
values: '1+0x4 0+0x4 0+0x30'
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="check_errors"}'
values: '0+0x4 0+0x4 0+0x30'
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="check_warnings"}'
values: '0+0x4 0+0x4 0+0x30'
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="startup_in_progress"}'
values: '0+0x4 1+0x4 0+0x30'
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="reload_in_progress"}'
values: '0+0x4 0+0x4 0+0x30'
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="ready"}'
values: '0+0x4 0+0x4 1+0x30'
alert_rule_test:
- eval_time: 10m
alertname: ConfigStatusNotReady
exp_alerts: # no alert firing


- interval: 15s
input_series:
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="uninitialized"}'
values: '1+0x4 0+0x4 0+0x30'
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="check_errors"}'
values: '0+0x4 0+0x4 1+0x30'
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="check_warnings"}'
values: '0+0x4 0+0x4 0+0x30'
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="startup_in_progress"}'
values: '0+0x4 1+0x4 0+0x30'
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="reload_in_progress"}'
values: '0+0x4 0+0x4 0+0x30'
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="ready"}'
values: '0+0x4 0+0x4 0+0x30'
alert_rule_test:
- eval_time: 10m
alertname: ConfigStatusNotReady
exp_alerts:
- exp_labels:
severity: warning
instance: app:8081
alias: tnt_router
job: tarantool
status: ready
exp_annotations:
summary: "Instance 'tnt_router' ('tarantool') configuration is not ready"
description: "Instance 'tnt_router' of job 'tarantool' configuration is not ready.
Please, check config:info() for detailed info."


- interval: 15s
input_series:
- series: 'tnt_config_alerts{job="tarantool",instance="app:8081",alias="tnt_router",level="warn"}'
values: '1+0x10'
alert_rule_test:
- eval_time: 2m
alertname: ConfigWarningAlerts
exp_alerts:
- exp_labels:
severity: warning
instance: app:8081
alias: tnt_router
job: tarantool
level: warn
exp_annotations:
summary: "Instance 'tnt_router' ('tarantool') has configuration 'warn' alerts"
description: "Instance 'tnt_router' of job 'tarantool' has configuration 'warn' alerts.
Please, check config:info() for detailed info."


- interval: 15s
input_series:
- series: 'tnt_config_alerts{job="tarantool",instance="app:8081",alias="tnt_router",level="error"}'
values: '1+0x10'
alert_rule_test:
- eval_time: 2m
alertname: ConfigErrorAlerts
exp_alerts:
- exp_labels:
severity: page
instance: app:8081
alias: tnt_router
job: tarantool
level: error
exp_annotations:
summary: "Instance 'tnt_router' ('tarantool') has configuration 'error' alerts"
description: "Instance 'tnt_router' of job 'tarantool' has configuration 'error' alerts.
Latest configuration has not been applied.
Please, check config:info() for detailed info."


- interval: 15s
input_series:
- series: 'tnt_slab_quota_used_ratio{job="tarantool",instance="app:8081",alias="tnt_router"}'
Expand Down
Loading

0 comments on commit abf0ad0

Please sign in to comment.