diff --git a/README.md b/README.md index d409723..30c80a5 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ vertx-cluster-watchdog [![Build Status](https://travis-ci.com/swisspush/vertx-cluster-watchdog.svg?branch=master)](https://travis-ci.com/swisspush/vertx-cluster-watchdog) [![codecov](https://codecov.io/gh/swisspost/vertx-cluster-watchdog/branch/master/graph/badge.svg?token=nbrYxHDMmJ)](https://codecov.io/gh/swisspost/vertx-cluster-watchdog) -Checks if all your hazelcast cluster members are receiveing published messages over the bus. +Checks if all your hazelcast cluster members are receiving published messages over the bus. How to run the watchdog ----------------------- @@ -63,3 +63,22 @@ Tests ----- The tests try to simulate the cluster with multiple instances of the verticle. The amount of cluster members is injected over the config. + +Micrometer metrics +------------------ +When enabled, `vertx-cluster-watchdog` is monitored with Micrometer. The following metrics are available: +* cluster_watchdog_members +* cluster_watchdog_members_responded + +Example metrics: + +``` +# HELP cluster_watchdog_members Amount of members visible to the cluster +# TYPE cluster_watchdog_members gauge +cluster_watchdog_members 2.0 +# HELP cluster_watchdog_members_responded Amount of cluster members responded when accessed +# TYPE cluster_watchdog_members_responded gauge +cluster_watchdog_members_responded 2.0 +``` + +To enable the metrics, set a `MeterRegistry` instance by calling the `setMeterRegistry(MeterRegistry meterRegistry)` method of the `ClusterWatchdog` class.
\ No newline at end of file diff --git a/pom.xml b/pom.xml index 56d29ca..9a53628 100644 --- a/pom.xml +++ b/pom.xml @@ -79,6 +79,11 @@ slf4j-simple ${slf4j.version} + + io.micrometer + micrometer-core + ${micrometer.version} + junit @@ -399,6 +404,7 @@ 4.5.2 2.0.10 + 1.12.13 2.6 2.15.1 4.4 diff --git a/src/main/java/org/swisspush/vertx/cluster/ClusterWatchdog.java b/src/main/java/org/swisspush/vertx/cluster/ClusterWatchdog.java index 9f62416..009b399 100644 --- a/src/main/java/org/swisspush/vertx/cluster/ClusterWatchdog.java +++ b/src/main/java/org/swisspush/vertx/cluster/ClusterWatchdog.java @@ -1,5 +1,7 @@ package org.swisspush.vertx.cluster; +import io.micrometer.core.instrument.Gauge; +import io.micrometer.core.instrument.MeterRegistry; import io.vertx.core.AbstractVerticle; import io.vertx.core.Handler; import io.vertx.core.Promise; @@ -11,6 +13,7 @@ import java.text.SimpleDateFormat; import java.util.*; +import java.util.concurrent.atomic.AtomicLong; public class ClusterWatchdog extends AbstractVerticle { @@ -32,6 +35,9 @@ public class ClusterWatchdog extends AbstractVerticle { private Map> healthCheckResponses; private ClusterWatchdogHttpHandler clusterWatchdogHttpHandler; + private final AtomicLong atomicClusterMemberCountRequired = new AtomicLong(0); + private final AtomicLong atomicClusterMemberRespondersCount = new AtomicLong(0); + @Override public void start(Promise startPromise) { @@ -50,6 +56,7 @@ public void start(Promise startPromise) { } else { clusterMemberCount = clusterMemberCountFromConfig; } + atomicClusterMemberRespondersCount.set(0); int resultQueueLength = config.getInteger("resultQueueLength", 100); log.info("ClusterWatchdog used resultQueueLength: " + resultQueueLength); @@ -111,6 +118,15 @@ public void start(Promise startPromise) { }); } + public void setMeterRegistry(MeterRegistry meterRegistry) { + if(meterRegistry != null) { + Gauge.builder("cluster.watchdog.members", atomicClusterMemberCountRequired, AtomicLong::get) + 
.description("Amount of members visible to the cluster").register(meterRegistry); + Gauge.builder("cluster.watchdog.members.responded", atomicClusterMemberRespondersCount, AtomicLong::get) + .description("Amount of cluster members responded when accessed").register(meterRegistry); + } + } + class ClusterCheckHandler implements Handler { public void handle(Long event) { @@ -135,6 +151,8 @@ public void handle(Long event) { return; } + atomicClusterMemberCountRequired.set(clusterMemberCount); + // publish the broadcast event which will us get the response of all the registered handlers eb.publish(BROADCAST, testpayload); @@ -148,6 +166,9 @@ public void handle(Long event) { watchdogResult.time = time; watchdogResult.verticleId = uniqueId; watchdogResult.clusterMemberCount = clusterMemberCount; + + atomicClusterMemberRespondersCount.set(responses != null ? responses.size() : 0); + if(responses == null) { log.error("ClusterWatchdog found no responses for timestamp: " + timestamp); watchdogResult.status = ClusterHealthStatus.INCONSISTENT;