Skip to content

Commit

Permalink
Prom vecs (#219)
Browse files Browse the repository at this point in the history
* FEAT: Add chain and node metrics

* PATCH: Fix linting

* add drained nonce cache metrics

---------

Co-authored-by: gustavobelfort <[email protected]>
  • Loading branch information
agouin and Gustavobelfort authored Nov 17, 2023
1 parent 73350e2 commit 234ea43
Show file tree
Hide file tree
Showing 4 changed files with 171 additions and 97 deletions.
211 changes: 139 additions & 72 deletions signer/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,48 +74,79 @@ var (
metricsTimeKeeper = newMetricsTimer()

// Prometheus Metrics
totalPubKeyRequests = promauto.NewCounter(prometheus.CounterOpts{
Name: "signer_total_pubkey_requests",
Help: "Total times public key requested (High count may indicate validator restarts)",
})
lastPrecommitHeight = promauto.NewGauge(prometheus.GaugeOpts{
Name: "signer_last_precommit_height",
Help: "Last Height Precommit Signed",
})
lastPrevoteHeight = promauto.NewGauge(prometheus.GaugeOpts{
Name: "signer_last_prevote_height",
Help: "Last Height Prevote Signed",
})
totalPubKeyRequests = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "signer_total_pubkey_requests",
Help: "Total times public key requested (High count may indicate validator restarts)",
},
[]string{"chain_id"},
)
lastPrecommitHeight = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "signer_last_precommit_height",
Help: "Last Height Precommit Signed",
},
[]string{"chain_id"},
)

lastProposalHeight = promauto.NewGauge(prometheus.GaugeOpts{
Name: "signer_last_proposal_height",
Help: "Last Height Proposal Signed",
})
lastPrecommitRound = promauto.NewGauge(prometheus.GaugeOpts{
Name: "signer_last_precommit_round",
Help: "Last Round Precommit Signed",
})
lastPrevoteRound = promauto.NewGauge(prometheus.GaugeOpts{
Name: "signer_last_prevote_round",
Help: "Last Round Prevote Signed",
})
lastProposalRound = promauto.NewGauge(prometheus.GaugeOpts{
Name: "signer_last_proposal_round",
Help: "Last Round Proposal Signed",
})
lastPrevoteHeight = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "signer_last_prevote_height",
Help: "Last Height Prevote Signed",
},
[]string{"chain_id"},
)

totalPrecommitsSigned = promauto.NewCounter(prometheus.CounterOpts{
Name: "signer_total_precommits_signed",
Help: "Total Precommit Signed",
})
totalPrevotesSigned = promauto.NewCounter(prometheus.CounterOpts{
Name: "signer_total_prevotes_signed",
Help: "Total Prevote Signed",
})
totalProposalsSigned = promauto.NewCounter(prometheus.CounterOpts{
Name: "signer_total_proposals_signed",
Help: "Total Proposal Signed",
})
lastProposalHeight = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "signer_last_proposal_height",
Help: "Last Height Proposal Signed",
},
[]string{"chain_id"},
)
lastPrecommitRound = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "signer_last_precommit_round",
Help: "Last Round Precommit Signed",
},
[]string{"chain_id"},
)
lastPrevoteRound = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "signer_last_prevote_round",
Help: "Last Round Prevote Signed",
},
[]string{"chain_id"},
)
lastProposalRound = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "signer_last_proposal_round",
Help: "Last Round Proposal Signed",
},
[]string{"chain_id"},
)

totalPrecommitsSigned = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "signer_total_precommits_signed",
Help: "Total Precommit Signed",
},
[]string{"chain_id"},
)
totalPrevotesSigned = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "signer_total_prevotes_signed",
Help: "Total Prevote Signed",
},
[]string{"chain_id"},
)
totalProposalsSigned = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "signer_total_proposals_signed",
Help: "Total Proposal Signed",
},
[]string{"chain_id"},
)

secondsSinceLastPrecommit = promauto.NewGauge(prometheus.GaugeOpts{
Name: "signer_seconds_since_last_precommit",
Expand All @@ -140,22 +171,34 @@ var (
"(Should not increase beyond block time; If high, may indicate raft joining issue for CoSigner) ",
})

missedPrecommits = promauto.NewGauge(prometheus.GaugeOpts{
Name: "signer_missed_precommits",
Help: "Consecutive Precommit Missed",
})
missedPrevotes = promauto.NewGauge(prometheus.GaugeOpts{
Name: "signer_missed_prevotes",
Help: "Consecutive Prevote Missed",
})
totalMissedPrecommits = promauto.NewCounter(prometheus.CounterOpts{
Name: "signer_total_missed_precommits",
Help: "Total Precommit Missed",
})
totalMissedPrevotes = promauto.NewCounter(prometheus.CounterOpts{
Name: "signer_total_missed_prevotes",
Help: "Total Prevote Missed",
})
missedPrecommits = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "signer_missed_precommits",
Help: "Consecutive Precommit Missed",
},
[]string{"chain_id"},
)
missedPrevotes = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "signer_missed_prevotes",
Help: "Consecutive Prevote Missed",
},
[]string{"chain_id"},
)
totalMissedPrecommits = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "signer_total_missed_precommits",
Help: "Total Precommit Missed",
},
[]string{"chain_id"},
)
totalMissedPrevotes = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "signer_total_missed_prevotes",
Help: "Total Prevote Missed",
},
[]string{"chain_id"},
)

missedNonces = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Expand All @@ -171,24 +214,48 @@ var (
},
[]string{"peerid"},
)
drainedNonceCache = promauto.NewGauge(
prometheus.GaugeOpts{
Name: "signer_drained_nonce_cache",
Help: "Consecutive Nonces Requested When Cache is Drained",
},
)
totalDrainedNonceCache = promauto.NewCounter(
prometheus.CounterOpts{
Name: "signer_total_drained_nonce_cache",
Help: "Total Nonces Requested When Cache is Drained",
},
)

sentryConnectTries = promauto.NewGauge(prometheus.GaugeOpts{
Name: "signer_sentry_connect_tries",
Help: "Consecutive Number of times sentry TCP connect has been tried (High count may indicate validator restarts)",
})
totalSentryConnectTries = promauto.NewCounter(prometheus.CounterOpts{
Name: "signer_total_sentry_connect_tries",
Help: "Total Number of times sentry TCP connect has been tried (High count may indicate validator restarts)",
})
sentryConnectTries = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Name: "signer_sentry_connect_tries",
Help: "Consecutive Number of times sentry TCP connect has been tried (High count may indicate validator restarts)",
},
[]string{"node"},
)
totalSentryConnectTries = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "signer_total_sentry_connect_tries",
Help: "Total Number of times sentry TCP connect has been tried (High count may indicate validator restarts)",
},
[]string{"node"},
)

beyondBlockErrors = promauto.NewCounter(prometheus.CounterOpts{
Name: "signer_total_beyond_block_errors",
Help: "Total Times Signing Started but duplicate height/round request arrives",
})
failedSignVote = promauto.NewCounter(prometheus.CounterOpts{
Name: "signer_total_failed_sign_vote",
Help: "Total Times Signer Failed to sign block - Unstarted and Unexepcted Height",
})
beyondBlockErrors = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "signer_total_beyond_block_errors",
Help: "Total Times Signing Started but duplicate height/round request arrives",
},
[]string{"chain_id"},
)
failedSignVote = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "signer_total_failed_sign_vote",
Help: "Total Times Signer Failed to sign block - Unstarted and Unexepcted Height",
},
[]string{"chain_id"},
)

totalRaftLeader = promauto.NewCounter(prometheus.CounterOpts{
Name: "signer_total_raft_leader",
Expand Down
8 changes: 4 additions & 4 deletions signer/remote_signer.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,14 +106,14 @@ func (rs *ReconnRemoteSigner) loop(ctx context.Context) {
timer := time.NewTimer(connRetrySec * time.Second)
conn, err = rs.establishConnection(ctx)
if err == nil {
sentryConnectTries.Set(0)
sentryConnectTries.WithLabelValues(rs.address).Set(0)
timer.Stop()
rs.Logger.Info("Connected to Sentry", "address", rs.address)
break
}

sentryConnectTries.Add(1)
totalSentryConnectTries.Inc()
sentryConnectTries.WithLabelValues(rs.address).Add(1)
totalSentryConnectTries.WithLabelValues(rs.address).Inc()
retries++
rs.Logger.Error(
"Error establishing connection, will retry",
Expand Down Expand Up @@ -226,7 +226,7 @@ func (rs *ReconnRemoteSigner) handleSignProposalRequest(
}

func (rs *ReconnRemoteSigner) handlePubKeyRequest(chainID string) cometprotoprivval.Message {
totalPubKeyRequests.Inc()
totalPubKeyRequests.WithLabelValues(chainID).Inc()
msgSum := &cometprotoprivval.Message_PubKeyResponse{PubKeyResponse: &cometprotoprivval.PubKeyResponse{
PubKey: cometprotocrypto.PublicKey{},
Error: nil,
Expand Down
42 changes: 22 additions & 20 deletions signer/remote_signer_grpc_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,15 @@ func (s *RemoteSignerGRPCServer) OnStop() {
}

func (s *RemoteSignerGRPCServer) PubKey(ctx context.Context, req *proto.PubKeyRequest) (*proto.PubKeyResponse, error) {
totalPubKeyRequests.Inc()
chainID := req.ChainId

pubKey, err := s.validator.GetPubKey(ctx, req.ChainId)
totalPubKeyRequests.WithLabelValues(chainID).Inc()

pubKey, err := s.validator.GetPubKey(ctx, chainID)
if err != nil {
s.logger.Error(
"Failed to get Pub Key",
"chain_id", req.ChainId,
"chain_id", chainID,
"error", err,
)
return nil, err
Expand Down Expand Up @@ -111,7 +113,7 @@ func signAndTrack(
"round", block.Round,
"reason", typedErr.msg,
)
beyondBlockErrors.Inc()
beyondBlockErrors.WithLabelValues(chainID).Inc()
default:
logger.Error(
"Failed to sign",
Expand All @@ -121,7 +123,7 @@ func signAndTrack(
"round", block.Round,
"error", err,
)
failedSignVote.Inc()
failedSignVote.WithLabelValues(chainID).Inc()
}
return nil, block.Timestamp, err
}
Expand All @@ -143,41 +145,41 @@ func signAndTrack(

switch block.Step {
case stepPropose:
lastProposalHeight.Set(float64(block.Height))
lastProposalRound.Set(float64(block.Round))
totalProposalsSigned.Inc()
lastProposalHeight.WithLabelValues(chainID).Set(float64(block.Height))
lastProposalRound.WithLabelValues(chainID).Set(float64(block.Round))
totalProposalsSigned.WithLabelValues(chainID).Inc()
case stepPrevote:
// Determine number of heights since the last Prevote
stepSize := block.Height - previousPrevoteHeight
if previousPrevoteHeight != 0 && stepSize > 1 {
missedPrevotes.Add(float64(stepSize))
totalMissedPrevotes.Add(float64(stepSize))
missedPrevotes.WithLabelValues(chainID).Add(float64(stepSize))
totalMissedPrevotes.WithLabelValues(chainID).Add(float64(stepSize))
} else {
missedPrevotes.Set(0)
missedPrevotes.WithLabelValues(chainID).Set(0)
}

previousPrevoteHeight = block.Height // remember last PrevoteHeight

metricsTimeKeeper.SetPreviousPrevote(time.Now())

lastPrevoteHeight.Set(float64(block.Height))
lastPrevoteRound.Set(float64(block.Round))
totalPrevotesSigned.Inc()
lastPrevoteHeight.WithLabelValues(chainID).Set(float64(block.Height))
lastPrevoteRound.WithLabelValues(chainID).Set(float64(block.Round))
totalPrevotesSigned.WithLabelValues(chainID).Inc()
case stepPrecommit:
stepSize := block.Height - previousPrecommitHeight
if previousPrecommitHeight != 0 && stepSize > 1 {
missedPrecommits.Add(float64(stepSize))
totalMissedPrecommits.Add(float64(stepSize))
missedPrecommits.WithLabelValues(chainID).Add(float64(stepSize))
totalMissedPrecommits.WithLabelValues(chainID).Add(float64(stepSize))
} else {
missedPrecommits.Set(0)
missedPrecommits.WithLabelValues(chainID).Set(0)
}
previousPrecommitHeight = block.Height // remember last PrecommitHeight

metricsTimeKeeper.SetPreviousPrecommit(time.Now())

lastPrecommitHeight.Set(float64(block.Height))
lastPrecommitRound.Set(float64(block.Round))
totalPrecommitsSigned.Inc()
lastPrecommitHeight.WithLabelValues(chainID).Set(float64(block.Height))
lastPrecommitRound.WithLabelValues(chainID).Set(float64(block.Round))
totalPrecommitsSigned.WithLabelValues(chainID).Inc()
}

return signature, timestamp, nil
Expand Down
7 changes: 6 additions & 1 deletion signer/threshold_validator.go
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,9 @@ func (pv *ThresholdValidator) getNoncesFallback(
) (*CosignerUUIDNonces, []Cosigner, error) {
nonces := make(map[Cosigner]CosignerNonces)

drainedNonceCache.Inc()
totalDrainedNonceCache.Inc()

var wg sync.WaitGroup
wg.Add(pv.threshold)

Expand Down Expand Up @@ -527,7 +530,7 @@ func (pv *ThresholdValidator) waitForPeerNonces(
peerStartTime := time.Now()
peerNonces, err := peer.GetNonces(ctx, []uuid.UUID{u})
if err != nil {
missedNonces.WithLabelValues(peer.GetAddress()).Add(float64(1))
missedNonces.WithLabelValues(peer.GetAddress()).Inc()
totalMissedNonces.WithLabelValues(peer.GetAddress()).Inc()

pv.logger.Error("Error getting nonces", "cosigner", peer.GetID(), "err", err)
Expand Down Expand Up @@ -668,6 +671,8 @@ func (pv *ThresholdValidator) Sign(ctx context.Context, chainID string, block Bl
return nil, stamp, fmt.Errorf("failed to get nonces: %w", errors.Join(err, fallbackErr))
}
dontIterateFastestCosigners = true
} else {
drainedNonceCache.Set(0)
}

nextFastestCosignerIndex := pv.threshold - 1
Expand Down

0 comments on commit 234ea43

Please sign in to comment.