From 42227487d42b64c600f257950ce93813b151b5fc Mon Sep 17 00:00:00 2001 From: Xun Li Date: Wed, 30 Oct 2024 16:16:07 -0700 Subject: [PATCH] Add system_invariant_violation macro --- crates/mysten-common/src/logging.rs | 5 ++++- crates/mysten-metrics/src/lib.rs | 12 ++++++++++-- crates/sui-core/src/authority.rs | 13 ++----------- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/crates/mysten-common/src/logging.rs b/crates/mysten-common/src/logging.rs index 8ba327026953b..3cd5c9ad0a371 100644 --- a/crates/mysten-common/src/logging.rs +++ b/crates/mysten-common/src/logging.rs @@ -15,8 +15,11 @@ macro_rules! debug_fatal { if cfg!(debug_assertions) { $crate::fatal!($($arg)*); } else { - // TODO: Export invariant metric for alerting tracing::error!(debug_fatal = true, $($arg)*); + let location = concat!(file!(), ':', line!()); + if let Some(metrics) = mysten_metrics::get_metrics() { + metrics.system_invariant_violations.with_label_values(&[location]).inc(); + } } }}; } diff --git a/crates/mysten-metrics/src/lib.rs b/crates/mysten-metrics/src/lib.rs index 3fb40de20573a..8cf2310fce3e0 100644 --- a/crates/mysten-metrics/src/lib.rs +++ b/crates/mysten-metrics/src/lib.rs @@ -15,8 +15,9 @@ use std::time::Instant; use once_cell::sync::OnceCell; use prometheus::{ - register_histogram_with_registry, register_int_gauge_vec_with_registry, Histogram, IntGaugeVec, - Registry, TextEncoder, + register_histogram_with_registry, register_int_counter_vec_with_registry, + register_int_gauge_vec_with_registry, Histogram, IntCounterVec, IntGaugeVec, Registry, + TextEncoder, }; use tap::TapFallible; use tracing::{warn, Span}; @@ -69,6 +70,7 @@ pub struct Metrics { pub scope_duration_ns: IntGaugeVec, pub scope_entrance: IntGaugeVec, pub thread_stall_duration_sec: Histogram, + pub system_invariant_violations: IntCounterVec, } impl Metrics { @@ -143,6 +145,12 @@ impl Metrics { registry, ) .unwrap(), + system_invariant_violations: register_int_counter_vec_with_registry!( + "system_invariant_violations", + "Number of system invariant violations", + &["name"], + registry, + ).unwrap(), } } } diff --git a/crates/sui-core/src/authority.rs b/crates/sui-core/src/authority.rs index 9a7b6c341e524..18f8e9eca6c2f 100644 --- a/crates/sui-core/src/authority.rs +++ b/crates/sui-core/src/authority.rs @@ -68,6 +68,7 @@ use mysten_metrics::{monitored_scope, spawn_monitored_task}; use crate::jsonrpc_index::IndexStore; use crate::jsonrpc_index::{CoinInfo, ObjectIndexChanges}; +use mysten_common::debug_fatal; use once_cell::sync::OnceCell; use shared_crypto::intent::{AppId, Intent, IntentMessage, IntentScope, IntentVersion}; use sui_archival::reader::ArchiveReaderBalancer; @@ -303,8 +304,6 @@ pub struct AuthorityMetrics { /// bytecode verifier metrics for tracking timeouts pub bytecode_verifier_metrics: Arc, - pub authenticator_state_update_failed: IntCounter, - /// Count of zklogin signatures pub zklogin_sig_count: IntCounter, /// Count of multisig signatures @@ -736,12 +735,6 @@ impl AuthorityMetrics { ).unwrap(), limits_metrics: Arc::new(LimitsMetrics::new(registry)), bytecode_verifier_metrics: Arc::new(BytecodeVerifierMetrics::new(registry)), - authenticator_state_update_failed: register_int_counter_with_registry!( - "authenticator_state_update_failed", - "Number of failed authenticator state updates", - registry, - ) - .unwrap(), zklogin_sig_count: register_int_counter_with_registry!( "zklogin_sig_count", "Count of zkLogin signatures", @@ -1511,10 +1504,8 @@ impl AuthorityState { certificate.data().transaction_data().kind() { if let Some(err) = &execution_error_opt { - error!("Authenticator state update failed: {err}"); - self.metrics.authenticator_state_update_failed.inc(); + debug_fatal!("Authenticator state update failed: {:?}", err); } - debug_assert!(execution_error_opt.is_none()); epoch_store.update_authenticator_state(auth_state); // double check that the signature verifier always matches the authenticator state