Skip to content

Commit

Permalink
Add log-only mode for metrics based bad token detection (#3203)
Browse files Browse the repository at this point in the history
# Description
The metrics based bad token detection has not been tested with real data
yet so actively filtering out tokens based on it is pretty risky. If the
logic is too naive or simply has a bug we might filter out a bunch of
tokens which are actually good which could reduce the throughput of the
system drastically.

# Changes
To increase the confidence in the system this PR introduces a log only
mode for the metrics based bad token detection.
In that mode no tokens will be filtered out based on the metrics but we
already get logs indicating this tokens would get filtered out by it.
When we start running this mode in prod we should hopefully see 2
things:
1 good tokens don't get filtered out
2 tokens which currently cause alerts like `Driver Run Error Rate Too
High` should be flagged by the logic

`2` would then indicate that the new logic would reduce the unactionable
alerts if the feature gets enabled fully.

I only added logs when we update the metrics and a token "turns bad".
This should keep the noise in the logs low while still providing all the
necessary information.

## How to test
Deploying this to prod aids in testing the overall feature
  • Loading branch information
MartinquaXD authored Jan 3, 2025
1 parent 1294738 commit fca9a92
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 16 deletions.
28 changes: 23 additions & 5 deletions crates/driver/src/domain/competition/bad_tokens/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@ struct TokenStatistics {
pub struct DetectorBuilder(Arc<DashMap<eth::TokenAddress, TokenStatistics>>);

impl DetectorBuilder {
pub fn build(self, failure_ratio: f64, required_measurements: u32) -> Detector {
pub fn build(self, failure_ratio: f64, required_measurements: u32, log_only: bool) -> Detector {
Detector {
failure_ratio,
required_measurements,
counter: self.0,
log_only,
}
}
}
Expand All @@ -29,15 +30,20 @@ pub struct Detector {
failure_ratio: f64,
required_measurements: u32,
counter: Arc<DashMap<eth::TokenAddress, TokenStatistics>>,
log_only: bool,
}

impl Detector {
pub fn get_quality(&self, token: &eth::TokenAddress) -> Option<Quality> {
let measurements = self.counter.get(token)?;
let is_unsupported = measurements.attempts >= self.required_measurements
&& (measurements.fails as f64 / measurements.attempts as f64) >= self.failure_ratio;
let is_unsupported = self.is_unsupported(&measurements);
(!self.log_only && is_unsupported).then_some(Quality::Unsupported)
}

is_unsupported.then_some(Quality::Unsupported)
fn is_unsupported(&self, measurements: &TokenStatistics) -> bool {
let token_failure_ratio = measurements.fails as f64 / measurements.attempts as f64;
measurements.attempts >= self.required_measurements
&& token_failure_ratio >= self.failure_ratio
}

/// Updates the tokens that participated in settlements by
Expand All @@ -48,11 +54,13 @@ impl Detector {
token_pairs: &[(eth::TokenAddress, eth::TokenAddress)],
failure: bool,
) {
let mut unsupported_tokens = vec![];
token_pairs
.iter()
.flat_map(|(token_a, token_b)| [token_a, token_b])
.for_each(|token| {
self.counter
let measurement = self
.counter
.entry(*token)
.and_modify(|counter| {
counter.attempts += 1;
Expand All @@ -62,6 +70,16 @@ impl Detector {
attempts: 1,
fails: u32::from(failure),
});
if self.is_unsupported(&measurement) {
unsupported_tokens.push(token);
}
});

if !unsupported_tokens.is_empty() {
tracing::debug!(
tokens = ?unsupported_tokens,
"mark tokens as unsupported"
);
}
}
}
20 changes: 9 additions & 11 deletions crates/driver/src/infra/api/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,21 +73,19 @@ impl Api {
let router = routes::reveal(router);
let router = routes::settle(router);

let bad_token_config = solver.bad_token_detection();
let mut bad_tokens =
bad_tokens::Detector::new(solver.bad_token_detection().tokens_supported.clone());
if solver.bad_token_detection().enable_simulation_strategy {
bad_tokens::Detector::new(bad_token_config.tokens_supported.clone());
if bad_token_config.enable_simulation_strategy {
bad_tokens.with_simulation_detector(self.bad_token_detector.clone());
}

if solver.bad_token_detection().enable_metrics_strategy {
bad_tokens.with_metrics_detector(
metrics_bad_token_detector_builder.clone().build(
solver.bad_token_detection().metrics_strategy_failure_ratio,
solver
.bad_token_detection()
.metrics_strategy_required_measurements,
),
);
if bad_token_config.enable_metrics_strategy {
bad_tokens.with_metrics_detector(metrics_bad_token_detector_builder.clone().build(
bad_token_config.metrics_strategy_failure_ratio,
bad_token_config.metrics_strategy_required_measurements,
bad_token_config.metrics_strategy_log_only,
));
}

let router = router.with_state(State(Arc::new(Inner {
Expand Down
1 change: 1 addition & 0 deletions crates/driver/src/infra/config/file/load.rs
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ pub async fn load(chain: Chain, path: &Path) -> infra::Config {
metrics_strategy_required_measurements: config
.bad_token_detection
.metrics_strategy_required_measurements,
metrics_strategy_log_only: config.bad_token_detection.metrics_strategy_log_only,
},
settle_queue_size: config.settle_queue_size,
}
Expand Down
12 changes: 12 additions & 0 deletions crates/driver/src/infra/config/file/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -708,6 +708,14 @@ pub struct BadTokenDetectionConfig {
rename = "metrics-bad-token-detection-required-measurements"
)]
pub metrics_strategy_required_measurements: u32,

/// Controls whether the metrics based detection strategy should only log
/// unsupported tokens or actually filter them out.
#[serde(
default = "default_metrics_bad_token_detector_log_only",
rename = "metrics-bad-token-detection-log-only"
)]
pub metrics_strategy_log_only: bool,
}

impl Default for BadTokenDetectionConfig {
Expand All @@ -730,3 +738,7 @@ fn default_metrics_bad_token_detector_required_measurements() -> u32 {
fn default_settle_queue_size() -> usize {
2
}

fn default_metrics_bad_token_detector_log_only() -> bool {
true
}
1 change: 1 addition & 0 deletions crates/driver/src/infra/solver/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -316,4 +316,5 @@ pub struct BadTokenDetection {
pub enable_metrics_strategy: bool,
pub metrics_strategy_failure_ratio: f64,
pub metrics_strategy_required_measurements: u32,
pub metrics_strategy_log_only: bool,
}

0 comments on commit fca9a92

Please sign in to comment.