Add log-only mode for metrics based bad token detection (#3203)

# Description The metrics based bad token detection has not been tested with real data yet so actively filtering out tokens based on it is pretty risky. If the logic is too naive or simply has a bug we might filter out a bunch of tokens which are actually good which could reduce the throughput of the system drastically. # Changes To increase the confidence in the system this PR introduces a log only mode for the metrics based bad token detection. In that mode no tokens will be filtered out based on the metrics but we already get logs indicating this tokens would get filtered out by it. When we start running this mode in prod we should hopefully see 2 things: 1 good tokens don't get filtered out 2 tokens which currently cause alerts like `Driver Run Error Rate Too High` should be flagged by the logic `2` would then indicate that the new logic would reduce the unactionable alerts if the feature gets enabled fully. I only added logs when we update the metrics and a token "turns bad". This should keep the noise in the logs low while still providing all the necessary information. ## How to test Deploying this to prod aids in testing the overall feature
cowprotocol · Jan 3, 2025 · fca9a92 · fca9a92
1 parent 1294738
commit fca9a92
Show file tree

Hide file tree

Showing 5 changed files with 46 additions and 16 deletions.
diff --git a/crates/driver/src/domain/competition/bad_tokens/metrics.rs b/crates/driver/src/domain/competition/bad_tokens/metrics.rs
@@ -10,11 +10,12 @@ struct TokenStatistics {
 pub struct DetectorBuilder(Arc<DashMap<eth::TokenAddress, TokenStatistics>>);
 
 impl DetectorBuilder {
-    pub fn build(self, failure_ratio: f64, required_measurements: u32) -> Detector {
+    pub fn build(self, failure_ratio: f64, required_measurements: u32, log_only: bool) -> Detector {
         Detector {
             failure_ratio,
             required_measurements,
             counter: self.0,
+            log_only,
         }
     }
 }
@@ -29,15 +30,20 @@ pub struct Detector {
     failure_ratio: f64,
     required_measurements: u32,
     counter: Arc<DashMap<eth::TokenAddress, TokenStatistics>>,
+    log_only: bool,
 }
 
 impl Detector {
     pub fn get_quality(&self, token: &eth::TokenAddress) -> Option<Quality> {
         let measurements = self.counter.get(token)?;
-        let is_unsupported = measurements.attempts >= self.required_measurements
-            && (measurements.fails as f64 / measurements.attempts as f64) >= self.failure_ratio;
+        let is_unsupported = self.is_unsupported(&measurements);
+        (!self.log_only && is_unsupported).then_some(Quality::Unsupported)
+    }
 
-        is_unsupported.then_some(Quality::Unsupported)
+    fn is_unsupported(&self, measurements: &TokenStatistics) -> bool {
+        let token_failure_ratio = measurements.fails as f64 / measurements.attempts as f64;
+        measurements.attempts >= self.required_measurements
+            && token_failure_ratio >= self.failure_ratio
     }
 
     /// Updates the tokens that participated in settlements by
@@ -48,11 +54,13 @@ impl Detector {
         token_pairs: &[(eth::TokenAddress, eth::TokenAddress)],
         failure: bool,
     ) {
+        let mut unsupported_tokens = vec![];
         token_pairs
             .iter()
             .flat_map(|(token_a, token_b)| [token_a, token_b])
             .for_each(|token| {
-                self.counter
+                let measurement = self
+                    .counter
                     .entry(*token)
                     .and_modify(|counter| {
                         counter.attempts += 1;
@@ -62,6 +70,16 @@ impl Detector {
                         attempts: 1,
                         fails: u32::from(failure),
                     });
+                if self.is_unsupported(&measurement) {
+                    unsupported_tokens.push(token);
+                }
             });
+
+        if !unsupported_tokens.is_empty() {
+            tracing::debug!(
+                tokens = ?unsupported_tokens,
+                "mark tokens as unsupported"
+            );
+        }
     }
 }
diff --git a/crates/driver/src/infra/api/mod.rs b/crates/driver/src/infra/api/mod.rs
@@ -73,21 +73,19 @@ impl Api {
             let router = routes::reveal(router);
             let router = routes::settle(router);
 
+            let bad_token_config = solver.bad_token_detection();
             let mut bad_tokens =
-                bad_tokens::Detector::new(solver.bad_token_detection().tokens_supported.clone());
-            if solver.bad_token_detection().enable_simulation_strategy {
+                bad_tokens::Detector::new(bad_token_config.tokens_supported.clone());
+            if bad_token_config.enable_simulation_strategy {
                 bad_tokens.with_simulation_detector(self.bad_token_detector.clone());
             }
 
-            if solver.bad_token_detection().enable_metrics_strategy {
-                bad_tokens.with_metrics_detector(
-                    metrics_bad_token_detector_builder.clone().build(
-                        solver.bad_token_detection().metrics_strategy_failure_ratio,
-                        solver
-                            .bad_token_detection()
-                            .metrics_strategy_required_measurements,
-                    ),
-                );
+            if bad_token_config.enable_metrics_strategy {
+                bad_tokens.with_metrics_detector(metrics_bad_token_detector_builder.clone().build(
+                    bad_token_config.metrics_strategy_failure_ratio,
+                    bad_token_config.metrics_strategy_required_measurements,
+                    bad_token_config.metrics_strategy_log_only,
+                ));
             }
 
             let router = router.with_state(State(Arc::new(Inner {

diff --git a/crates/driver/src/infra/config/file/load.rs b/crates/driver/src/infra/config/file/load.rs
@@ -123,6 +123,7 @@ pub async fn load(chain: Chain, path: &Path) -> infra::Config {
                     metrics_strategy_required_measurements: config
                         .bad_token_detection
                         .metrics_strategy_required_measurements,
+                    metrics_strategy_log_only: config.bad_token_detection.metrics_strategy_log_only,
                 },
                 settle_queue_size: config.settle_queue_size,
             }

diff --git a/crates/driver/src/infra/config/file/mod.rs b/crates/driver/src/infra/config/file/mod.rs
@@ -708,6 +708,14 @@ pub struct BadTokenDetectionConfig {
         rename = "metrics-bad-token-detection-required-measurements"
     )]
     pub metrics_strategy_required_measurements: u32,
+
+    /// Controls whether the metrics based detection strategy should only log
+    /// unsupported tokens or actually filter them out.
+    #[serde(
+        default = "default_metrics_bad_token_detector_log_only",
+        rename = "metrics-bad-token-detection-log-only"
+    )]
+    pub metrics_strategy_log_only: bool,
 }
 
 impl Default for BadTokenDetectionConfig {
@@ -730,3 +738,7 @@ fn default_metrics_bad_token_detector_required_measurements() -> u32 {
 fn default_settle_queue_size() -> usize {
     2
 }
+
+fn default_metrics_bad_token_detector_log_only() -> bool {
+    true
+}
diff --git a/crates/driver/src/infra/solver/mod.rs b/crates/driver/src/infra/solver/mod.rs
@@ -316,4 +316,5 @@ pub struct BadTokenDetection {
     pub enable_metrics_strategy: bool,
     pub metrics_strategy_failure_ratio: f64,
     pub metrics_strategy_required_measurements: u32,
+    pub metrics_strategy_log_only: bool,
 }