Skip to content

Commit

Permalink
Dust Apps: Improve resilience (#4552)
Browse files Browse the repository at this point in the history
* Always retry model errors at least once

* Bump data source retrieval max top-k to 1024

* Bump parallelism of model execution in dust apps maps
  • Loading branch information
spolu authored Apr 4, 2024
1 parent 3cb6a54 commit 43e726f
Show file tree
Hide file tree
Showing 7 changed files with 95 additions and 39 deletions.
2 changes: 1 addition & 1 deletion core/src/data_sources/data_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1310,7 +1310,7 @@ impl DataSource {
Ok(document)
}

const MAX_TOP_K_SEARCH: usize = 128;
const MAX_TOP_K_SEARCH: usize = 1024;

pub async fn search(
&self,
Expand Down
10 changes: 7 additions & 3 deletions core/src/providers/ai21.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,17 +131,21 @@ impl AI21LLM {
Err(ModelError {
message: format!("Ai21APIError: {}", error.detail),
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(2000),
sleep: Duration::from_millis(500),
factor: 2,
retries: 8,
retries: 1,
}),
})
}
_ => {
let error: Error = serde_json::from_slice(c)?;
Err(ModelError {
message: format!("Ai21APIError: {}", error.detail),
retryable: None,
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(500),
factor: 1,
retries: 1,
}),
})
}
}?;
Expand Down
20 changes: 16 additions & 4 deletions core/src/providers/anthropic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::providers::embedder::{Embedder, EmbedderVector};
use crate::providers::llm::{
ChatMessage, ChatMessageRole, LLMChatGeneration, LLMGeneration, Tokens, LLM,
};
use crate::providers::provider::{ModelError, Provider, ProviderID};
use crate::providers::provider::{ModelError, ModelErrorRetryOptions, Provider, ProviderID};
use crate::providers::tiktoken::tiktoken::anthropic_base_singleton;
use crate::run::Credentials;
use crate::utils;
Expand Down Expand Up @@ -280,7 +280,11 @@ impl AnthropicLLM {
let error: Error = serde_json::from_slice(c)?;
Err(ModelError {
message: format!("Anthropic API Error: {}", error.to_string()),
retryable: None,
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(500),
factor: 1,
retries: 1,
}),
})
}
}?;
Expand Down Expand Up @@ -537,7 +541,11 @@ impl AnthropicLLM {
"Anthropic API Error: {}",
event.error.to_string()
),
retryable: None,
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(500),
factor: 1,
retries: 1,
}),
})?;
break 'stream;
}
Expand Down Expand Up @@ -750,7 +758,11 @@ impl AnthropicLLM {
let error: Error = serde_json::from_slice(c)?;
Err(ModelError {
message: format!("Anthropic API Error: {}", error.to_string()),
retryable: None,
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(500),
factor: 1,
retries: 1,
}),
})
}
}?;
Expand Down
34 changes: 23 additions & 11 deletions core/src/providers/cohere.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,21 @@ async fn api_encode(api_key: &str, text: &str) -> Result<Vec<usize>> {
Err(ModelError {
message: format!("CohereAPIError: {}", error.message),
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(2000),
sleep: Duration::from_millis(500),
factor: 2,
retries: 8,
retries: 3,
}),
})
}
_ => {
let error: Error = serde_json::from_slice(c)?;
Err(ModelError {
message: format!("CohereAPIError: {}", error.message),
retryable: None,
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(500),
factor: 1,
retries: 1,
}),
})
}
}?;
Expand Down Expand Up @@ -105,17 +109,21 @@ async fn api_decode(api_key: &str, tokens: Vec<usize>) -> Result<String> {
Err(ModelError {
message: format!("CohereAPIError: {}", error.message),
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(2000),
sleep: Duration::from_millis(500),
factor: 2,
retries: 8,
retries: 3,
}),
})
}
_ => {
let error: Error = serde_json::from_slice(c)?;
Err(ModelError {
message: format!("CohereAPIError: {}", error.message),
retryable: None,
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(500),
factor: 1,
retries: 1,
}),
})
}
}?;
Expand Down Expand Up @@ -226,9 +234,9 @@ impl CohereLLM {
Err(ModelError {
message: format!("CohereAPIError: {}", error.message),
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(2000),
sleep: Duration::from_millis(500),
factor: 2,
retries: 8,
retries: 3,
}),
})
}
Expand Down Expand Up @@ -450,17 +458,21 @@ impl CohereEmbedder {
Err(ModelError {
message: format!("CohereAPIError: {}", error.message),
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(2000),
sleep: Duration::from_millis(500),
factor: 2,
retries: 8,
retries: 3,
}),
})
}
_ => {
let error: Error = serde_json::from_slice(c)?;
Err(ModelError {
message: format!("CohereAPIError: {}", error.message),
retryable: None,
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(500),
factor: 1,
retries: 1,
}),
})
}
}?;
Expand Down
18 changes: 13 additions & 5 deletions core/src/providers/mistral.rs
Original file line number Diff line number Diff line change
Expand Up @@ -490,14 +490,18 @@ impl MistralAILLM {
true => Err(ModelError {
message: error.message(),
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(100),
sleep: Duration::from_millis(500),
factor: 2,
retries: 3,
}),
})?,
false => Err(ModelError {
message: error.message(),
retryable: None,
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(500),
factor: 1,
retries: 1,
}),
})?,
}
break 'stream;
Expand Down Expand Up @@ -710,14 +714,18 @@ impl MistralAILLM {
true => Err(ModelError {
message: error.message(),
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(2000),
sleep: Duration::from_millis(500),
factor: 2,
retries: 8,
retries: 3,
}),
}),
false => Err(ModelError {
message: error.message(),
retryable: None,
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(500),
factor: 1,
retries: 1,
}),
}),
}
}
Expand Down
46 changes: 33 additions & 13 deletions core/src/providers/openai.rs
Original file line number Diff line number Diff line change
Expand Up @@ -273,14 +273,18 @@ pub async fn streamed_completion(
true => Err(ModelError {
message: error.message(),
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(100),
sleep: Duration::from_millis(500),
factor: 2,
retries: 3,
}),
})?,
false => Err(ModelError {
message: error.message(),
retryable: None,
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(500),
factor: 1,
retries: 1,
}),
})?,
}
break 'stream;
Expand Down Expand Up @@ -500,14 +504,18 @@ pub async fn completion(
true => Err(ModelError {
message: error.message(),
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(2000),
sleep: Duration::from_millis(500),
factor: 2,
retries: 8,
retries: 3,
}),
}),
false => Err(ModelError {
message: error.message(),
retryable: None,
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(500),
factor: 1,
retries: 1,
}),
}),
}
}
Expand Down Expand Up @@ -651,14 +659,18 @@ pub async fn streamed_chat_completion(
true => Err(ModelError {
message: error.message(),
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(100),
sleep: Duration::from_millis(500),
factor: 2,
retries: 3,
}),
})?,
false => Err(ModelError {
message: error.message(),
retryable: None,
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(500),
factor: 1,
retries: 1,
}),
})?,
}
break 'stream;
Expand Down Expand Up @@ -982,14 +994,18 @@ pub async fn chat_completion(
true => Err(ModelError {
message: error.message(),
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(2000),
sleep: Duration::from_millis(500),
factor: 2,
retries: 8,
retries: 3,
}),
}),
false => Err(ModelError {
message: error.message(),
retryable: None,
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(500),
factor: 1,
retries: 1,
}),
}),
}
}
Expand Down Expand Up @@ -1078,14 +1094,18 @@ pub async fn embed(
true => Err(ModelError {
message: error.message(),
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(2000),
sleep: Duration::from_millis(500),
factor: 2,
retries: 8,
retries: 3,
}),
}),
false => Err(ModelError {
message: error.message(),
retryable: None,
retryable: Some(ModelErrorRetryOptions {
sleep: Duration::from_millis(500),
factor: 1,
retries: 1,
}),
}),
}
}
Expand Down
4 changes: 2 additions & 2 deletions core/src/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ impl RunConfig {
BlockType::Data => 64,
BlockType::DataSource => 8,
BlockType::Code => 64,
BlockType::LLM => 8,
BlockType::Chat => 8,
BlockType::LLM => 32,
BlockType::Chat => 32,
BlockType::Map => 64,
BlockType::Reduce => 64,
BlockType::Search => 8,
Expand Down

0 comments on commit 43e726f

Please sign in to comment.