Skip to content

Commit

Permalink
Merge branch 'main' into SearchPolish
Browse files Browse the repository at this point in the history
  • Loading branch information
Duncid authored Oct 3, 2023
2 parents 787c5ee + 9211f98 commit 085409e
Show file tree
Hide file tree
Showing 29 changed files with 892 additions and 147 deletions.
70 changes: 57 additions & 13 deletions connectors/src/connectors/slack/bot.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import {
AgentActionType,
AgentGenerationSuccessEvent,
AgentMessageType,
DustAPI,
RetrievalDocumentType,
} from "@connectors/lib/dust_api";
import {
Connector,
Expand Down Expand Up @@ -161,7 +163,7 @@ async function botAnswerMessage(
});
const mainMessage = await slackClient.chat.postMessage({
channel: slackChannel,
text: "_I am thinking..._",
text: "_Thinking..._",
thread_ts: slackMessageTs,
mrkdwn: true,
});
Expand Down Expand Up @@ -238,6 +240,7 @@ async function botAnswerMessage(
}

let fullAnswer = "";
let action: AgentActionType | null = null;
let lastSentDate = new Date();
for await (const event of streamRes.value.eventStream) {
switch (event.type) {
Expand All @@ -258,24 +261,31 @@ async function botAnswerMessage(
}
case "generation_tokens": {
fullAnswer += event.text;
if (lastSentDate.getTime() + 1000 > new Date().getTime()) {
if (lastSentDate.getTime() + 1500 > new Date().getTime()) {
continue;
}
lastSentDate = new Date();

let finalAnswer = _processCiteMention(fullAnswer, action);
finalAnswer += `...\n\n <${DUST_API}/w/${connector.workspaceId}/assistant/${conversation.sId}|Continue this conversation on Dust>`;

await slackClient.chat.update({
channel: slackChannel,
text: fullAnswer,
text: finalAnswer,
ts: mainMessage.ts as string,
thread_ts: slackMessageTs,
});
break;
}
case "agent_action_success": {
action = event.action;
break;
}
case "agent_generation_success": {
const finalAnswer = `${_removeCiteMention(
event.text
)}\n\n <${DUST_API}/w/${connector.workspaceId}/assistant/${
conversation.sId
}|Continue this conversation on Dust>`;
fullAnswer = event.text;

let finalAnswer = _processCiteMention(fullAnswer, action);
finalAnswer += `\n\n <${DUST_API}/w/${connector.workspaceId}/assistant/${conversation.sId}|Continue this conversation on Dust>`;

await slackClient.chat.update({
channel: slackChannel,
Expand All @@ -293,11 +303,45 @@ async function botAnswerMessage(
return new Err(new Error("Failed to get the final answer from Dust"));
}

/*
* Temp > until I have a PR to properly handle mentions
*/
function _removeCiteMention(message: string) {
const regex = /:cite\[[a-zA-Z0-9,]+\]/g;
/**
 * Replaces `:cite[...]` mentions in `content` with numbered Slack mrkdwn links
 * to the retrieved documents referenced by `action`.
 *
 * Each distinct reference key is assigned an incrementing counter on first use
 * and rendered as `[<sourceUrl|n>]`. Keys with no matching document, or whose
 * document has no `sourceUrl`, are dropped from the output. When the action
 * yields no retrieval documents at all, every cite mention is simply stripped
 * via `_removeCiteMention`.
 */
function _processCiteMention(
  content: string,
  action: AgentActionType | null
): string {
  const references: { [key: string]: RetrievalDocumentType } = {};

  if (action && action.type === "retrieval_action" && action.documents) {
    action.documents.forEach((d) => {
      references[d.reference] = d;
    });
  }

  // An object literal is always truthy, so guard on whether any references
  // were actually collected — otherwise the fallback below is dead code.
  if (Object.keys(references).length > 0) {
    let counter = 0;
    const refCounter: { [key: string]: number } = {};
    return content.replace(/:cite\[[a-zA-Z0-9, ]+\]/g, (match) => {
      // Slice off ":cite[" and the trailing "]", then split keys by comma.
      const keys = match.slice(6, -1).split(",");
      return keys
        .map((key) => {
          const k = key.trim();
          const ref = references[k];
          if (ref && ref.sourceUrl) {
            if (!refCounter[k]) {
              counter++;
              refCounter[k] = counter;
            }
            return `[<${ref.sourceUrl}|${refCounter[k]}>]`;
          }
          return "";
        })
        .join("");
    });
  }

  return _removeCiteMention(content);
}

/**
 * Strips every `:cite[...]` mention (comma-separated alphanumeric keys) from
 * the message, leaving the surrounding text untouched.
 */
function _removeCiteMention(message: string): string {
  return message.replace(/:cite\[[a-zA-Z0-9, ]+\]/g, "");
}

Expand Down
1 change: 0 additions & 1 deletion connectors/src/lib/dust_api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,6 @@ export type AgentMessageType = {
visibility: MessageVisibility;
version: number;
parentMessageId: string | null;

// configuration: AgentConfigurationType;
status: AgentMessageStatus;
action: AgentActionType | null;
Expand Down
6 changes: 3 additions & 3 deletions core/bin/dust_api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ async fn run_helper(
None => Err(error_response(
StatusCode::BAD_REQUEST,
"missing_specification_error",
"No specification provided, either `specification` or
"No specification provided, either `specification` or
`specification_hash` must be provided",
None,
))?,
Expand Down Expand Up @@ -1610,8 +1610,8 @@ struct TokenizePayload {
async fn tokenize(
extract::Json(payload): extract::Json<TokenizePayload>,
) -> (StatusCode, Json<APIResponse>) {
let embedder = provider(payload.provider_id).embedder(payload.model_id);
match embedder.tokenize(payload.text).await {
let embedder = provider(payload.provider_id).llm(payload.model_id);
match embedder.tokenize(&payload.text).await {
Err(e) => error_response(
StatusCode::INTERNAL_SERVER_ERROR,
"internal_server_error",
Expand Down
1 change: 1 addition & 0 deletions core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ pub mod providers {
pub mod tiktoken;
}
pub mod anthropic;
pub mod textsynth;
}
pub mod http {
pub mod request;
Expand Down
8 changes: 4 additions & 4 deletions core/src/providers/ai21.rs
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,10 @@ impl LLM for AI21LLM {
Err(anyhow!("Encode/Decode not implemented for provider `ai21`"))
}

/// Tokenization is not implemented for the `ai21` provider; always returns an error.
async fn tokenize(&self, _text: &str) -> Result<Vec<(usize, String)>> {
Err(anyhow!("Tokenize not implemented for provider `ai21`"))
}

async fn generate(
&self,
prompt: &str,
Expand Down Expand Up @@ -385,10 +389,6 @@ impl Embedder for AI21Embedder {
Err(anyhow!("Encode/Decode not implemented for provider `ai21`"))
}

async fn tokenize(&self, _text: String) -> Result<Vec<(usize, String)>> {
Err(anyhow!("Tokenize not implemented for provider `ai21`"))
}

async fn embed(&self, _text: Vec<&str>, _extras: Option<Value>) -> Result<Vec<EmbedderVector>> {
Err(anyhow!("Embeddings not available for provider `ai21`"))
}
Expand Down
8 changes: 4 additions & 4 deletions core/src/providers/anthropic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -572,6 +572,10 @@ impl LLM for AnthropicLLM {
decode_async(anthropic_base_singleton(), tokens).await
}

/// Tokenizes `text` into (token id, token string) pairs using the shared
/// Anthropic base tokenizer singleton, delegating to `tokenize_async`.
async fn tokenize(&self, text: &str) -> Result<Vec<(usize, String)>> {
tokenize_async(anthropic_base_singleton(), text).await
}

async fn chat(
&self,
messages: &Vec<ChatMessage>,
Expand Down Expand Up @@ -665,10 +669,6 @@ impl Embedder for AnthropicEmbedder {
decode_async(anthropic_base_singleton(), tokens).await
}

async fn tokenize(&self, text: String) -> Result<Vec<(usize, String)>> {
tokenize_async(anthropic_base_singleton(), text).await
}

async fn embed(&self, _text: Vec<&str>, _extras: Option<Value>) -> Result<Vec<EmbedderVector>> {
Err(anyhow!("Embeddings not available for provider `anthropic`"))
}
Expand Down
15 changes: 7 additions & 8 deletions core/src/providers/azure_openai.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use super::tiktoken::tiktoken::{decode_async, encode_async, tokenize_async};
use crate::providers::embedder::{Embedder, EmbedderVector};
use crate::providers::llm::Tokens;
use crate::providers::llm::{ChatMessage, LLMChatGeneration, LLMGeneration, LLM};
Expand Down Expand Up @@ -265,13 +266,15 @@ impl LLM for AzureOpenAILLM {
}

async fn encode(&self, text: &str) -> Result<Vec<usize>> {
let tokens = { self.tokenizer().lock().encode_with_special_tokens(text) };
Ok(tokens)
encode_async(self.tokenizer(), text).await
}

async fn decode(&self, tokens: Vec<usize>) -> Result<String> {
let str = { self.tokenizer().lock().decode(tokens)? };
Ok(str)
decode_async(self.tokenizer(), tokens).await
}

/// Tokenizes `text` into (token id, token string) pairs with this model's
/// tokenizer, delegating to `tokenize_async` (consistent with `encode`/`decode` above).
async fn tokenize(&self, text: &str) -> Result<Vec<(usize, String)>> {
tokenize_async(self.tokenizer(), text).await
}

async fn generate(
Expand Down Expand Up @@ -600,10 +603,6 @@ impl Embedder for AzureOpenAIEmbedder {
Ok(str)
}

async fn tokenize(&self, _text: String) -> Result<Vec<(usize, String)>> {
Err(anyhow!("Tokenize not implemented for provider `anthropic`"))
}

async fn embed(&self, text: Vec<&str>, extras: Option<Value>) -> Result<Vec<EmbedderVector>> {
let e = embed(
self.uri()?,
Expand Down
14 changes: 10 additions & 4 deletions core/src/providers/cohere.rs
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,16 @@ impl LLM for CohereLLM {
api_decode(self.api_key.as_ref().unwrap(), tokens).await
}

// Cohere's encode API only yields token ids, so each id is paired with an
// empty string to partially support the tokenize endpoint.
async fn tokenize(&self, text: &str) -> Result<Vec<(usize, String)>> {
    assert!(self.api_key.is_some());
    let ids = api_encode(self.api_key.as_ref().unwrap(), text).await?;
    Ok(ids.into_iter().map(|id| (id, String::new())).collect())
}

async fn generate(
&self,
prompt: &str,
Expand Down Expand Up @@ -534,10 +544,6 @@ impl Embedder for CohereEmbedder {
api_decode(self.api_key.as_ref().unwrap(), tokens).await
}

async fn tokenize(&self, _text: String) -> Result<Vec<(usize, String)>> {
Err(anyhow!("Tokenize not implemented for provider `Cohere`"))
}

async fn embed(&self, text: Vec<&str>, _extras: Option<Value>) -> Result<Vec<EmbedderVector>> {
assert!(self.api_key.is_some());

Expand Down
2 changes: 0 additions & 2 deletions core/src/providers/embedder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,6 @@ pub trait Embedder {
async fn encode(&self, text: &str) -> Result<Vec<usize>>;
async fn decode(&self, tokens: Vec<usize>) -> Result<String>;

async fn tokenize(&self, text: String) -> Result<Vec<(usize, String)>>;

async fn embed(&self, text: Vec<&str>, extras: Option<Value>) -> Result<Vec<EmbedderVector>>;
}

Expand Down
1 change: 1 addition & 0 deletions core/src/providers/llm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ pub trait LLM {

async fn encode(&self, text: &str) -> Result<Vec<usize>>;
async fn decode(&self, tokens: Vec<usize>) -> Result<String>;
async fn tokenize(&self, text: &str) -> Result<Vec<(usize, String)>>;

async fn generate(
&self,
Expand Down
8 changes: 4 additions & 4 deletions core/src/providers/openai.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1160,6 +1160,10 @@ impl LLM for OpenAILLM {
decode_async(self.tokenizer(), tokens).await
}

/// Tokenizes `text` into (token id, token string) pairs with this model's
/// tokenizer, delegating to `tokenize_async`.
async fn tokenize(&self, text: &str) -> Result<Vec<(usize, String)>> {
tokenize_async(self.tokenizer(), text).await
}

async fn generate(
&self,
prompt: &str,
Expand Down Expand Up @@ -1575,10 +1579,6 @@ impl Embedder for OpenAIEmbedder {
decode_async(self.tokenizer(), tokens).await
}

async fn tokenize(&self, text: String) -> Result<Vec<(usize, String)>> {
tokenize_async(self.tokenizer(), text).await
}

async fn embed(&self, text: Vec<&str>, extras: Option<Value>) -> Result<Vec<EmbedderVector>> {
let e = embed(
self.uri()?,
Expand Down
6 changes: 6 additions & 0 deletions core/src/providers/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ use serde::{Deserialize, Serialize};
use std::str::FromStr;
use std::time::Duration;

use super::textsynth::TextSynthProvider;

#[derive(Debug, Clone, Copy, Serialize, PartialEq, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ProviderID {
Expand All @@ -22,6 +24,7 @@ pub enum ProviderID {
#[serde(rename = "azure_openai")]
AzureOpenAI,
Anthropic,
TextSynth,
}

impl ToString for ProviderID {
Expand All @@ -32,6 +35,7 @@ impl ToString for ProviderID {
ProviderID::AI21 => String::from("ai21"),
ProviderID::AzureOpenAI => String::from("azure_openai"),
ProviderID::Anthropic => String::from("anthropic"),
ProviderID::TextSynth => String::from("textsynth"),
}
}
}
Expand All @@ -45,6 +49,7 @@ impl FromStr for ProviderID {
"ai21" => Ok(ProviderID::AI21),
"azure_openai" => Ok(ProviderID::AzureOpenAI),
"anthropic" => Ok(ProviderID::Anthropic),
"textsynth" => Ok(ProviderID::TextSynth),
_ => Err(ParseError::with_message(
"Unknown provider ID (possible values: openai, cohere, ai21, azure_openai)",
))?,
Expand Down Expand Up @@ -139,5 +144,6 @@ pub fn provider(t: ProviderID) -> Box<dyn Provider + Sync + Send> {
ProviderID::AI21 => Box::new(AI21Provider::new()),
ProviderID::AzureOpenAI => Box::new(AzureOpenAIProvider::new()),
ProviderID::Anthropic => Box::new(AnthropicProvider::new()),
ProviderID::TextSynth => Box::new(TextSynthProvider::new()),
}
}
Loading

0 comments on commit 085409e

Please sign in to comment.