Extract big docs v2 (#1241)
* Extract: Handling big docs v2

* WIP

* WIP

* Max tokens 6000

* Fix comments

* Move function to util and add basic test

* Apply feedback

* Rework format from tokenize

* Fix bad rebase

* rename function

* Add comment
PopDaph authored Sep 4, 2023
1 parent 33915f0 commit 6408917
Showing 13 changed files with 324 additions and 40 deletions.
2 changes: 1 addition & 1 deletion core/bin/dust_api.rs
@@ -1611,7 +1611,7 @@ async fn tokenize(
extract::Json(payload): extract::Json<TokenizePayload>,
) -> (StatusCode, Json<APIResponse>) {
let embedder = provider(payload.provider_id).embedder(payload.model_id);
match embedder.encode(&payload.text).await {
match embedder.tokenize(payload.text).await {
Err(e) => error_response(
StatusCode::INTERNAL_SERVER_ERROR,
"internal_server_error",
4 changes: 4 additions & 0 deletions core/src/providers/ai21.rs
@@ -385,6 +385,10 @@ impl Embedder for AI21Embedder {
Err(anyhow!("Encode/Decode not implemented for provider `ai21`"))
}

async fn tokenize(&self, _text: String) -> Result<Vec<(usize, String)>> {
Err(anyhow!("Tokenize not implemented for provider `ai21`"))
}

async fn embed(&self, _text: Vec<&str>, _extras: Option<Value>) -> Result<Vec<EmbedderVector>> {
Err(anyhow!("Embeddings not available for provider `ai21`"))
}
4 changes: 4 additions & 0 deletions core/src/providers/anthropic.rs
@@ -631,6 +631,10 @@ impl Embedder for AnthropicEmbedder {
))
}

async fn tokenize(&self, _text: String) -> Result<Vec<(usize, String)>> {
Err(anyhow!("Tokenize not implemented for provider `anthropic`"))
}

async fn embed(&self, _text: Vec<&str>, _extras: Option<Value>) -> Result<Vec<EmbedderVector>> {
Err(anyhow!("Embeddings not available for provider `anthropic`"))
}
4 changes: 4 additions & 0 deletions core/src/providers/azure_openai.rs
@@ -600,6 +600,10 @@ impl Embedder for AzureOpenAIEmbedder {
Ok(str)
}

async fn tokenize(&self, _text: String) -> Result<Vec<(usize, String)>> {
Err(anyhow!("Tokenize not implemented for provider `anthropic`"))
}

async fn embed(&self, text: Vec<&str>, extras: Option<Value>) -> Result<Vec<EmbedderVector>> {
let e = embed(
self.uri()?,
4 changes: 4 additions & 0 deletions core/src/providers/cohere.rs
@@ -534,6 +534,10 @@ impl Embedder for CohereEmbedder {
api_decode(self.api_key.as_ref().unwrap(), tokens).await
}

async fn tokenize(&self, _text: String) -> Result<Vec<(usize, String)>> {
Err(anyhow!("Tokenize not implemented for provider `Cohere`"))
}

async fn embed(&self, text: Vec<&str>, _extras: Option<Value>) -> Result<Vec<EmbedderVector>> {
assert!(self.api_key.is_some());

2 changes: 2 additions & 0 deletions core/src/providers/embedder.rs
@@ -26,6 +26,8 @@ pub trait Embedder {
async fn encode(&self, text: &str) -> Result<Vec<usize>>;
async fn decode(&self, tokens: Vec<usize>) -> Result<String>;

async fn tokenize(&self, text: String) -> Result<Vec<(usize, String)>>;

async fn embed(&self, text: Vec<&str>, extras: Option<Value>) -> Result<Vec<EmbedderVector>>;
}

6 changes: 5 additions & 1 deletion core/src/providers/openai.rs
@@ -28,7 +28,7 @@ use tokio::sync::mpsc::UnboundedSender;
use tokio::time::timeout;

use super::llm::{ChatFunction, ChatFunctionCall};
use super::tiktoken::tiktoken::{decode_async, encode_async};
use super::tiktoken::tiktoken::{decode_async, encode_async, tokenize_async};

#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Usage {
@@ -1575,6 +1575,10 @@ impl Embedder for OpenAIEmbedder {
decode_async(self.tokenizer(), tokens).await
}

async fn tokenize(&self, text: String) -> Result<Vec<(usize, String)>> {
tokenize_async(self.tokenizer(), text).await
}

async fn embed(&self, text: Vec<&str>, extras: Option<Value>) -> Result<Vec<EmbedderVector>> {
let e = embed(
self.uri()?,
55 changes: 55 additions & 0 deletions core/src/providers/tiktoken/tiktoken.rs
@@ -112,6 +112,14 @@ pub async fn encode_async(bpe: Arc<Mutex<CoreBPE>>, text: &str) -> Result<Vec<us
Ok(r)
}

pub async fn tokenize_async(
bpe: Arc<Mutex<CoreBPE>>,
text: String,
) -> Result<Vec<(usize, String)>> {
let r = task::spawn_blocking(move || bpe.lock().tokenize(&text)).await?;
Ok(r)
}

fn _byte_pair_merge(piece: &[u8], ranks: &HashMap<Vec<u8>, usize>) -> Vec<std::ops::Range<usize>> {
let mut parts: Vec<_> = (0..piece.len()).map(|i| i..i + 1).collect();

@@ -241,6 +249,49 @@ impl CoreBPE {
ret
}

fn _tokenize(&self, text: &String) -> Vec<(usize, String)> {
let regex = self._get_regex();
let mut results = vec![];

for mat in regex.find_iter(text) {
let string = mat.unwrap().as_str();
let piece = string.as_bytes();
if let Some(token) = self.encoder.get(piece) {
results.push((*token, string.to_string()));
continue;
}

results.extend(Self::_tokenize_byte_pair_encode(piece, &self.encoder));
}
results
}

/**
* Implemented to match the logic in _encode_ordinary_native
* Used in tokenize function
*/
pub fn _tokenize_byte_pair_encode(
piece: &[u8],
ranks: &HashMap<Vec<u8>, usize>,
) -> Vec<(usize, String)> {
if piece.len() == 1 {
let string = std::str::from_utf8(&piece).unwrap();
return vec![(ranks[piece], string.to_string())];
}

_byte_pair_merge(piece, ranks)
.iter()
.map(|p| {
(
ranks[&piece[p.start..p.end]],
std::str::from_utf8(&piece[p.start..p.end])
.unwrap()
.to_string(),
)
})
.collect()
}

fn _encode_native(&self, text: &str, allowed_special: &HashSet<&str>) -> (Vec<usize>, usize) {
let special_regex = self._get_special_regex();
let regex = self._get_regex();
@@ -506,6 +557,10 @@ impl CoreBPE {
self._encode_native(text, &allowed_special).0
}

pub fn tokenize(&self, text: &String) -> Vec<(usize, String)> {
self._tokenize(text)
}

pub fn encode_with_special_tokens(&self, text: &str) -> Vec<usize> {
let allowed_special = self
.special_tokens_encoder
4 changes: 3 additions & 1 deletion front/lib/core_api.ts
@@ -88,6 +88,8 @@ export type CoreAPIRun = {
traces: Array<[[BlockType, string], Array<Array<TraceType>>]>;
};

export type CoreAPITokenType = [number, string];

type CoreAPICreateRunParams = {
projectId: string;
runAsWorkspaceId: string;
@@ -726,7 +728,7 @@
text: string;
modelId: string;
providerId: string;
}): Promise<CoreAPIResponse<{ tokens: number[] }>> {
}): Promise<CoreAPIResponse<{ tokens: CoreAPITokenType[] }>> {
const response = await fetch(`${CORE_API}/tokenize`, {
method: "POST",
headers: {
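
Aside (not part of the diff): a minimal sketch of how a caller might consume the new [tokenId, string] pairs returned by CoreAPI.tokenize. The helper name logTokenStats is hypothetical; the call shape and the CoreAPITokenType tuple are taken from the diff above.

import { CoreAPI, CoreAPITokenType } from "@app/lib/core_api";

// Hypothetical helper: counts tokens and checks that joining the string part
// of each [tokenId, string] tuple reconstructs the original text exactly.
async function logTokenStats(text: string): Promise<void> {
  const res = await CoreAPI.tokenize({
    text,
    modelId: "text-embedding-ada-002",
    providerId: "openai",
  });
  if (res.isErr()) {
    console.error("tokenize failed", res.error);
    return;
  }
  const tokens: CoreAPITokenType[] = res.value.tokens;
  const rebuilt = tokens.map((t) => t[1]).join("");
  console.log(`tokens=${tokens.length}, roundtrips=${rebuilt === text}`);
}
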
142 changes: 106 additions & 36 deletions front/lib/extract_event_app.ts
@@ -4,13 +4,12 @@ import {
} from "@app/lib/actions/registry";
import { runAction } from "@app/lib/actions/server";
import { Authenticator } from "@app/lib/auth";
import { CoreAPI } from "@app/lib/core_api";
import { CoreAPI, CoreAPITokenType } from "@app/lib/core_api";
import { findMarkersIndexes } from "@app/lib/extract_event_markers";
import { formatPropertiesForModel } from "@app/lib/extract_events_properties";
import logger from "@app/logger/logger";
import { EventSchemaType } from "@app/types/extract";

const EXTRACT_MAX_NUMBER_TOKENS_TO_PROCESS = 6000;

export type ExtractEventAppResponseResults = {
value: {
results: { value: string }[][];
@@ -65,53 +64,124 @@ export async function _runExtractEventApp({
}

/**
* Return the content to process by the Extract Event app.
* If the document is too big, we send only part of it to the Dust App.
* @param fullDocumentText
* @param marker
* Gets the maximum text content to process for the Dust app.
* We define a maximum number of tokens that the Dust app can process.
* It will return the text around the marker: first we expand the text before the marker, then after the marker.
*/
export async function _getMaxTextContentToProcess({
fullDocumentText,
fullText,
marker,
}: {
fullDocumentText: string;
fullText: string;
marker: string;
}): Promise<string> {
const tokensInDocumentText = await CoreAPI.tokenize({
text: fullDocumentText,
const tokenized = await getTokenizedText(fullText);
const tokens = tokenized.tokens;
const nbTokens = tokens.length;
const MAX_TOKENS = 6000;

// If the text is small enough, just return it
if (nbTokens < MAX_TOKENS) {
return fullText;
}

// Otherwise we extract the tokens around the marker
// and return the text corresponding to those tokens
const extractTokensResult = extractMaxTokens({
fullText,
tokens,
marker,
maxTokens: MAX_TOKENS,
});

return extractTokensResult.map((t) => t[1]).join("");
}

/**
* Extracts the maximum number of tokens around the marker.
*/
function extractMaxTokens({
fullText,
tokens,
marker,
maxTokens,
}: {
fullText: string;
tokens: CoreAPITokenType[];
marker: string;
maxTokens: number;
}): CoreAPITokenType[] {
const { start, end } = findMarkersIndexes({ fullText, marker, tokens });

if (start === -1 || end === -1) {
return [];
}

// The number of tokens that the marker takes up
const markerTokens = end - start + 1;

// The number of remaining tokens that can be included around the marker
const remainingTokens = maxTokens - markerTokens;

// Initialize the slicing start and end points around the marker
let startSlice = start;
let endSlice = end;

// Try to add tokens before the marker first
if (remainingTokens > 0) {
startSlice = Math.max(0, start - remainingTokens);

// Calculate any remaining tokens that can be used after the marker
const remainingAfter = remainingTokens - (start - startSlice);

// If there are any tokens left, add them after the marker
if (remainingAfter > 0) {
endSlice = Math.min(tokens.length - 1, end + remainingAfter);
}
}

return tokens.slice(startSlice, endSlice + 1);
}

/**
* Calls Core API to get the tokens and associated strings for a given text.
* Ex: "Un petit Soupinou des bois [[idea:2]]" will return:
* {
*   tokens: [
*     [ 1844, 'Un' ],
*     [ 46110, ' petit' ],
*     [ 9424, ' Sou' ],
*     [ 13576, 'pin' ],
*     [ 283, 'ou' ],
*     [ 951, ' des' ],
*     [ 66304, ' bois' ],
*     [ 4416, ' [[' ],
*     [ 42877, 'idea' ],
*     [ 25, ':' ],
*     [ 17, '2' ],
*     [ 5163, ']]' ]
*   ],
* }
*/

export async function getTokenizedText(
text: string
): Promise<{ tokens: CoreAPITokenType[] }> {
const tokenizeResponse = await CoreAPI.tokenize({
text: text,
modelId: "text-embedding-ada-002",
providerId: "openai",
});
if (tokensInDocumentText.isErr()) {
if (tokenizeResponse.isErr()) {
logger.error(
"Could not tokenize document text, falling back to processing the full document."
);
return fullDocumentText;
}

const numberOfTokens = tokensInDocumentText.value.tokens.length;
let documentTextToProcess: string;

if (numberOfTokens > EXTRACT_MAX_NUMBER_TOKENS_TO_PROCESS) {
// Document is too big, we need to send only part of it to the Dust App.
const fullDocLength = fullDocumentText.length;
const markerIndex = fullDocumentText.indexOf(marker);
const markerLength = marker.length;

// We can go half the max number of tokens on each side of the marker.
// We multiply by 4 because we assume 1 token is approximately 4 characters
const maxLength = (EXTRACT_MAX_NUMBER_TOKENS_TO_PROCESS / 2) * 4;

const start = Math.max(0, markerIndex - maxLength);
const end = Math.min(fullDocLength, markerIndex + markerLength + maxLength);
documentTextToProcess = fullDocumentText.substring(start, end);
} else {
// Document is small enough, we send the whole text.
documentTextToProcess = fullDocumentText;
return { tokens: [] };
}

return documentTextToProcess;
return tokenizeResponse.value;
}
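
Aside (not part of the diff): a worked example of the token-budget arithmetic in extractMaxTokens above, reduced to index math. The helper name sliceAroundMarker and the concrete numbers are assumptions chosen for illustration; the real code uses MAX_TOKENS = 6000 and the marker indexes returned by findMarkersIndexes.

// Same slicing logic as extractMaxTokens, operating on indexes only.
function sliceAroundMarker(
  totalTokens: number,
  start: number, // first token index of the marker
  end: number, // last token index of the marker
  maxTokens: number
): { startSlice: number; endSlice: number } {
  const markerTokens = end - start + 1;
  const remainingTokens = maxTokens - markerTokens;
  let startSlice = start;
  let endSlice = end;
  if (remainingTokens > 0) {
    // Expand before the marker first...
    startSlice = Math.max(0, start - remainingTokens);
    // ...then spend whatever budget is left after the marker.
    const remainingAfter = remainingTokens - (start - startSlice);
    if (remainingAfter > 0) {
      endSlice = Math.min(totalTokens - 1, end + remainingAfter);
    }
  }
  return { startSlice, endSlice };
}

// Marker deep inside the document: the whole remaining budget fits before it.
console.log(sliceAroundMarker(200, 50, 52, 10)); // { startSlice: 43, endSlice: 52 }
// Marker near the start: the leftover budget spills over to after the marker.
console.log(sliceAroundMarker(200, 2, 4, 10)); // { startSlice: 0, endSlice: 9 }
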