Skip to content

Commit

Permalink
Extract: Manage Big docs
Browse files Browse the repository at this point in the history
  • Loading branch information
PopDaph committed Aug 31, 2023
1 parent da65a64 commit c34bc8c
Show file tree
Hide file tree
Showing 6 changed files with 198 additions and 143 deletions.
4 changes: 2 additions & 2 deletions front/lib/actions/registry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -218,9 +218,9 @@ export const DustProdActionRegistry = createActionRegistry({
"extract-events": {
app: {
workspaceId: PRODUCTION_DUST_APPS_WORKSPACE_ID,
appId: "edf70ecf98",
appId: "d4f31b6a63",
appHash:
"6c32adac64c0876301614c17e7ef98091f7ab52afaacfc11549b3efe09a65ffc",
"73215c9d3fb6819c979d83bae86681313a41ea21760ffd9372d7f2a711387d18",
},
config: {
MODEL: {
Expand Down
117 changes: 117 additions & 0 deletions front/lib/extract_event_app.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import {
cloneBaseConfig,
DustProdActionRegistry,
} from "@app/lib/actions/registry";
import { runAction } from "@app/lib/actions/server";
import { Authenticator } from "@app/lib/auth";
import { CoreAPI } from "@app/lib/core_api";
import { formatPropertiesForModel } from "@app/lib/extract_events_properties";
import logger from "@app/logger/logger";
import { EventSchemaType } from "@app/types/extract";

// Token count above which a document is considered too big and only the text
// surrounding the marker is sent to the Extract Event app.
const EXTRACT_MAX_NUMBER_TOKENS_TO_PROCESS = 6000;

/**
 * Shape of a successful Extract event app run as read by this module:
 * a two-level list of results, each entry carrying a string `value`.
 */
export type ExtractEventAppResponseResults = {
  value: {
    results: Array<Array<{ value: string }>>;
  };
};

/**
 * Runs the "extract-events" action on a piece of content and returns the raw
 * value produced by the app.
 *
 * @param auth Authenticator forwarded to `runAction`.
 * @param content Text sent to the app as `content`.
 * @param marker Marker sent to the app as `marker`.
 * @param schema Event schema; its `properties` are formatted with
 *   `formatPropertiesForModel` and sent as `properties_to_extract`.
 * @returns The `value` of the first entry of the first result list, or an
 *   empty string when the action fails or produces no result.
 */
export async function _runExtractEventApp({
  auth,
  content,
  marker,
  schema,
}: {
  auth: Authenticator;
  content: string;
  marker: string;
  schema: EventSchemaType;
}): Promise<string> {
  const inputs = [
    {
      content: content,
      marker: marker,
      properties_to_extract: formatPropertiesForModel(schema.properties),
    },
  ];

  const ACTION_NAME = "extract-events";
  const config = cloneBaseConfig(DustProdActionRegistry[ACTION_NAME]?.config);
  const response = await runAction(auth, ACTION_NAME, config, inputs);

  if (response.isErr()) {
    logger.error(
      { error: response.error },
      `api_error: ${JSON.stringify(response.error)}`
    );
    return "";
  }

  const successResponse = response as ExtractEventAppResponseResults;
  // The cast above is unchecked, so walk the structure defensively: an empty
  // or missing result list must not throw a TypeError here.
  const successResponseValue =
    successResponse.value?.results?.[0]?.[0]?.value;

  if (successResponseValue == null) {
    // Same failure contract as the error branch above: log and return "".
    logger.error(
      { results: successResponse.value?.results },
      "[Extract Event] Extract event app returned no result."
    );
    return "";
  }

  logger.info(
    { value: successResponseValue },
    "[Extract Event] Extract event app ran successfully."
  );

  return successResponseValue;
}

/**
 * Returns the content to be processed by the Extract Event app.
 *
 * The document is tokenized through `CoreAPI.tokenize`. When it holds more
 * than `EXTRACT_MAX_NUMBER_TOKENS_TO_PROCESS` tokens, only a window of text
 * around the first occurrence of `marker` is returned; otherwise the whole
 * document is returned. If tokenization fails, the error is logged and the
 * whole document is returned.
 *
 * @param fullDocumentText Full text of the document.
 * @param marker Marker around which the window is centered.
 * @returns The full text, or a substring of it surrounding the marker.
 */
export async function _getMaxTextContentToProcess({
  fullDocumentText,
  marker,
}: {
  fullDocumentText: string;
  marker: string;
}): Promise<string> {
  const tokensInDocumentText = await CoreAPI.tokenize({
    text: fullDocumentText,
    modelId: "text-embedding-ada-002",
    providerId: "openai",
  });
  if (tokensInDocumentText.isErr()) {
    // Attach the tokenizer error to the log entry so the failure is diagnosable.
    logger.error(
      { error: tokensInDocumentText.error },
      "Could not get number of tokens for document, trying with full doc."
    );
    return fullDocumentText;
  }

  const numberOfTokens = tokensInDocumentText.value.tokens.length;
  if (numberOfTokens <= EXTRACT_MAX_NUMBER_TOKENS_TO_PROCESS) {
    // Document is small enough, we send the whole text.
    return fullDocumentText;
  }

  // Document is too big, we need to send only part of it to the Dust App.
  // We can go half the max number of tokens on each side of the marker.
  // We multiply by 4 because we assume 1 token is approximately 4 characters.
  const maxLength = (EXTRACT_MAX_NUMBER_TOKENS_TO_PROCESS / 2) * 4;

  // `indexOf` returns -1 when the marker is absent; anchor the window at the
  // start of the document in that case instead of letting -1 skew the bounds.
  const markerIndex = Math.max(0, fullDocumentText.indexOf(marker));

  const start = Math.max(0, markerIndex - maxLength);
  const end = Math.min(
    fullDocumentText.length,
    markerIndex + marker.length + maxLength
  );
  return fullDocumentText.substring(start, end);
}
26 changes: 11 additions & 15 deletions front/lib/extract_event_markers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ import { Op } from "sequelize";
import { ExtractedEvent } from "@app/lib/models";

const EXTRACT_EVENT_PATTERN = /\[\[(.*?)\]\]/; // Ex: [[event]]
type ExtractedMarkersType = { [key: string]: string[] };

/**
* Check if a text contains an extract event marker
Expand Down Expand Up @@ -32,22 +31,19 @@ export function getRawExtractEventMarkersFromText(text: string): string[] {
/**
* We can use [[idea]] or [[idea:2]] in a document to mark 2 events of the same type.
* This function will return a dict of markers with the same name.
* @param rawMarkers string[]
* @returns ExtractedMarkersType
* @example ["idea", "idea:2", "idea:3", "goals"] returns { "idea": ["idea", "idea:2", "idea:3"], "goals": ["goals"] }
* @param markersWithSuffix string[]
* @returns uniqueMarkersWithoutSuffix string[]
* @example ["idea", "idea:2", "idea:3", "goals"] returns ["idea", "goals"]
*/
export function sanitizeRawExtractEventMarkers(
rawMarkers: string[]
): ExtractedMarkersType {
const markers: { [key: string]: string[] } = {};
rawMarkers.map((m) => {
const [key] = m.split(":");
if (!markers[key]) {
markers[key] = [];
}
markers[key].push(m);
export function getUniqueMarkersWithoutSuffix(
markersWithSuffix: string[]
): string[] {
const uniqueMarkers = new Set<string>();
markersWithSuffix.forEach((marker) => {
const [markerWithoutSuffix] = marker.split(":");
uniqueMarkers.add(markerWithoutSuffix);
});
return markers;
return Array.from(uniqueMarkers);
}

/**
Expand Down
Loading

0 comments on commit c34bc8c

Please sign in to comment.