diff --git a/front/lib/actions/registry.ts b/front/lib/actions/registry.ts index 75fec962c2b1..117f20315db9 100644 --- a/front/lib/actions/registry.ts +++ b/front/lib/actions/registry.ts @@ -218,9 +218,9 @@ export const DustProdActionRegistry = createActionRegistry({ "extract-events": { app: { workspaceId: PRODUCTION_DUST_APPS_WORKSPACE_ID, - appId: "edf70ecf98", + appId: "d4f31b6a63", appHash: - "6c32adac64c0876301614c17e7ef98091f7ab52afaacfc11549b3efe09a65ffc", + "73215c9d3fb6819c979d83bae86681313a41ea21760ffd9372d7f2a711387d18", }, config: { MODEL: { diff --git a/front/lib/extract_event_app.ts b/front/lib/extract_event_app.ts new file mode 100644 index 000000000000..b7ae45733c50 --- /dev/null +++ b/front/lib/extract_event_app.ts @@ -0,0 +1,117 @@ +import { + cloneBaseConfig, + DustProdActionRegistry, +} from "@app/lib/actions/registry"; +import { runAction } from "@app/lib/actions/server"; +import { Authenticator } from "@app/lib/auth"; +import { CoreAPI } from "@app/lib/core_api"; +import { formatPropertiesForModel } from "@app/lib/extract_events_properties"; +import logger from "@app/logger/logger"; +import { EventSchemaType } from "@app/types/extract"; + +const EXTRACT_MAX_NUMBER_TOKENS_TO_PROCESS = 6000; + +export type ExtractEventAppResponseResults = { + value: { + results: { value: string }[][]; + }; +}; + +/** + * Runs the Extract event app and returns just only the results in which extracted events are found + * @param auth + * @param inputs + */ +export async function _runExtractEventApp({ + auth, + content, + marker, + schema, +}: { + auth: Authenticator; + content: string; + marker: string; + schema: EventSchemaType; +}): Promise { + const inputs = [ + { + content: content, + marker: marker, + properties_to_extract: formatPropertiesForModel(schema.properties), + }, + ]; + + const ACTION_NAME = "extract-events"; + const config = cloneBaseConfig(DustProdActionRegistry[ACTION_NAME]?.config); + const response = await runAction(auth, ACTION_NAME, config, inputs); + + if (response.isErr()) { + logger.error( + { error: response.error }, + `api_error: ${JSON.stringify(response.error)}` + ); + return ""; + } + + const successResponse = response as ExtractEventAppResponseResults; + const successResponseValue = successResponse.value.results[0][0].value; + + logger.info( + { value: successResponseValue }, + "[Extract Event] Extract event app ran successfully." + ); + + return successResponseValue; +} + +/** + * Return the content to process by the Extract Event app. + * If the document is too big, we send only part of it to the Dust App. + * @param fullDocumentText + * @param marker + */ +export async function _getMaxTextContentToProcess({ + fullDocumentText, + marker, +}: { + fullDocumentText: string; + marker: string; +}): Promise { + const tokensInDocumentText = await CoreAPI.tokenize({ + text: fullDocumentText, + modelId: "text-embedding-ada-002", + providerId: "openai", + }); + if (tokensInDocumentText.isErr()) { + { + tokensInDocumentText.error; + } + logger.error( + "Could not get number of tokens for document, trying with full doc." + ); + return fullDocumentText; + } + + const numberOfTokens = tokensInDocumentText.value.tokens.length; + let documentTextToProcess: string; + + if (numberOfTokens > EXTRACT_MAX_NUMBER_TOKENS_TO_PROCESS) { + // Document is too big, we need to send only part of it to the Dust App. + const fullDocLength = fullDocumentText.length; + const markerIndex = fullDocumentText.indexOf(marker); + const markerLength = marker.length; + + // We can go half the max number of tokens on each side of the marker. + // We multiply by 4 because we assume 1 token is approximately 4 characters + const maxLength = (EXTRACT_MAX_NUMBER_TOKENS_TO_PROCESS / 2) * 4; + + const start = Math.max(0, markerIndex - maxLength); + const end = Math.min(fullDocLength, markerIndex + markerLength + maxLength); + documentTextToProcess = fullDocumentText.substring(start, end); + } else { + // Document is small enough, we send the whole text. + documentTextToProcess = fullDocumentText; + } + + return documentTextToProcess; +} diff --git a/front/lib/extract_event_markers.ts b/front/lib/extract_event_markers.ts index 417201575223..a626622e3f4a 100644 --- a/front/lib/extract_event_markers.ts +++ b/front/lib/extract_event_markers.ts @@ -3,7 +3,6 @@ import { Op } from "sequelize"; import { ExtractedEvent } from "@app/lib/models"; const EXTRACT_EVENT_PATTERN = /\[\[(.*?)\]\]/; // Ex: [[event]] -type ExtractedMarkersType = { [key: string]: string[] }; /** * Check if a text contains an extract event marker @@ -32,22 +31,19 @@ export function getRawExtractEventMarkersFromText(text: string): string[] { /** * We can use [[idea]] or [[idea:2]] in a document to mark 2 events of the same type. * This function will return a dict of markers with the same name. - * @param rawMarkers string[] - * @returns ExtractedMarkersType - * @example ["idea", "idea:2", "idea:3", "goals"] returns { "idea": ["idea", "idea:2", "idea:3"], "goals": ["goals"] } + * @param markersWithSuffix string[] + * @returns uniqueMarkersWithoutSuffix string[] + * @example ["idea", "idea:2", "idea:3", "goals"] returns ["idea", "goals"] */ -export function sanitizeRawExtractEventMarkers( - rawMarkers: string[] -): ExtractedMarkersType { - const markers: { [key: string]: string[] } = {}; - rawMarkers.map((m) => { - const [key] = m.split(":"); - if (!markers[key]) { - markers[key] = []; - } - markers[key].push(m); +export function getUniqueMarkersWithoutSuffix( + markersWithSuffix: string[] +): string[] { + const uniqueMarkers = new Set(); + markersWithSuffix.forEach((marker) => { + const [markerWithoutSuffix] = marker.split(":"); + uniqueMarkers.add(markerWithoutSuffix); }); - return markers; + return Array.from(uniqueMarkers); } /** diff --git a/front/lib/extract_events.ts b/front/lib/extract_events.ts index 1ebb22ff64af..781ca590f6bf 100644 --- a/front/lib/extract_events.ts +++ b/front/lib/extract_events.ts @@ -5,19 +5,17 @@ import { DocumentsPostProcessHookOnUpsertParams, } from "@app/documents_post_process_hooks/hooks"; import { getDatasource } from "@app/documents_post_process_hooks/hooks/lib/data_source_helpers"; -import { - cloneBaseConfig, - DustProdActionRegistry, -} from "@app/lib/actions/registry"; -import { runAction } from "@app/lib/actions/server"; import { Authenticator } from "@app/lib/auth"; +import { + _getMaxTextContentToProcess, + _runExtractEventApp, +} from "@app/lib/extract_event_app"; import { getExtractEventMarkersToProcess, getRawExtractEventMarkersFromText, + getUniqueMarkersWithoutSuffix, hasExtractEventMarker, - sanitizeRawExtractEventMarkers, } from "@app/lib/extract_event_markers"; -import { formatPropertiesForModel } from "@app/lib/extract_events_properties"; import { EventSchema, ExtractedEvent } from "@app/lib/models"; import { generateModelSId } from "@app/lib/utils"; import logger from "@app/logger/logger"; @@ -47,13 +45,13 @@ export async function shouldProcessExtractEvents( return false; } - const rawMarkers = getRawExtractEventMarkersFromText(documentText); - const markers = sanitizeRawExtractEventMarkers(rawMarkers); + const markersWithSuffix = getRawExtractEventMarkersFromText(documentText); + const markersWithoutSuffix = getUniqueMarkersWithoutSuffix(markersWithSuffix); const activeSchema: EventSchema | null = await EventSchema.findOne({ where: { marker: { - [Op.in]: Object.keys(markers), + [Op.in]: markersWithoutSuffix, }, status: "active", }, @@ -91,20 +89,18 @@ export async function processExtractEvents({ } // Getting the markers from the doc and keeping only those not already in the DB - const rawMarkers = await getExtractEventMarkersToProcess({ + const markersToProcess = await getExtractEventMarkersToProcess({ documentId, dataSourceName: dataSourceName, documentText, }); - const markers = sanitizeRawExtractEventMarkers(rawMarkers); await Promise.all( - Object.keys(markers).map((marker) => { - return _processExtractEvent({ + markersToProcess.map((marker) => { + return _processExtractEventsForMarker({ auth: auth, dataSourceName: dataSourceName, - sanitizedMarker: marker, - markers: markers[marker], + marker: marker, documentText: documentText, documentId: documentId, documentSourceUrl: documentSourceUrl || null, @@ -114,31 +110,31 @@ export async function processExtractEvents({ } /** - * Gets the schema for the marker, runs the Dust app to extract properties, and saves the event(s) + * 1/ Gets the schema for the marker, + * 2/ Checks that the document is not too big for the Dust app, + * 3/ Runs the Dust app to extract schema properties from the document, + * 4/ Saves the event(s) in the DB. */ -async function _processExtractEvent(data: { +async function _processExtractEventsForMarker({ + auth, + dataSourceName, + marker, + documentId, + documentSourceUrl, + documentText, +}: { auth: Authenticator; dataSourceName: string; - sanitizedMarker: string; - markers: string[]; + marker: string; documentText: string; documentId: string; documentSourceUrl: string | null; }) { - const { - auth, - dataSourceName, - sanitizedMarker, - markers, - documentId, - documentSourceUrl, - documentText, - } = data; - + // 1/ Get the schema for the marker const schema: EventSchema | null = await EventSchema.findOne({ where: { workspaceId: auth.workspace()?.id, - marker: sanitizedMarker, + marker: marker.split(":")[0], status: "active", }, }); @@ -147,82 +143,57 @@ async function _processExtractEvent(data: { return; } - const inputsForApp = [ - { - document_text: documentText, - markers: markers, - schema_properties: formatPropertiesForModel(schema.properties), - }, - ]; - - const results = await _runExtractEventApp(auth, inputsForApp); - results.map(async (result: string) => { - const properties = JSON.parse(result); - if (!properties.marker) { - logger.error( - { properties, marker: schema.marker, documentSourceUrl, documentId }, - "Extract event app did not return a marker. Skipping." - ); - return; - } + // 2/ Check that the document is not to big for the Dust App. + const contentToProcess = await _getMaxTextContentToProcess({ + fullDocumentText: documentText, + marker: marker, + }); - const event = await ExtractedEvent.create({ - sId: generateModelSId(), - documentId: documentId, - properties: properties, - status: "pending", - eventSchemaId: schema.id, - dataSourceName: dataSourceName, - documentSourceUrl: documentSourceUrl || null, - marker: properties.marker, - }); + // 3/ Run the Dust app to extract properties + const result = await _runExtractEventApp({ + auth, + content: contentToProcess, + marker: marker, + schema: schema, + }); - // Temp: we log on slack events that are extracted from the Dust workspace - if (schema.debug === true) { - await _logDebugEventOnSlack({ event, schema, documentSourceUrl }); - } + if (result.length === 0) { logger.info( - { properties, marker: schema.marker, documentSourceUrl, documentId }, - "[Extract Event] Event saved and logged." + { marker: schema.marker, documentSourceUrl, documentId }, + "[Extract Event] No event extracted." ); - }); -} - -type ExtractEventAppResponseResults = { - value: { - results: { value: string[] }[][]; - }; -}; - -/** - * Runs the Extract event app and returns just only the results in which extracted events are found - * @param auth - * @param inputs - */ -async function _runExtractEventApp( - auth: Authenticator, - inputs: { document_text: string; markers: string[]; schema_properties: any }[] -): Promise { - const ACTION_NAME = "extract-events"; - const config = cloneBaseConfig(DustProdActionRegistry[ACTION_NAME]?.config); - const response = await runAction(auth, ACTION_NAME, config, inputs); + return; + } - if (response.isErr()) { + // 4/ Save the event(s) in the DB + const properties = JSON.parse(result); + if (!properties.marker) { logger.error( - { error: response.error }, - `api_error: ${JSON.stringify(response.error)}` + { properties, marker: schema.marker, documentSourceUrl, documentId }, + "Extract event app did not return a marker. Skipping." ); - return []; + return; } - const successResponse = response as ExtractEventAppResponseResults; + const event = await ExtractedEvent.create({ + sId: generateModelSId(), + documentId: documentId, + properties: properties, + status: "pending", + eventSchemaId: schema.id, + dataSourceName: dataSourceName, + documentSourceUrl: documentSourceUrl || null, + marker: properties.marker, + }); + // 5/ Temp: we log on slack events that are extracted from the Dust workspace + if (schema.debug === true) { + await _logDebugEventOnSlack({ event, schema, documentSourceUrl }); + } logger.info( - { response: successResponse.value }, - "[Extract Event] Extract event app ran successfully." + { properties, marker: schema.marker, documentSourceUrl, documentId }, + "[Extract Event] Event saved and logged." ); - - return successResponse.value.results[0][0].value; } /** diff --git a/front/pages/w/[wId]/u/extract/index.tsx b/front/pages/w/[wId]/u/extract/index.tsx index 7d3857ba11f5..25573c5b1608 100644 --- a/front/pages/w/[wId]/u/extract/index.tsx +++ b/front/pages/w/[wId]/u/extract/index.tsx @@ -82,7 +82,7 @@ export default function AppExtractEvents({ [[{schema.marker}]] @@ -92,7 +92,7 @@ export default function AppExtractEvents({ {" "} {/* Set the second column to take max width */} {schema.description} diff --git a/front/tests/lib/markers.test.ts b/front/tests/lib/markers.test.ts index 557cfd053b37..09adf93e1b34 100644 --- a/front/tests/lib/markers.test.ts +++ b/front/tests/lib/markers.test.ts @@ -1,7 +1,6 @@ import { getRawExtractEventMarkersFromText, hasExtractEventMarker, - sanitizeRawExtractEventMarkers, } from "@app/lib/extract_event_markers"; describe("Test hasExtractEventMarker", function () { @@ -75,31 +74,3 @@ describe("Test getExtractEventMarker", function () { }); }); }); - -describe("Test getExtractEventMarker", function () { - const cases = [ - { - markers: ["idea", "goal", "to do"], - expected: { - idea: ["idea"], - goal: ["goal"], - "to do": ["to do"], - }, - }, - { - markers: ["idea", "goal", "to do", "idea:2"], - expected: { - idea: ["idea", "idea:2"], - goal: ["goal"], - "to do": ["to do"], - }, - }, - { - markers: [], - expected: {}, - }, - ]; - cases.forEach((c) => { - expect(sanitizeRawExtractEventMarkers(c.markers)).toEqual(c.expected); - }); -});