Extract: Manage Big docs

dust-tt · Aug 31, 2023 · c34bc8c · c34bc8c
1 parent da65a64
commit c34bc8c
Show file tree

Hide file tree

Showing 6 changed files with 198 additions and 143 deletions.
diff --git a/front/lib/actions/registry.ts b/front/lib/actions/registry.ts
@@ -218,9 +218,9 @@ export const DustProdActionRegistry = createActionRegistry({
   "extract-events": {
     app: {
       workspaceId: PRODUCTION_DUST_APPS_WORKSPACE_ID,
-      appId: "edf70ecf98",
+      appId: "d4f31b6a63",
       appHash:
-        "6c32adac64c0876301614c17e7ef98091f7ab52afaacfc11549b3efe09a65ffc",
+        "73215c9d3fb6819c979d83bae86681313a41ea21760ffd9372d7f2a711387d18",
     },
     config: {
       MODEL: {

diff --git a/front/lib/extract_event_app.ts b/front/lib/extract_event_app.ts
@@ -0,0 +1,117 @@
+import {
+  cloneBaseConfig,
+  DustProdActionRegistry,
+} from "@app/lib/actions/registry";
+import { runAction } from "@app/lib/actions/server";
+import { Authenticator } from "@app/lib/auth";
+import { CoreAPI } from "@app/lib/core_api";
+import { formatPropertiesForModel } from "@app/lib/extract_events_properties";
+import logger from "@app/logger/logger";
+import { EventSchemaType } from "@app/types/extract";
+
+const EXTRACT_MAX_NUMBER_TOKENS_TO_PROCESS = 6000;
+
+export type ExtractEventAppResponseResults = {
+  value: {
+    results: { value: string }[][];
+  };
+};
+
+/**
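+ * Runs the Extract event app and returns the value of its first result, or an empty string if the run fails.
+ * @param auth authenticator passed to runAction
+ * @param content, marker, schema app inputs: the text, the marker, and the schema whose properties are formatted for the model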
+ */
+export async function _runExtractEventApp({
+  auth,
+  content,
+  marker,
+  schema,
+}: {
+  auth: Authenticator;
+  content: string;
+  marker: string;
+  schema: EventSchemaType;
+}): Promise<string> {
+  const inputs = [
+    {
+      content: content,
+      marker: marker,
+      properties_to_extract: formatPropertiesForModel(schema.properties),
+    },
+  ];
+
+  const ACTION_NAME = "extract-events";
+  const config = cloneBaseConfig(DustProdActionRegistry[ACTION_NAME]?.config);
+  const response = await runAction(auth, ACTION_NAME, config, inputs);
+
+  if (response.isErr()) {
+    logger.error(
+      { error: response.error },
+      `api_error: ${JSON.stringify(response.error)}`
+    );
+    return "";
+  }
+
+  const successResponse = response as ExtractEventAppResponseResults;
+  const successResponseValue = successResponse.value.results[0][0].value;
+
+  logger.info(
+    { value: successResponseValue },
+    "[Extract Event] Extract event app ran successfully."
+  );
+
+  return successResponseValue;
+}
+
+/**
+ * Return the content to process by the Extract Event app.
+ * If the document is too big, we send only part of it to the Dust App.
+ * @param fullDocumentText
+ * @param marker
+ */
+export async function _getMaxTextContentToProcess({
+  fullDocumentText,
+  marker,
+}: {
+  fullDocumentText: string;
+  marker: string;
+}): Promise<string> {
+  const tokensInDocumentText = await CoreAPI.tokenize({
+    text: fullDocumentText,
+    modelId: "text-embedding-ada-002",
+    providerId: "openai",
+  });
+  if (tokensInDocumentText.isErr()) {
+    {
+      tokensInDocumentText.error;
+    }
+    logger.error(
+      "Could not get number of tokens for document, trying with full doc."
+    );
+    return fullDocumentText;
+  }
+
+  const numberOfTokens = tokensInDocumentText.value.tokens.length;
+  let documentTextToProcess: string;
+
+  if (numberOfTokens > EXTRACT_MAX_NUMBER_TOKENS_TO_PROCESS) {
+    // Document is too big, we need to send only part of it to the Dust App.
+    const fullDocLength = fullDocumentText.length;
+    const markerIndex = fullDocumentText.indexOf(marker);
+    const markerLength = marker.length;
+
+    // We can go half the max number of tokens on each side of the marker.
+    // We multiply by 4 because we assume 1 token is approximately 4 characters
+    const maxLength = (EXTRACT_MAX_NUMBER_TOKENS_TO_PROCESS / 2) * 4;
+
+    const start = Math.max(0, markerIndex - maxLength);
+    const end = Math.min(fullDocLength, markerIndex + markerLength + maxLength);
+    documentTextToProcess = fullDocumentText.substring(start, end);
+  } else {
+    // Document is small enough, we send the whole text.
+    documentTextToProcess = fullDocumentText;
+  }
+
+  return documentTextToProcess;
+}
diff --git a/front/lib/extract_event_markers.ts b/front/lib/extract_event_markers.ts
@@ -3,7 +3,6 @@ import { Op } from "sequelize";
 import { ExtractedEvent } from "@app/lib/models";
 
 const EXTRACT_EVENT_PATTERN = /\[\[(.*?)\]\]/; // Ex: [[event]]
-type ExtractedMarkersType = { [key: string]: string[] };
 
 /**
  * Check if a text contains an extract event marker
@@ -32,22 +31,19 @@ export function getRawExtractEventMarkersFromText(text: string): string[] {
 /**
  * We can use [[idea]] or [[idea:2]] in a document to mark 2 events of the same type.
  * This function returns the unique marker names with any ":suffix" removed.
- * @param rawMarkers string[]
- * @returns ExtractedMarkersType
- * @example ["idea", "idea:2", "idea:3", "goals"] returns { "idea": ["idea", "idea:2", "idea:3"], "goals": ["goals"] }
+ * @param markersWithSuffix string[]
+ * @returns uniqueMarkersWithoutSuffix string[]
+ * @example ["idea", "idea:2", "idea:3", "goals"] returns ["idea", "goals"]
  */
-export function sanitizeRawExtractEventMarkers(
-  rawMarkers: string[]
-): ExtractedMarkersType {
-  const markers: { [key: string]: string[] } = {};
-  rawMarkers.map((m) => {
-    const [key] = m.split(":");
-    if (!markers[key]) {
-      markers[key] = [];
-    }
-    markers[key].push(m);
+export function getUniqueMarkersWithoutSuffix(
+  markersWithSuffix: string[]
+): string[] {
+  const uniqueMarkers = new Set<string>();
+  markersWithSuffix.forEach((marker) => {
+    const [markerWithoutSuffix] = marker.split(":");
+    uniqueMarkers.add(markerWithoutSuffix);
   });
-  return markers;
+  return Array.from(uniqueMarkers);
 }
 
 /**