diff --git a/front/hooks/useFileUploaderService.ts b/front/hooks/useFileUploaderService.ts
index 3f8063402858..169bc1ca642d 100644
--- a/front/hooks/useFileUploaderService.ts
+++ b/front/hooks/useFileUploaderService.ts
@@ -140,7 +140,7 @@ export function useFileUploaderService({
             new FileBlobUploadError(
               "file_type_not_supported",
               file,
-              `File "${file.name}" is not supported.`
+              `File "${file.name}" is not supported (${contentType}).`
             )
           )
         );
diff --git a/front/lib/api/assistant/jit_utils.ts b/front/lib/api/assistant/jit_utils.ts
index 8a5b5eec1bbd..faacf33cc1ef 100644
--- a/front/lib/api/assistant/jit_utils.ts
+++ b/front/lib/api/assistant/jit_utils.ts
@@ -4,7 +4,6 @@ import type {
   SupportedContentFragmentType,
 } from "@dust-tt/types";
 import {
-  assertNever,
   isAgentMessageType,
   isContentFragmentType,
   isSupportedDelimitedTextContentType,
@@ -14,52 +13,21 @@ import {
 function isConversationIncludableFileContentType(
   contentType: SupportedContentFragmentType
 ): boolean {
+  // We allow including everything except images.
   if (isSupportedImageContentType(contentType)) {
     return false;
   }
-  // We allow including everything except images.
-  switch (contentType) {
-    case "application/msword":
-    case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
-    case "application/pdf":
-    case "text/markdown":
-    case "text/plain":
-    case "dust-application/slack":
-    case "text/vnd.dust.attachment.slack.thread":
-    case "text/comma-separated-values":
-    case "text/csv":
-    case "text/tab-separated-values":
-    case "text/tsv":
-      return true;
-
-    default:
-      assertNever(contentType);
-  }
+  return true;
 }
 
 function isQueryableContentType(
   contentType: SupportedContentFragmentType
 ): boolean {
-  if (isSupportedImageContentType(contentType)) {
-    return false;
-  }
+  // For now we only allow querying tabular files.
   if (isSupportedDelimitedTextContentType(contentType)) {
     return true;
   }
-  // For now we only allow querying tabular files.
-  switch (contentType) {
-    case "application/msword":
-    case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
-    case "application/pdf":
-    case "text/markdown":
-    case "text/plain":
-    case "dust-application/slack":
-    case "text/vnd.dust.attachment.slack.thread":
-      return false;
-
-    default:
-      assertNever(contentType);
-  }
+  return false;
 }
 
 function isSearchableContentType(
@@ -71,20 +39,8 @@ function isSearchableContentType(
   if (isSupportedDelimitedTextContentType(contentType)) {
     return false;
   }
-  // For now we only allow searching text files.
-  switch (contentType) {
-    case "application/msword":
-    case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
-    case "application/pdf":
-    case "text/markdown":
-    case "text/plain":
-    case "dust-application/slack":
-    case "text/vnd.dust.attachment.slack.thread":
-      return true;
-
-    default:
-      assertNever(contentType);
-  }
+  // For now we allow searching everything else.
+  return true;
 }
 
 function isListableContentType(
diff --git a/front/lib/api/files/upload.ts b/front/lib/api/files/upload.ts
index c97a7c71af33..06eeb3b0548f 100644
--- a/front/lib/api/files/upload.ts
+++ b/front/lib/api/files/upload.ts
@@ -6,6 +6,8 @@ import type {
 import {
   assertNever,
   Err,
+  isSupportedDelimitedTextContentType,
+  isSupportedImageContentType,
   isTextExtractionSupportedContentType,
   Ok,
   TextExtraction,
@@ -287,16 +289,26 @@ const getProcessingFunction = ({
   contentType: SupportedFileContentType;
   useCase: FileUseCase;
 }): ProcessingFunction | undefined => {
-  switch (contentType) {
-    case "image/jpeg":
-    case "image/png":
-      if (useCase === "conversation") {
-        return resizeAndUploadToFileStorage;
-      } else if (useCase === "avatar") {
-        return uploadToPublicBucket;
-      }
-      break;
+  if (isSupportedImageContentType(contentType)) {
+    if (useCase === "conversation") {
+      return resizeAndUploadToFileStorage;
+    } else if (useCase === "avatar") {
+      return uploadToPublicBucket;
+    }
+    return undefined;
+  }
 
+  if (isSupportedDelimitedTextContentType(contentType)) {
+    if (useCase === "conversation" || useCase === "folder_table") {
+      // TODO(JIT): after JIT enablement, store raw text here too, the snippet is useless
+      return extractContentAndSchemaFromDelimitedTextFiles;
+    } else if (useCase === "folder_document" || useCase === "tool_output") {
+      return storeRawText;
+    }
+    return undefined;
+  }
+
+  switch (contentType) {
     case "application/msword":
     case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
     case "application/pdf":
@@ -306,6 +318,14 @@ const getProcessingFunction = ({
       break;
     case "text/plain":
     case "text/markdown":
+    case "text/html":
+    case "text/xml":
+    case "text/calendar":
+    case "text/css":
+    case "text/javascript":
+    case "application/json":
+    case "application/xml":
+    case "application/x-sh":
       if (
         useCase === "conversation" ||
         useCase === "folder_document" ||
@@ -319,17 +339,6 @@ const getProcessingFunction = ({
         return storeRawText;
       }
       break;
-    case "text/comma-separated-values":
-    case "text/csv":
-    case "text/tab-separated-values":
-    case "text/tsv":
-      if (useCase === "conversation" || useCase === "folder_table") {
-        // TODO(JIT): after JIT enablement, store raw text here too, the snippet is useless
-        return extractContentAndSchemaFromDelimitedTextFiles;
-      } else if (useCase === "folder_document" || useCase === "tool_output") {
-        return storeRawText;
-      }
-      break;
 
     default:
       assertNever(contentType);
diff --git a/front/lib/api/files/upsert.ts b/front/lib/api/files/upsert.ts
index 8019c4d64fea..591858b13225 100644
--- a/front/lib/api/files/upsert.ts
+++ b/front/lib/api/files/upsert.ts
@@ -9,6 +9,7 @@ import {
   Err,
   getSmallWhitelistedModel,
   isSupportedDelimitedTextContentType,
+  isSupportedImageContentType,
   Ok,
   removeNulls,
   slugify,
@@ -66,37 +67,45 @@ async function generateSnippet(
   const startTime = Date.now();
   const owner = auth.getNonNullableWorkspace();
 
-  switch (file.contentType) {
-    case "image/jpeg":
-    case "image/png":
-      return new Err(
-        new Error("Image files are not supported for file snippets.")
-      );
-    case "text/csv":
-    case "text/comma-separated-values":
-    case "text/tsv":
-    case "text/tab-separated-values":
-      const format =
-        file.contentType === "text/csv" ||
-        file.contentType === "text/comma-separated-values"
-          ? "csv"
-          : "tsv";
-
-      // Parse only the headers from the CSV file
-      const headers = content.split("\n")[0];
-
-      let snippet = `${format.toUpperCase()} file with headers: ${headers}`;
-      if (snippet.length > 256) {
-        snippet = snippet.slice(0, 242) + "... (truncated)";
-      }
+  if (isSupportedImageContentType(file.contentType)) {
+    return new Err(
+      new Error("Image files are not supported for file snippets.")
+    );
+  }
 
-      return new Ok(snippet);
-    case "text/markdown":
-    case "text/plain":
-    case "text/vnd.dust.attachment.slack.thread":
+  if (isSupportedDelimitedTextContentType(file.contentType)) {
+    const format =
+      file.contentType === "text/csv" ||
+      file.contentType === "text/comma-separated-values"
+        ? "csv"
+        : "tsv";
+
+    // Parse only the headers from the CSV file
+    const headers = content.split("\n")[0];
+
+    let snippet = `${format.toUpperCase()} file with headers: ${headers}`;
+    if (snippet.length > 256) {
+      snippet = snippet.slice(0, 242) + "... (truncated)";
+    }
+
+    return new Ok(snippet);
+  }
+
+  switch (file.contentType) {
     case "application/msword":
     case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
     case "application/pdf":
+    case "text/plain":
+    case "text/markdown":
+    case "text/html":
+    case "text/xml":
+    case "text/calendar":
+    case "text/css":
+    case "text/javascript":
+    case "application/json":
+    case "application/xml":
+    case "application/x-sh":
+    case "text/vnd.dust.attachment.slack.thread":
       if (!ENABLE_LLM_SNIPPETS) {
         // Take the first 256 characters
         if (content.length > 256) {
@@ -214,6 +223,7 @@ async function generateSnippet(
         assertNever(run);
       }
       break;
+
     default:
       assertNever(file.contentType);
   }
@@ -311,6 +321,10 @@ const getProcessingFunction = ({
   contentType: SupportedFileContentType;
   useCase: FileUseCase;
 }): ProcessingFunction | undefined => {
+  if (isSupportedImageContentType(contentType)) {
+    return undefined;
+  }
+
   // Use isSupportedDelimitedTextContentType() everywhere to have a common source of truth
   if (isSupportedDelimitedTextContentType(contentType)) {
     if (
@@ -333,16 +347,19 @@ const getProcessingFunction = ({
     case "text/markdown":
     case "text/plain":
     case "text/vnd.dust.attachment.slack.thread":
+    case "text/html":
+    case "text/xml":
+    case "text/calendar":
+    case "text/css":
+    case "text/javascript":
+    case "application/json":
+    case "application/xml":
+    case "application/x-sh":
       if (useCase === "conversation" || useCase === "tool_output") {
         return upsertDocumentToDatasource;
       }
       break;
 
-    case "image/jpeg":
-    case "image/png":
-      // We do nothing for images.
-      break;
-
     default:
       assertNever(contentType);
   }
diff --git a/sdks/js/src/types.ts b/sdks/js/src/types.ts
index 308ea0ff11ac..f5954f76e064 100644
--- a/sdks/js/src/types.ts
+++ b/sdks/js/src/types.ts
@@ -77,6 +77,14 @@ export const supportedRawText = {
   "text/tab-separated-values": [".tsv"],
   "text/tsv": [".tsv"],
   "text/vnd.dust.attachment.slack.thread": [".txt"],
+  "text/html": [".html", ".htm", ".xhtml"],
+  "text/xml": [".xml"],
+  "text/calendar": [".ics"],
+  "text/css": [".css"],
+  "text/javascript": [".js", ".mjs"],
+  "application/json": [".json"],
+  "application/xml": [".xml"],
+  "application/x-sh": [".sh"],
 } as const;
 
 // Supported content types for plain text (after processing).
diff --git a/types/src/front/files.ts b/types/src/front/files.ts
index 4335df1a66b9..db1cf65e9e5c 100644
--- a/types/src/front/files.ts
+++ b/types/src/front/files.ts
@@ -80,6 +80,14 @@ const supportedRawText = {
   "text/markdown": [".md", ".markdown"],
   "text/plain": [".txt"],
   "text/vnd.dust.attachment.slack.thread": [".txt"],
+  "text/html": [".html", ".htm", ".xhtml"],
+  "text/xml": [".xml"],
+  "text/calendar": [".ics"],
+  "text/css": [".css"],
+  "text/javascript": [".js", ".mjs"],
+  "application/json": [".json"],
+  "application/xml": [".xml"],
+  "application/x-sh": [".sh"],
 };
 
 // Supported content types for plain text (after processing).