Skip to content

Commit

Permalink
Add: more textual format support
Browse files: browse the repository at this point in the history
  • Loading branch information
Fraggle committed Dec 9, 2024
1 parent f3acbf4 commit 7302f37
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 103 deletions.
2 changes: 1 addition & 1 deletion front/hooks/useFileUploaderService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ export function useFileUploaderService({
new FileBlobUploadError(
"file_type_not_supported",
file,
`File "${file.name}" is not supported.`
`File "${file.name}" is not supported (${contentType}).`
)
)
);
Expand Down
56 changes: 6 additions & 50 deletions front/lib/api/assistant/jit_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import type {
SupportedContentFragmentType,
} from "@dust-tt/types";
import {
assertNever,
isAgentMessageType,
isContentFragmentType,
isSupportedDelimitedTextContentType,
Expand All @@ -14,52 +13,21 @@ import {
/**
 * Returns whether the given content type is conversation-includable.
 *
 * Every content type is accepted except images. The decision relies solely on
 * `isSupportedImageContentType` instead of an exhaustive `switch` ending in
 * `assertNever`, so content types added to the supported set later do not
 * have to be enumerated here.
 *
 * @param contentType - Content type to check.
 * @returns `false` for image content types, `true` for everything else.
 */
function isConversationIncludableFileContentType(
  contentType: SupportedContentFragmentType
): boolean {
  // We allow including everything except images.
  return !isSupportedImageContentType(contentType);
}

/**
 * Returns whether the given content type is queryable.
 *
 * Images are rejected first; after that only delimited-text content types (as
 * reported by `isSupportedDelimitedTextContentType`) are accepted. No
 * exhaustive `switch` / `assertNever` is used, so content types added to the
 * supported set later simply fall through to `false`.
 *
 * @param contentType - Content type to check.
 * @returns `true` only for non-image, delimited-text content types.
 */
function isQueryableContentType(
  contentType: SupportedContentFragmentType
): boolean {
  // Keep the image guard first so the check order matches the original.
  if (isSupportedImageContentType(contentType)) {
    return false;
  }
  // For now we only allow querying tabular files.
  return isSupportedDelimitedTextContentType(contentType);
}

function isSearchableContentType(
Expand All @@ -71,20 +39,8 @@ function isSearchableContentType(
if (isSupportedDelimitedTextContentType(contentType)) {
return false;
}
// For now we only allow searching text files.
switch (contentType) {
case "application/msword":
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
case "application/pdf":
case "text/markdown":
case "text/plain":
case "dust-application/slack":
case "text/vnd.dust.attachment.slack.thread":
return true;

default:
assertNever(contentType);
}
// For now we allow searching everything else.
return true;
}

function isListableContentType(
Expand Down
49 changes: 29 additions & 20 deletions front/lib/api/files/upload.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import type {
import {
assertNever,
Err,
isSupportedDelimitedTextContentType,
isSupportedImageContentType,
isTextExtractionSupportedContentType,
Ok,
TextExtraction,
Expand Down Expand Up @@ -287,16 +289,26 @@ const getProcessingFunction = ({
contentType: SupportedFileContentType;
useCase: FileUseCase;
}): ProcessingFunction | undefined => {
switch (contentType) {
case "image/jpeg":
case "image/png":
if (useCase === "conversation") {
return resizeAndUploadToFileStorage;
} else if (useCase === "avatar") {
return uploadToPublicBucket;
}
break;
if (isSupportedImageContentType(contentType)) {
if (useCase === "conversation") {
return resizeAndUploadToFileStorage;
} else if (useCase === "avatar") {
return uploadToPublicBucket;
}
return undefined;
}

if (isSupportedDelimitedTextContentType(contentType)) {
if (useCase === "conversation" || useCase === "folder_table") {
// TODO(JIT): after JIT enablement, store raw text here too, the snippet is useless
return extractContentAndSchemaFromDelimitedTextFiles;
} else if (useCase === "folder_document" || useCase === "tool_output") {
return storeRawText;
}
return undefined;
}

switch (contentType) {
case "application/msword":
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
case "application/pdf":
Expand All @@ -306,6 +318,14 @@ const getProcessingFunction = ({
break;
case "text/plain":
case "text/markdown":
case "text/html":
case "text/xml":
case "text/calendar":
case "text/css":
case "text/javascript":
case "application/json":
case "application/xml":
case "application/x-sh":
if (
useCase === "conversation" ||
useCase === "folder_document" ||
Expand All @@ -319,17 +339,6 @@ const getProcessingFunction = ({
return storeRawText;
}
break;
case "text/comma-separated-values":
case "text/csv":
case "text/tab-separated-values":
case "text/tsv":
if (useCase === "conversation" || useCase === "folder_table") {
// TODO(JIT): after JIT enablement, store raw text here too, the snippet is useless
return extractContentAndSchemaFromDelimitedTextFiles;
} else if (useCase === "folder_document" || useCase === "tool_output") {
return storeRawText;
}
break;

default:
assertNever(contentType);
Expand Down
81 changes: 49 additions & 32 deletions front/lib/api/files/upsert.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import {
Err,
getSmallWhitelistedModel,
isSupportedDelimitedTextContentType,
isSupportedImageContentType,
Ok,
removeNulls,
slugify,
Expand Down Expand Up @@ -66,37 +67,45 @@ async function generateSnippet(
const startTime = Date.now();
const owner = auth.getNonNullableWorkspace();

switch (file.contentType) {
case "image/jpeg":
case "image/png":
return new Err(
new Error("Image files are not supported for file snippets.")
);
case "text/csv":
case "text/comma-separated-values":
case "text/tsv":
case "text/tab-separated-values":
const format =
file.contentType === "text/csv" ||
file.contentType === "text/comma-separated-values"
? "csv"
: "tsv";

// Parse only the headers from the CSV file
const headers = content.split("\n")[0];

let snippet = `${format.toUpperCase()} file with headers: ${headers}`;
if (snippet.length > 256) {
snippet = snippet.slice(0, 242) + "... (truncated)";
}
if (isSupportedImageContentType(file.contentType)) {
return new Err(
new Error("Image files are not supported for file snippets.")
);
}

return new Ok(snippet);
case "text/markdown":
case "text/plain":
case "text/vnd.dust.attachment.slack.thread":
if (isSupportedDelimitedTextContentType(file.contentType)) {
const format =
file.contentType === "text/csv" ||
file.contentType === "text/comma-separated-values"
? "csv"
: "tsv";

// Parse only the headers from the CSV file
const headers = content.split("\n")[0];

let snippet = `${format.toUpperCase()} file with headers: ${headers}`;
if (snippet.length > 256) {
snippet = snippet.slice(0, 242) + "... (truncated)";
}

return new Ok(snippet);
}

switch (file.contentType) {
case "application/msword":
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
case "application/pdf":
case "text/plain":
case "text/markdown":
case "text/html":
case "text/xml":
case "text/calendar":
case "text/css":
case "text/javascript":
case "application/json":
case "application/xml":
case "application/x-sh":
case "text/vnd.dust.attachment.slack.thread":
if (!ENABLE_LLM_SNIPPETS) {
// Take the first 256 characters
if (content.length > 256) {
Expand Down Expand Up @@ -214,6 +223,7 @@ async function generateSnippet(
assertNever(run);
}
break;

default:
assertNever(file.contentType);
}
Expand Down Expand Up @@ -311,6 +321,10 @@ const getProcessingFunction = ({
contentType: SupportedFileContentType;
useCase: FileUseCase;
}): ProcessingFunction | undefined => {
if (isSupportedImageContentType(contentType)) {
return undefined;
}

// Use isSupportedDelimitedTextContentType() everywhere to have a common source of truth
if (isSupportedDelimitedTextContentType(contentType)) {
if (
Expand All @@ -333,16 +347,19 @@ const getProcessingFunction = ({
case "text/markdown":
case "text/plain":
case "text/vnd.dust.attachment.slack.thread":
case "text/html":
case "text/xml":
case "text/calendar":
case "text/css":
case "text/javascript":
case "application/json":
case "application/xml":
case "application/x-sh":
if (useCase === "conversation" || useCase === "tool_output") {
return upsertDocumentToDatasource;
}
break;

case "image/jpeg":
case "image/png":
// We do nothing for images.
break;

default:
assertNever(contentType);
}
Expand Down
8 changes: 8 additions & 0 deletions sdks/js/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ export const supportedRawText = {
"text/tab-separated-values": [".tsv"],
"text/tsv": [".tsv"],
"text/vnd.dust.attachment.slack.thread": [".txt"],
"text/html": [".html", ".htm", ".xhtml", ".xhtml+xml"],
"text/xml": [".xml"],
"text/calendar": [".ics"],
"text/css": [".css"],
"text/javascript": [".js", ".mjs"],
"application/json": [".json"],
"application/xml": [".xml"],
"application/x-sh": [".sh"],
} as const;

// Supported content types for plain text (after processing).
Expand Down
8 changes: 8 additions & 0 deletions types/src/front/files.ts
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,14 @@ const supportedRawText = {
"text/markdown": [".md", ".markdown"],
"text/plain": [".txt"],
"text/vnd.dust.attachment.slack.thread": [".txt"],
"text/html": [".html", ".htm", ".xhtml", ".xhtml+xml"],
"text/xml": [".xml"],
"text/calendar": [".ics"],
"text/css": [".css"],
"text/javascript": [".js", ".mjs"],
"application/json": [".json"],
"application/xml": [".xml"],
"application/x-sh": [".sh"],
};

// Supported content types for plain text (after processing).
Expand Down

0 comments on commit 7302f37

Please sign in to comment.