diff --git a/connectors/migrations/20231109_2_create_gdrive_config.ts b/connectors/migrations/20231109_2_create_gdrive_config.ts index cc874b120fc7..f502be5f4cfe 100644 --- a/connectors/migrations/20231109_2_create_gdrive_config.ts +++ b/connectors/migrations/20231109_2_create_gdrive_config.ts @@ -12,6 +12,7 @@ async function main() { const config = await GoogleDriveConfig.create({ connectorId: connector.id, pdfEnabled: false, + largeFilesEnabled: false, }); console.log( `Created config for connector ${config.connectorId} with id ${config.id} and pdfEnabled ${config.pdfEnabled}` diff --git a/connectors/src/connectors/google_drive/index.ts b/connectors/src/connectors/google_drive/index.ts index fedcd149b7cc..ee4ccf1ae30d 100644 --- a/connectors/src/connectors/google_drive/index.ts +++ b/connectors/src/connectors/google_drive/index.ts @@ -117,6 +117,7 @@ export async function createGoogleDriveConnector( const googleDriveConfigurationBlob = { pdfEnabled: false, + largeFilesEnabled: false, }; const connector = await ConnectorResource.makeNew( @@ -769,15 +770,15 @@ export async function setGoogleDriveConfig( new Error(`Google Drive config not found with connectorId ${connectorId}`) ); } + + if (!["true", "false"].includes(configValue)) { + return new Err( + new Error(`Invalid config value ${configValue}, must be true or false`) + ); + } + switch (configKey) { case "pdfEnabled": { - if (!["true", "false"].includes(configValue)) { - return new Err( - new Error( - `Invalid config value ${configValue}, must be true or false` - ) - ); - } await config.update({ pdfEnabled: configValue === "true", }); @@ -791,6 +792,20 @@ export async function setGoogleDriveConfig( return new Ok(void 0); } + case "largeFilesEnabled": { + await config.update({ + largeFilesEnabled: configValue === "true", + }); + const workflowRes = await launchGoogleDriveFullSyncWorkflow( + connectorId, + null + ); + if (workflowRes.isErr()) { + return workflowRes; + } + return new Ok(void 0); + } + default: { 
return new Err(new Error(`Invalid config key ${configKey}`)); } diff --git a/connectors/src/connectors/google_drive/temporal/activities.ts b/connectors/src/connectors/google_drive/temporal/activities.ts index 1a0d4f32705f..78e6b64d467d 100644 --- a/connectors/src/connectors/google_drive/temporal/activities.ts +++ b/connectors/src/connectors/google_drive/temporal/activities.ts @@ -13,7 +13,7 @@ import { getGoogleDriveObject } from "@connectors/connectors/google_drive/lib/go import { getFileParentsMemoized } from "@connectors/connectors/google_drive/lib/hierarchy"; import { syncOneFile } from "@connectors/connectors/google_drive/temporal/file"; import { - getMimesTypeToSync, + getMimeTypesToSync, isGoogleDriveSpreadSheetFile, } from "@connectors/connectors/google_drive/temporal/mime_types"; import { deleteSpreadsheet } from "@connectors/connectors/google_drive/temporal/spreadsheets"; @@ -28,6 +28,7 @@ import { deleteFromDataSource } from "@connectors/lib/data_sources"; import { HTTPError } from "@connectors/lib/error"; import { ExternalOauthTokenError } from "@connectors/lib/error"; import { + GoogleDriveConfig, GoogleDriveFiles, GoogleDriveFolders, GoogleDriveSyncToken, @@ -114,7 +115,14 @@ export async function syncFiles( if (!connector) { throw new Error(`Connector ${connectorId} not found`); } - const mimeTypeToSync = await getMimesTypeToSync(connectorId); + const config = await GoogleDriveConfig.findOne({ + where: { + connectorId: connectorId, + }, + }); + const mimeTypesToSync = getMimeTypesToSync({ + pdfEnabled: config?.pdfEnabled || false, + }); const authCredentials = await getAuthObject(connector.connectionId); const driveFolder = await getGoogleDriveObject( authCredentials, @@ -149,7 +157,7 @@ export async function syncFiles( } const drive = await getDriveClient(authCredentials); - const mimeTypesSearchString = mimeTypeToSync + const mimeTypesSearchString = mimeTypesToSync .map((mimeType) => `mimeType='${mimeType}'`) .join(" or "); @@ -255,7 +263,14 @@ 
export async function incrementalSync( if (!nextPageToken) { nextPageToken = await getSyncPageToken(connectorId, driveId, sharedDrive); } - const mimeTypesToSync = await getMimesTypeToSync(connectorId); + const config = await GoogleDriveConfig.findOne({ + where: { + connectorId: connectorId, + }, + }); + const mimeTypesToSync = getMimeTypesToSync({ + pdfEnabled: config?.pdfEnabled || false, + }); const selectedFoldersIds = await getFoldersToSync(connectorId); diff --git a/connectors/src/connectors/google_drive/temporal/file.ts b/connectors/src/connectors/google_drive/temporal/file.ts index 0a091aef36e9..771e6c00625e 100644 --- a/connectors/src/connectors/google_drive/temporal/file.ts +++ b/connectors/src/connectors/google_drive/temporal/file.ts @@ -18,11 +18,15 @@ import { } from "@connectors/connectors/google_drive/temporal/utils"; import { MAX_DOCUMENT_TXT_LEN, + MAX_LARGE_DOCUMENT_TXT_LEN, renderDocumentTitleAndContent, upsertToDatasource, } from "@connectors/lib/data_sources"; import { dpdf2text } from "@connectors/lib/dpdf2text"; -import { GoogleDriveFiles } from "@connectors/lib/models/google_drive"; +import { + GoogleDriveConfig, + GoogleDriveFiles, +} from "@connectors/lib/models/google_drive"; import logger from "@connectors/logger/logger"; import type { DataSourceConfig } from "@connectors/types/data_source_config"; import type { GoogleDriveObjectType } from "@connectors/types/google_drive"; @@ -35,7 +39,18 @@ export async function syncOneFile( startSyncTs: number, isBatchSync = false ): Promise { - const mimeTypesToDownload = await getMimeTypesToDownload(connectorId); + const config = await GoogleDriveConfig.findOne({ + where: { + connectorId: connectorId, + }, + }); + const maxDocumentLen = config?.largeFilesEnabled + ? 
MAX_LARGE_DOCUMENT_TXT_LEN + : MAX_DOCUMENT_TXT_LEN; + + const mimeTypesToDownload = getMimeTypesToDownload({ + pdfEnabled: config?.pdfEnabled || false, + }); const documentId = getDocumentId(file.id); let documentContent: string | undefined = undefined; @@ -183,7 +198,7 @@ export async function syncOneFile( // converted to utf-8 it will overcome the limit enforced below. This // avoids operations on very long text files, that can cause // Buffer.toString to crash if the file is > 500MB - if (res.data.byteLength > 4 * MAX_DOCUMENT_TXT_LEN) { + if (res.data.byteLength > 4 * maxDocumentLen) { logger.info( { file_id: file.id, @@ -311,7 +326,7 @@ export async function syncOneFile( if ( documentContent.length > 0 && - documentContent.length <= MAX_DOCUMENT_TXT_LEN + documentContent.length <= maxDocumentLen ) { const parents = ( await getFileParentsMemoized( diff --git a/connectors/src/connectors/google_drive/temporal/mime_types.ts b/connectors/src/connectors/google_drive/temporal/mime_types.ts index f2ba22c18c54..4040dcaa2855 100644 --- a/connectors/src/connectors/google_drive/temporal/mime_types.ts +++ b/connectors/src/connectors/google_drive/temporal/mime_types.ts @@ -1,29 +1,27 @@ -import type { ModelId } from "@dust-tt/types"; - import type { GoogleDriveFiles } from "@connectors/lib/models/google_drive"; -import { GoogleDriveConfig } from "@connectors/lib/models/google_drive"; export const MIME_TYPES_TO_EXPORT: { [key: string]: string } = { "application/vnd.google-apps.document": "text/plain", "application/vnd.google-apps.presentation": "text/plain", }; -export async function getMimeTypesToDownload(connectorId: ModelId) { +export function getMimeTypesToDownload({ + pdfEnabled, +}: { + pdfEnabled: boolean; +}) { const mimeTypes = ["text/plain"]; - const config = await GoogleDriveConfig.findOne({ - where: { - connectorId: connectorId, - }, - }); - if (config?.pdfEnabled) { + if (pdfEnabled) { mimeTypes.push("application/pdf"); } return mimeTypes; } -export async 
function getMimesTypeToSync(connectorId: ModelId) { - const mimeTypes = await getMimeTypesToDownload(connectorId); +export function getMimeTypesToSync({ pdfEnabled }: { pdfEnabled: boolean }) { + const mimeTypes = getMimeTypesToDownload({ + pdfEnabled, + }); mimeTypes.push(...Object.keys(MIME_TYPES_TO_EXPORT)); mimeTypes.push("application/vnd.google-apps.folder"); mimeTypes.push("application/vnd.google-apps.spreadsheet"); diff --git a/connectors/src/lib/data_sources.ts b/connectors/src/lib/data_sources.ts index 0c7828e023d6..889ff6c2c719 100644 --- a/connectors/src/lib/data_sources.ts +++ b/connectors/src/lib/data_sources.ts @@ -29,6 +29,8 @@ if (!DUST_FRONT_API) { // We limit the document size we support. Beyond a certain size, upsert is simply too slow (>300s) // and large files are generally less useful anyway. export const MAX_DOCUMENT_TXT_LEN = 750000; +// For some data sources we allow large documents (5mb) to be processed (behind flag). +export const MAX_LARGE_DOCUMENT_TXT_LEN = 5000000; type UpsertContext = { sync_type: "batch" | "incremental"; diff --git a/connectors/src/lib/models/google_drive.ts b/connectors/src/lib/models/google_drive.ts index a526d3d5fabd..0649f1890c32 100644 --- a/connectors/src/lib/models/google_drive.ts +++ b/connectors/src/lib/models/google_drive.ts @@ -18,6 +18,7 @@ export class GoogleDriveConfig extends Model< declare updatedAt: CreationOptional; declare connectorId: ForeignKey; declare pdfEnabled: boolean; + declare largeFilesEnabled: boolean; } GoogleDriveConfig.init( { @@ -45,6 +46,11 @@ GoogleDriveConfig.init( allowNull: false, defaultValue: false, }, + largeFilesEnabled: { + type: DataTypes.BOOLEAN, + allowNull: false, + defaultValue: false, + }, }, { sequelize: sequelizeConnection, diff --git a/front/pages/poke/[wId]/data_sources/[name]/index.tsx b/front/pages/poke/[wId]/data_sources/[name]/index.tsx index 31cfe442cab9..ba4e4cb91c9b 100644 --- a/front/pages/poke/[wId]/data_sources/[name]/index.tsx +++ 
b/front/pages/poke/[wId]/data_sources/[name]/index.tsx @@ -48,6 +48,7 @@ export const getServerSideProps = withSuperUserAuthRequirements<{ features: { slackBotEnabled: boolean; googleDrivePdfEnabled: boolean; + googleDriveLargeFilesEnabled: boolean; githubCodeSyncEnabled: boolean; }; temporalWorkspace: string; @@ -102,10 +103,12 @@ const features: { slackBotEnabled: boolean; googleDrivePdfEnabled: boolean; + googleDriveLargeFilesEnabled: boolean; githubCodeSyncEnabled: boolean; } = { slackBotEnabled: false, googleDrivePdfEnabled: false, + googleDriveLargeFilesEnabled: false, githubCodeSyncEnabled: false, }; @@ -132,6 +135,17 @@ } features.googleDrivePdfEnabled = gdrivePDFEnabledRes.value.configValue === "true"; + + const gdriveLargeFilesEnabledRes = + await connectorsAPI.getConnectorConfig( + dataSource.connectorId, + "largeFilesEnabled" + ); + if (gdriveLargeFilesEnabledRes.isErr()) { + throw gdriveLargeFilesEnabledRes.error; + } + features.googleDriveLargeFilesEnabled = + gdriveLargeFilesEnabledRes.value.configValue === "true"; break; case "github": const githubConnectorEnabledRes = @@ -243,12 +257,37 @@ const DataSourcePage = ({ } ); if (!r.ok) { - throw new Error("Failed to toggle Gdrive PDF sync."); + throw new Error("Failed to toggle Google Drive PDF sync."); + } + router.reload(); + } catch (e) { + console.error(e); + window.alert("Failed to toggle Google Drive PDF sync."); + } + }); + + const { submit: onGdriveLargeFilesToggle } = useSubmitFunction(async () => { + try { + const r = await fetch( + `/api/poke/workspaces/${owner.sId}/data_sources/managed-google_drive/config`, + { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + configKey: "largeFilesEnabled", + configValue: `${!features.googleDriveLargeFilesEnabled}`, + }), + } + ); + if (!r.ok) { + throw new Error("Failed to toggle 
Google Drive Large Files sync."); } router.reload(); } catch (e) { console.error(e); - window.alert("Failed to toggle Gdrive PDF sync."); + window.alert("Failed to toggle Google Drive Large Files sync."); } }); @@ -370,13 +409,22 @@ const DataSourcePage = ({ /> )} {dataSource.connectorProvider === "google_drive" && ( -
-
PDF syncing enabled?
- -
+ <> +
+
PDF syncing enabled?
+ +
+
+
Large Files enabled?
+ +
+ )} {dataSource.connectorProvider === "github" && (