Skip to content

Commit

Permalink
GoogleDrive largeFiles flag (#4416)
Browse files Browse the repository at this point in the history
* GoogleDrive: Large Files flag

* allow larger files

* clean-up

* typing
  • Loading branch information
spolu authored Mar 24, 2024
1 parent 1b18f2e commit 20feefa
Show file tree
Hide file tree
Showing 8 changed files with 136 additions and 36 deletions.
1 change: 1 addition & 0 deletions connectors/migrations/20231109_2_create_gdrive_config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ async function main() {
const config = await GoogleDriveConfig.create({
connectorId: connector.id,
pdfEnabled: false,
largeFilesEnabled: false,
});
console.log(
`Created config for connector ${config.connectorId} with id ${config.id} and pdfEnabled ${config.pdfEnabled}`
Expand Down
29 changes: 22 additions & 7 deletions connectors/src/connectors/google_drive/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ export async function createGoogleDriveConnector(

const googleDriveConfigurationBlob = {
pdfEnabled: false,
largeFilesEnabled: false,
};

const connector = await ConnectorResource.makeNew(
Expand Down Expand Up @@ -769,15 +770,15 @@ export async function setGoogleDriveConfig(
new Error(`Google Drive config not found with connectorId ${connectorId}`)
);
}

if (!["true", "false"].includes(configValue)) {
return new Err(
new Error(`Invalid config value ${configValue}, must be true or false`)
);
}

switch (configKey) {
case "pdfEnabled": {
if (!["true", "false"].includes(configValue)) {
return new Err(
new Error(
`Invalid config value ${configValue}, must be true or false`
)
);
}
await config.update({
pdfEnabled: configValue === "true",
});
Expand All @@ -791,6 +792,20 @@ export async function setGoogleDriveConfig(
return new Ok(void 0);
}

case "largeFilesEnabled": {
await config.update({
largeFilesEnabled: configValue === "true",
});
const workflowRes = await launchGoogleDriveFullSyncWorkflow(
connectorId,
null
);
if (workflowRes.isErr()) {
return workflowRes;
}
return new Ok(void 0);
}

default: {
return new Err(new Error(`Invalid config key ${configKey}`));
}
Expand Down
23 changes: 19 additions & 4 deletions connectors/src/connectors/google_drive/temporal/activities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import { getGoogleDriveObject } from "@connectors/connectors/google_drive/lib/go
import { getFileParentsMemoized } from "@connectors/connectors/google_drive/lib/hierarchy";
import { syncOneFile } from "@connectors/connectors/google_drive/temporal/file";
import {
getMimesTypeToSync,
getMimeTypesToSync,
isGoogleDriveSpreadSheetFile,
} from "@connectors/connectors/google_drive/temporal/mime_types";
import { deleteSpreadsheet } from "@connectors/connectors/google_drive/temporal/spreadsheets";
Expand All @@ -28,6 +28,7 @@ import { deleteFromDataSource } from "@connectors/lib/data_sources";
import { HTTPError } from "@connectors/lib/error";
import { ExternalOauthTokenError } from "@connectors/lib/error";
import {
GoogleDriveConfig,
GoogleDriveFiles,
GoogleDriveFolders,
GoogleDriveSyncToken,
Expand Down Expand Up @@ -114,7 +115,14 @@ export async function syncFiles(
if (!connector) {
throw new Error(`Connector ${connectorId} not found`);
}
const mimeTypeToSync = await getMimesTypeToSync(connectorId);
const config = await GoogleDriveConfig.findOne({
where: {
connectorId: connectorId,
},
});
const mimeTypesToSync = getMimeTypesToSync({
pdfEnabled: config?.pdfEnabled || false,
});
const authCredentials = await getAuthObject(connector.connectionId);
const driveFolder = await getGoogleDriveObject(
authCredentials,
Expand Down Expand Up @@ -149,7 +157,7 @@ export async function syncFiles(
}

const drive = await getDriveClient(authCredentials);
const mimeTypesSearchString = mimeTypeToSync
const mimeTypesSearchString = mimeTypesToSync
.map((mimeType) => `mimeType='${mimeType}'`)
.join(" or ");

Expand Down Expand Up @@ -255,7 +263,14 @@ export async function incrementalSync(
if (!nextPageToken) {
nextPageToken = await getSyncPageToken(connectorId, driveId, sharedDrive);
}
const mimeTypesToSync = await getMimesTypeToSync(connectorId);
const config = await GoogleDriveConfig.findOne({
where: {
connectorId: connectorId,
},
});
const mimeTypesToSync = getMimeTypesToSync({
pdfEnabled: config?.pdfEnabled || false,
});

const selectedFoldersIds = await getFoldersToSync(connectorId);

Expand Down
23 changes: 19 additions & 4 deletions connectors/src/connectors/google_drive/temporal/file.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,15 @@ import {
} from "@connectors/connectors/google_drive/temporal/utils";
import {
MAX_DOCUMENT_TXT_LEN,
MAX_LARGE_DOCUMENT_TXT_LEN,
renderDocumentTitleAndContent,
upsertToDatasource,
} from "@connectors/lib/data_sources";
import { dpdf2text } from "@connectors/lib/dpdf2text";
import { GoogleDriveFiles } from "@connectors/lib/models/google_drive";
import {
GoogleDriveConfig,
GoogleDriveFiles,
} from "@connectors/lib/models/google_drive";
import logger from "@connectors/logger/logger";
import type { DataSourceConfig } from "@connectors/types/data_source_config";
import type { GoogleDriveObjectType } from "@connectors/types/google_drive";
Expand All @@ -35,7 +39,18 @@ export async function syncOneFile(
startSyncTs: number,
isBatchSync = false
): Promise<boolean> {
const mimeTypesToDownload = await getMimeTypesToDownload(connectorId);
const config = await GoogleDriveConfig.findOne({
where: {
connectorId: connectorId,
},
});
const maxDocumentLen = config?.largeFilesEnabled
? MAX_LARGE_DOCUMENT_TXT_LEN
: MAX_DOCUMENT_TXT_LEN;

const mimeTypesToDownload = getMimeTypesToDownload({
pdfEnabled: config?.pdfEnabled || false,
});
const documentId = getDocumentId(file.id);
let documentContent: string | undefined = undefined;

Expand Down Expand Up @@ -183,7 +198,7 @@ export async function syncOneFile(
// converted to utf-8 it will overcome the limit enforced below. This
// avoids operations on very long text files, that can cause
// Buffer.toString to crash if the file is > 500MB
if (res.data.byteLength > 4 * MAX_DOCUMENT_TXT_LEN) {
if (res.data.byteLength > 4 * maxDocumentLen) {
logger.info(
{
file_id: file.id,
Expand Down Expand Up @@ -311,7 +326,7 @@ export async function syncOneFile(

if (
documentContent.length > 0 &&
documentContent.length <= MAX_DOCUMENT_TXT_LEN
documentContent.length <= maxDocumentLen
) {
const parents = (
await getFileParentsMemoized(
Expand Down
22 changes: 10 additions & 12 deletions connectors/src/connectors/google_drive/temporal/mime_types.ts
Original file line number Diff line number Diff line change
@@ -1,29 +1,27 @@
import type { ModelId } from "@dust-tt/types";

import type { GoogleDriveFiles } from "@connectors/lib/models/google_drive";
import { GoogleDriveConfig } from "@connectors/lib/models/google_drive";

// Maps Google-native MIME types (Google Docs documents and presentations) to the
// MIME type associated with them for export; both map to plain text.
// The keys of this map are also appended to the list of MIME types to sync.
export const MIME_TYPES_TO_EXPORT: { [key: string]: string } = {
"application/vnd.google-apps.document": "text/plain",
"application/vnd.google-apps.presentation": "text/plain",
};

/**
 * Returns the list of MIME types to download.
 *
 * Plain text is always included; PDF is added only when `pdfEnabled` is true.
 * The function is synchronous and performs no database lookup: the caller is
 * responsible for reading the connector configuration and passing the flag in.
 * A fresh array is returned on every call, so callers may safely push onto it.
 *
 * @param pdfEnabled - whether PDF files should be included.
 * @returns the MIME types to download.
 */
export function getMimeTypesToDownload({
  pdfEnabled,
}: {
  pdfEnabled: boolean;
}): string[] {
  const mimeTypes = ["text/plain"];
  if (pdfEnabled) {
    mimeTypes.push("application/pdf");
  }

  return mimeTypes;
}

export async function getMimesTypeToSync(connectorId: ModelId) {
const mimeTypes = await getMimeTypesToDownload(connectorId);
export function getMimeTypesToSync({ pdfEnabled }: { pdfEnabled: boolean }) {
const mimeTypes = getMimeTypesToDownload({
pdfEnabled,
});
mimeTypes.push(...Object.keys(MIME_TYPES_TO_EXPORT));
mimeTypes.push("application/vnd.google-apps.folder");
mimeTypes.push("application/vnd.google-apps.spreadsheet");
Expand Down
2 changes: 2 additions & 0 deletions connectors/src/lib/data_sources.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ if (!DUST_FRONT_API) {
// We limit the document size we support. Beyond a certain size, upsert is simply too slow (>300s)
// and large files are generally less useful anyway. The limit is a text length: it is compared
// against the `.length` of the extracted document content string.
export const MAX_DOCUMENT_TXT_LEN = 750000;
// For some data sources we allow large documents to be processed (behind flag). The limit is
// 5,000,000 characters, i.e. about 5 MB of plain ASCII text.
export const MAX_LARGE_DOCUMENT_TXT_LEN = 5000000;

type UpsertContext = {
sync_type: "batch" | "incremental";
Expand Down
6 changes: 6 additions & 0 deletions connectors/src/lib/models/google_drive.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ export class GoogleDriveConfig extends Model<
declare updatedAt: CreationOptional<Date>;
declare connectorId: ForeignKey<ConnectorModel["id"]>;
declare pdfEnabled: boolean;
declare largeFilesEnabled: boolean;
}
GoogleDriveConfig.init(
{
Expand Down Expand Up @@ -45,6 +46,11 @@ GoogleDriveConfig.init(
allowNull: false,
defaultValue: false,
},
largeFilesEnabled: {
type: DataTypes.BOOLEAN,
allowNull: false,
defaultValue: false,
},
},
{
sequelize: sequelizeConnection,
Expand Down
66 changes: 57 additions & 9 deletions front/pages/poke/[wId]/data_sources/[name]/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ export const getServerSideProps = withSuperUserAuthRequirements<{
features: {
slackBotEnabled: boolean;
googleDrivePdfEnabled: boolean;
googleDriveLargeFilesEnabled: boolean;
githubCodeSyncEnabled: boolean;
};
temporalWorkspace: string;
Expand Down Expand Up @@ -102,10 +103,12 @@ export const getServerSideProps = withSuperUserAuthRequirements<{
const features: {
slackBotEnabled: boolean;
googleDrivePdfEnabled: boolean;
googleDriveLargeFilesEnabled: boolean;
githubCodeSyncEnabled: boolean;
} = {
slackBotEnabled: false,
googleDrivePdfEnabled: false,
googleDriveLargeFilesEnabled: false,
githubCodeSyncEnabled: false,
};

Expand All @@ -132,6 +135,17 @@ export const getServerSideProps = withSuperUserAuthRequirements<{
}
features.googleDrivePdfEnabled =
gdrivePDFEnabledRes.value.configValue === "true";

// Read the connector's "largeFilesEnabled" config and expose it as its own
// feature flag, separate from the PDF flag computed just above.
const gdriveLargeFilesEnabledRes =
  await connectorsAPI.getConnectorConfig(
    dataSource.connectorId,
    "largeFilesEnabled"
  );
if (gdriveLargeFilesEnabledRes.isErr()) {
  throw gdriveLargeFilesEnabledRes.error;
}
// Must be stored in googleDriveLargeFilesEnabled: writing it to
// googleDrivePdfEnabled would overwrite the PDF flag with the large-files
// value and leave the Large Files toggle stuck at its default (false).
features.googleDriveLargeFilesEnabled =
  gdriveLargeFilesEnabledRes.value.configValue === "true";
break;
case "github":
const githubConnectorEnabledRes =
Expand Down Expand Up @@ -243,12 +257,37 @@ const DataSourcePage = ({
}
);
if (!r.ok) {
throw new Error("Failed to toggle Gdrive PDF sync.");
throw new Error("Failed to toggle Google Drive PDF sync.");
}
router.reload();
} catch (e) {
console.error(e);
window.alert("Failed to toggle Google Drive PDF sync.");
}
});

// Flips the Google Drive "largeFilesEnabled" connector config by POSTing the
// negated current value to the poke config endpoint, then reloads the page so
// the server-side props pick up the new state. On any failure the error is
// logged and a single alert naming the Large Files toggle is shown.
const { submit: onGdriveLargeFilesToggle } = useSubmitFunction(async () => {
  try {
    const r = await fetch(
      `/api/poke/workspaces/${owner.sId}/data_sources/managed-google_drive/config`,
      {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          configKey: "largeFilesEnabled",
          // Sent as a string ("true" / "false"), not as a JSON boolean.
          configValue: `${!features.googleDriveLargeFilesEnabled}`,
        }),
      }
    );
    if (!r.ok) {
      throw new Error("Failed to toggle Google Drive Large Files sync.");
    }
    router.reload();
  } catch (e) {
    console.error(e);
    window.alert("Failed to toggle Google Drive Large Files sync.");
  }
});

Expand Down Expand Up @@ -370,13 +409,22 @@ const DataSourcePage = ({
/>
)}
{dataSource.connectorProvider === "google_drive" && (
<div className="mb-2 flex w-64 items-center justify-between rounded-md border px-2 py-2 text-sm text-gray-600">
<div>PDF syncing enabled?</div>
<SliderToggle
selected={features.googleDrivePdfEnabled}
onClick={onGdrivePDFToggle}
/>
</div>
<>
<div className="mb-2 flex w-64 items-center justify-between rounded-md border px-2 py-2 text-sm text-gray-600">
<div>PDF syncing enabled?</div>
<SliderToggle
selected={features.googleDrivePdfEnabled}
onClick={onGdrivePDFToggle}
/>
</div>
<div className="mb-2 flex w-64 items-center justify-between rounded-md border px-2 py-2 text-sm text-gray-600">
<div>Large Files enabled?</div>
<SliderToggle
selected={features.googleDriveLargeFilesEnabled}
onClick={onGdriveLargeFilesToggle}
/>
</div>
</>
)}
{dataSource.connectorProvider === "github" && (
<div className="mb-2 flex w-64 items-center justify-between rounded-md border px-2 py-2 text-sm text-gray-600">
Expand Down

0 comments on commit 20feefa

Please sign in to comment.