Skip to content

Commit

Permalink
GDrive PDF: Inform current page as section prefix (#4479)
Browse files: browse the repository at this point in the history
  • Loading branch information
spolu authored Mar 27, 2024
1 parent e029ee3 commit a4e0397
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 20 deletions.
61 changes: 44 additions & 17 deletions connectors/src/connectors/google_drive/temporal/file.ts
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
import type { ModelId } from "@dust-tt/types";
import type { CoreAPIDataSourceDocumentSection, ModelId } from "@dust-tt/types";
import { uuid4 } from "@temporalio/workflow";
import fs from "fs/promises";
import type { OAuth2Client } from "googleapis-common";
Expand All @@ -20,6 +20,7 @@ import {
MAX_DOCUMENT_TXT_LEN,
MAX_LARGE_DOCUMENT_TXT_LEN,
renderDocumentTitleAndContent,
sectionLength,
upsertToDatasource,
} from "@connectors/lib/data_sources";
import { dpdf2text } from "@connectors/lib/dpdf2text";
Expand Down Expand Up @@ -52,7 +53,7 @@ export async function syncOneFile(
pdfEnabled: config?.pdfEnabled || false,
});
const documentId = getDocumentId(file.id);
let documentContent: string | undefined = undefined;
let documentContent: CoreAPIDataSourceDocumentSection | null = null;

const fileInDb = await GoogleDriveFiles.findOne({
where: {
Expand Down Expand Up @@ -116,7 +117,14 @@ export async function syncOneFile(
);
}
if (typeof res.data === "string") {
documentContent = res.data;
documentContent =
res.data && res.data.trim().length > 0
? {
prefix: null,
content: res.data.trim(),
sections: [],
}
: null;
} else if (
typeof res.data === "object" ||
typeof res.data === "number" ||
Expand All @@ -127,7 +135,14 @@ export async function syncOneFile(
// we need to convert it
// e.g. a google presentation with just the number
// 1 in it, the export will return the number 1 instead of a string
documentContent = res.data?.toString();
documentContent =
res.data && res.data.toString().trim().length > 0
? {
prefix: null,
content: res.data.toString().trim(),
sections: [],
}
: null;
} else {
logger.error(
{
Expand Down Expand Up @@ -209,7 +224,11 @@ export async function syncOneFile(
);
return false;
}
documentContent = Buffer.from(res.data).toString("utf-8");
documentContent = {
prefix: null,
content: Buffer.from(res.data).toString("utf-8").trim(),
sections: [],
};
} else {
logger.error(
{
Expand All @@ -231,13 +250,26 @@ export async function syncOneFile(
await fs.writeFile(pdf_path, Buffer.from(res.data), "binary");
}

const { content: pdfTextData } = await dpdf2text(pdf_path);
const { pages } = await dpdf2text(pdf_path);

documentContent =
pages.length > 0
? {
prefix: null,
content: null,
sections: pages.map((page, i) => ({
prefix: `$pdfPage: ${i + 1}/${pages.length}\n`,
content: page,
sections: [],
})),
}
: null;

documentContent = pdfTextData;
logger.info(
{
file_id: file.id,
mimeType: file.mimeType,
pagesCount: pages.length,
title: file.name,
},
`Successfully converted PDF to text`
Expand Down Expand Up @@ -286,17 +318,13 @@ export async function syncOneFile(
let upsertTimestampMs: number | undefined = undefined;
// We only upsert the document if it's not a google drive spreadsheet.
if (!isGoogleDriveSpreadSheetFile(file)) {
documentContent = documentContent?.trim();

const content = await renderDocumentTitleAndContent({
dataSourceConfig,
title: file.name,
updatedAt: file.updatedAtMs ? new Date(file.updatedAtMs) : undefined,
createdAt: file.createdAtMs ? new Date(file.createdAtMs) : undefined,
lastEditor: file.lastEditor ? file.lastEditor.displayName : undefined,
content: documentContent
? { prefix: null, content: documentContent, sections: [] }
: null,
content: documentContent,
});

if (documentContent === undefined) {
Expand Down Expand Up @@ -324,10 +352,9 @@ export async function syncOneFile(
}
tags.push(`mimeType:${file.mimeType}`);

if (
documentContent.length > 0 &&
documentContent.length <= maxDocumentLen
) {
const documentLen = documentContent ? sectionLength(documentContent) : 0;

if (documentLen > 0 && documentLen <= maxDocumentLen) {
const parents = (
await getFileParentsMemoized(
connectorId,
Expand Down Expand Up @@ -359,7 +386,7 @@ export async function syncOneFile(
{
documentId,
dataSourceConfig,
documentLen: documentContent.length,
documentLen: documentLen,
title: file.name,
},
`Document is empty or too big to be upserted. Skipping`
Expand Down
6 changes: 3 additions & 3 deletions connectors/src/lib/dpdf2text.ts
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,8 +2,8 @@ import { spawn } from "child_process";

export async function dpdf2text(
pdfPath: string
): Promise<{ pages: { [pageNumber: string]: string }; content: string }> {
const pages: { [pageNumber: number]: string } = {};
): Promise<{ pages: string[]; content: string }> {
const pages: string[] = [];
let content = "";

let currentPage: number | null = 1;
Expand Down Expand Up @@ -50,7 +50,7 @@ export async function dpdf2text(
if (pageText === null) {
currentPage = null;
} else {
pages[currentPage] = pageText;
pages.push(pageText);
// Pages are generally separated by `\f` (form feed), so we can just concatenate here.
content += pageText;
currentPage++;
Expand Down

0 comments on commit a4e0397

Please sign in to comment.