Skip to content

Commit

Permalink
Fix excessive PDF newlines (langchain-ai#3218)
Browse files Browse the repository at this point in the history
* fix: remove "\n" between words when using pdf-loader

* Add fix to web PDF loader, add tests

* Comment + log

* Update test

* Fix directory test

---------

Co-authored-by: ppxu <[email protected]>
  • Loading branch information
jacoblee93 and ppxu authored Nov 10, 2023
1 parent 85734d8 commit 65f68b7
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 10 deletions.
21 changes: 17 additions & 4 deletions langchain/src/document_loaders/fs/pdf.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import type { TextItem } from "pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js";
import { Document } from "../../document.js";
import { BufferLoader } from "./buffer.js";
import { formatDocumentsAsString } from "../../util/document.js";
Expand Down Expand Up @@ -61,9 +60,23 @@ export class PDFLoader extends BufferLoader {
continue;
}

const text = content.items
.map((item) => (item as TextItem).str)
.join("\n");
// Eliminate excessive newlines
// Source: https://github.com/albertcui/pdf-parse/blob/7086fc1cc9058545cdf41dd0646d6ae5832c7107/lib/pdf-parse.js#L16
//
// Rebuild the page text from its items: an item whose transform[5] matches
// the previous item's is appended as-is, while a change in that value
// prefixes the item with "\n" to start a new line.
// NOTE(review): transform[5] is presumably the vertical translation of the
// item's text matrix (hence `lastY`) — confirm against the pdf.js typings.
let lastY: number | undefined;
const textItems: string[] = [];
for (const item of content.items) {
  // Only entries that carry a `str` property contribute text.
  if ("str" in item) {
    // Compare against `undefined` explicitly: a truthiness check would treat
    // a legitimate coordinate of 0 as "no previous item" and silently drop
    // the line break that should follow it.
    if (lastY === undefined || lastY === item.transform[5]) {
      textItems.push(item.str);
    } else {
      textItems.push(`\n${item.str}`);
    }
    // eslint-disable-next-line prefer-destructuring
    lastY = item.transform[5];
  }
}

// Items are separated by a single space; line breaks come only from the
// "\n" prefixes added above.
const text = textItems.join(" ");

documents.push(
new Document({
Expand Down
3 changes: 2 additions & 1 deletion langchain/src/document_loaders/tests/directory.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,13 @@ test("Test Directory loader", async () => {
UnknownHandling.Ignore
);
const docs = await loader.load();
expect(docs.length).toBe(122);
expect(docs.length).toBe(123);
expect(docs.map((d) => d.metadata.source).sort()).toEqual([
// PDF
...Array.from({ length: 15 }, (_) =>
path.resolve(directoryPath, "1706.03762.pdf")
),
path.resolve(directoryPath, "Jacob_Lee_Resume_2023.pdf"),
// CSV
...Array.from({ length: 32 }, (_) =>
path.resolve(
Expand Down
Binary file not shown.
11 changes: 11 additions & 0 deletions langchain/src/document_loaders/tests/pdf.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,14 @@ test("Test PDF loader from file to single document", async () => {
expect(docs.length).toBe(1);
expect(docs[0].pageContent).toContain("Attention Is All You Need");
});

test("Test PDF loader should not create documents with excessive newlines", async () => {
  // Locate the fixture relative to this test module.
  const testDir = path.dirname(url.fileURLToPath(import.meta.url));
  const filePath = path.resolve(
    testDir,
    "./example_data/Jacob_Lee_Resume_2023.pdf"
  );

  // Load the whole PDF as a single document.
  const docs = await new PDFLoader(filePath, { splitPages: false }).load();
  expect(docs.length).toBe(1);

  // The extracted text must stay under 100 newline-separated segments.
  const lineCount = docs[0].pageContent.split("\n").length;
  expect(lineCount).toBeLessThan(100);
});
17 changes: 17 additions & 0 deletions langchain/src/document_loaders/tests/webpdf.int.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,20 @@ test("Test Web PDF loader with custom pdfjs", async () => {
}
`);
});

test("Test Web PDF loader lines", async () => {
  // Locate the fixture relative to this test module.
  const testDir = path.dirname(url.fileURLToPath(import.meta.url));
  const filePath = path.resolve(
    testDir,
    "./example_data/Jacob_Lee_Resume_2023.pdf"
  );

  // Wrap the raw file bytes in a Blob, which is what the web loader is given.
  const pdfBytes = await fs.readFile(filePath);
  const pdfBlob = new Blob([pdfBytes], { type: "application/pdf" });

  // Load the whole PDF as a single document.
  const loader = new WebPDFLoader(pdfBlob, { splitPages: false });
  const docs = await loader.load();
  expect(docs.length).toBe(1);

  // The extracted text must stay under 100 newline-separated segments.
  const lineCount = docs[0].pageContent.split("\n").length;
  expect(lineCount).toBeLessThan(100);
});
21 changes: 16 additions & 5 deletions langchain/src/document_loaders/web/pdf.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import { type TextItem } from "pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js";

import { Document } from "../../document.js";
import { BaseDocumentLoader } from "../base.js";
import { formatDocumentsAsString } from "../../util/document.js";
Expand Down Expand Up @@ -48,9 +46,22 @@ export class WebPDFLoader extends BaseDocumentLoader {
continue;
}

const text = content.items
.map((item) => (item as TextItem).str)
.join("\n");
// Eliminate excessive newlines
// Source: https://github.com/albertcui/pdf-parse/blob/7086fc1cc9058545cdf41dd0646d6ae5832c7107/lib/pdf-parse.js#L16
//
// Rebuild the page text from its items: an item whose transform[5] matches
// the previous item's is appended as-is, while a change in that value
// prefixes the item with "\n" to start a new line.
// NOTE(review): transform[5] is presumably the vertical translation of the
// item's text matrix (hence `lastY`) — confirm against the pdf.js typings.
let lastY: number | undefined;
const textItems: string[] = [];
for (const item of content.items) {
  // Only entries that carry a `str` property contribute text.
  if ("str" in item) {
    // Compare against `undefined` explicitly: a truthiness check would treat
    // a legitimate coordinate of 0 as "no previous item" and silently drop
    // the line break that should follow it.
    if (lastY === undefined || lastY === item.transform[5]) {
      textItems.push(item.str);
    } else {
      textItems.push(`\n${item.str}`);
    }
    // eslint-disable-next-line prefer-destructuring
    lastY = item.transform[5];
  }
}
// Items are separated by a single space; line breaks come only from the
// "\n" prefixes added above.
const text = textItems.join(" ");

documents.push(
new Document({
Expand Down

0 comments on commit 65f68b7

Please sign in to comment.