diff --git a/langchain/src/document_loaders/fs/unstructured.ts b/langchain/src/document_loaders/fs/unstructured.ts index edb60008aca1..e3b62a9cfbc3 100644 --- a/langchain/src/document_loaders/fs/unstructured.ts +++ b/langchain/src/document_loaders/fs/unstructured.ts @@ -126,6 +126,11 @@ type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & { unknown?: UnknownHandling; }; +type UnstructuredMemoryLoaderOptions = { + buffer: Buffer; + fileName: string; +}; + /** * @deprecated - Import from "@langchain/community/document_loaders/fs/unstructured" instead. This entrypoint will be removed in 0.3.0. * @@ -139,6 +144,10 @@ type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & { export class UnstructuredLoader extends BaseDocumentLoader { public filePath: string; + private buffer?: Buffer; + + private fileName?: string; + private apiUrl = "https://api.unstructured.io/general/v0/general"; private apiKey?: string; @@ -175,7 +184,9 @@ export class UnstructuredLoader extends BaseDocumentLoader { private maxCharacters?: number; constructor( - filePathOrLegacyApiUrl: string, + filePathOrLegacyApiUrlOrMemoryBuffer: + | string + | UnstructuredMemoryLoaderOptions, optionsOrLegacyFilePath: UnstructuredLoaderOptions | string = {} ) { super(); @@ -183,11 +194,20 @@ export class UnstructuredLoader extends BaseDocumentLoader { // Temporary shim to avoid breaking existing users // Remove when API keys are enforced by Unstructured and existing code will break anyway const isLegacySyntax = typeof optionsOrLegacyFilePath === "string"; - if (isLegacySyntax) { + const isMemorySyntax = + typeof filePathOrLegacyApiUrlOrMemoryBuffer === "object"; + + if (isMemorySyntax) { + this.buffer = filePathOrLegacyApiUrlOrMemoryBuffer.buffer; + this.fileName = filePathOrLegacyApiUrlOrMemoryBuffer.fileName; + } else if (isLegacySyntax) { this.filePath = optionsOrLegacyFilePath; - this.apiUrl = filePathOrLegacyApiUrl; + this.apiUrl = filePathOrLegacyApiUrlOrMemoryBuffer; } else { - this.filePath = filePathOrLegacyApiUrl; + this.filePath = filePathOrLegacyApiUrlOrMemoryBuffer; + } + + if (!isLegacySyntax) { const options = optionsOrLegacyFilePath; this.apiKey = options.apiKey; this.apiUrl = options.apiUrl ?? this.apiUrl; @@ -209,14 +229,20 @@ export class UnstructuredLoader extends BaseDocumentLoader { } async _partition() { - const { readFile, basename } = await this.imports(); + let { buffer } = this; + let { fileName } = this; + + if (!buffer) { + const { readFile, basename } = await this.imports(); - const buffer = await readFile(this.filePath); - const fileName = basename(this.filePath); + buffer = await readFile(this.filePath); + fileName = basename(this.filePath); + + // I'm aware this reads the file into memory first, but we have lots of work + // to do on then consuming Documents in a streaming fashion anyway, so not + // worried about this for now. + } - // I'm aware this reads the file into memory first, but we have lots of work - // to do on then consuming Documents in a streaming fashion anyway, so not - // worried about this for now. const formData = new FormData(); formData.append("files", new Blob([buffer]), fileName); formData.append("strategy", this.strategy); diff --git a/langchain/src/document_loaders/tests/unstructured.int.test.ts b/langchain/src/document_loaders/tests/unstructured.int.test.ts index e30913e10a2d..b0b0712118a6 100644 --- a/langchain/src/document_loaders/tests/unstructured.int.test.ts +++ b/langchain/src/document_loaders/tests/unstructured.int.test.ts @@ -3,6 +3,7 @@ import * as url from "node:url"; import * as path from "node:path"; +import { readFile } from "node:fs/promises"; import { test, expect } from "@jest/globals"; import { UnstructuredDirectoryLoader, @@ -29,6 +30,34 @@ test.skip("Test Unstructured base loader", async () => { } }); +test.skip("Test Unstructured base loader with buffer", async () => { + const filePath = path.resolve( + path.dirname(url.fileURLToPath(import.meta.url)), + "./example_data/example.txt" + ); + + const options = { + apiKey: process.env.UNSTRUCTURED_API_KEY!, + }; + + const buffer = await readFile(filePath); + const fileName = "example.txt"; + + const loader = new UnstructuredLoader( + { + buffer, + fileName, + }, + options + ); + const docs = await loader.load(); + + expect(docs.length).toBe(3); + for (const doc of docs) { + expect(typeof doc.pageContent).toBe("string"); + } +}); + test.skip("Test Unstructured base loader with fast strategy", async () => { const filePath = path.resolve( path.dirname(url.fileURLToPath(import.meta.url)), diff --git a/libs/langchain-community/.eslintrc.cjs b/libs/langchain-community/.eslintrc.cjs index 459826a03a17..fea6aa3b0a6c 100644 --- a/libs/langchain-community/.eslintrc.cjs +++ b/libs/langchain-community/.eslintrc.cjs @@ -64,6 +64,7 @@ module.exports = { "prefer-rest-params": 0, "new-cap": ["error", { properties: false, capIsNew: false }], "arrow-body-style": 0, + "prefer-destructuring": 0 }, overrides: [ { diff --git a/libs/langchain-community/src/document_loaders/fs/unstructured.ts b/libs/langchain-community/src/document_loaders/fs/unstructured.ts index f9040b11110a..62e7053f42b2 100644 --- a/libs/langchain-community/src/document_loaders/fs/unstructured.ts +++ b/libs/langchain-community/src/document_loaders/fs/unstructured.ts @@ -10,7 +10,7 @@ import { } from "langchain/document_loaders/fs/directory"; import { BaseDocumentLoader } from "@langchain/core/document_loaders/base"; -const UNSTRUCTURED_API_FILETYPES = [ +export const UNSTRUCTURED_API_FILETYPES = [ ".txt", ".text", ".pdf", @@ -94,7 +94,7 @@ export type SkipInferTableTypes = /** * Set the chunking_strategy to chunk text into larger or smaller elements. Defaults to None with optional arg of by_title */ -type ChunkingStrategy = "None" | "by_title"; +export type ChunkingStrategy = "None" | "by_title"; export type UnstructuredLoaderOptions = { apiKey?: string; @@ -115,11 +115,16 @@ export type UnstructuredLoaderOptions = { maxCharacters?: number; }; -type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & { +export type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & { recursive?: boolean; unknown?: UnknownHandling; }; +export type UnstructuredMemoryLoaderOptions = { + buffer: Buffer; + fileName: string; +}; + /** * A document loader that uses the Unstructured API to load unstructured * documents. It supports both the new syntax with options object and the @@ -127,10 +132,17 @@ type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & { * partitioning request to the Unstructured API and retrieves the * partitioned elements. It creates a Document instance for each element * and returns an array of Document instances. + * + * It accepts either a filepath or an object containing a buffer and a filename + * as input. */ export class UnstructuredLoader extends BaseDocumentLoader { public filePath: string; + private buffer?: Buffer; + + private fileName?: string; + private apiUrl = "https://api.unstructured.io/general/v0/general"; private apiKey?: string; @@ -167,20 +179,28 @@ export class UnstructuredLoader extends BaseDocumentLoader { private maxCharacters?: number; constructor( - filePathOrLegacyApiUrl: string, - optionsOrLegacyFilePath: UnstructuredLoaderOptions | string = {} + filepathOrBufferOptions: string | UnstructuredMemoryLoaderOptions, + unstructuredOptions: UnstructuredLoaderOptions | string = {} ) { super(); // Temporary shim to avoid breaking existing users // Remove when API keys are enforced by Unstructured and existing code will break anyway - const isLegacySyntax = typeof optionsOrLegacyFilePath === "string"; - if (isLegacySyntax) { - this.filePath = optionsOrLegacyFilePath; - this.apiUrl = filePathOrLegacyApiUrl; + const isLegacySyntax = typeof unstructuredOptions === "string"; + const isMemorySyntax = typeof filepathOrBufferOptions === "object"; + + if (isMemorySyntax) { + this.buffer = filepathOrBufferOptions.buffer; + this.fileName = filepathOrBufferOptions.fileName; + } else if (isLegacySyntax) { + this.filePath = unstructuredOptions; + this.apiUrl = filepathOrBufferOptions; } else { - this.filePath = filePathOrLegacyApiUrl; - const options = optionsOrLegacyFilePath; + this.filePath = filepathOrBufferOptions; + } + + if (!isLegacySyntax) { + const options = unstructuredOptions; this.apiKey = options.apiKey ?? getEnvironmentVariable("UNSTRUCTURED_API_KEY"); this.apiUrl = @@ -205,14 +225,20 @@ export class UnstructuredLoader extends BaseDocumentLoader { } async _partition() { - const { readFile, basename } = await this.imports(); + let buffer = this.buffer; + let fileName = this.fileName; + + if (!buffer) { + const { readFile, basename } = await this.imports(); - const buffer = await readFile(this.filePath); - const fileName = basename(this.filePath); + buffer = await readFile(this.filePath); + fileName = basename(this.filePath); + + // I'm aware this reads the file into memory first, but we have lots of work + // to do on then consuming Documents in a streaming fashion anyway, so not + // worried about this for now. + } - // I'm aware this reads the file into memory first, but we have lots of work - // to do on then consuming Documents in a streaming fashion anyway, so not - // worried about this for now. const formData = new FormData(); formData.append("files", new Blob([buffer]), fileName); formData.append("strategy", this.strategy); diff --git a/libs/langchain-community/src/document_loaders/tests/unstructured.int.test.ts b/libs/langchain-community/src/document_loaders/tests/unstructured.int.test.ts index e30913e10a2d..b0b0712118a6 100644 --- a/libs/langchain-community/src/document_loaders/tests/unstructured.int.test.ts +++ b/libs/langchain-community/src/document_loaders/tests/unstructured.int.test.ts @@ -3,6 +3,7 @@ import * as url from "node:url"; import * as path from "node:path"; +import { readFile } from "node:fs/promises"; import { test, expect } from "@jest/globals"; import { UnstructuredDirectoryLoader, @@ -29,6 +30,34 @@ test.skip("Test Unstructured base loader", async () => { } }); +test.skip("Test Unstructured base loader with buffer", async () => { + const filePath = path.resolve( + path.dirname(url.fileURLToPath(import.meta.url)), + "./example_data/example.txt" + ); + + const options = { + apiKey: process.env.UNSTRUCTURED_API_KEY!, + }; + + const buffer = await readFile(filePath); + const fileName = "example.txt"; + + const loader = new UnstructuredLoader( + { + buffer, + fileName, + }, + options + ); + const docs = await loader.load(); + + expect(docs.length).toBe(3); + for (const doc of docs) { + expect(typeof doc.pageContent).toBe("string"); + } +}); + test.skip("Test Unstructured base loader with fast strategy", async () => { const filePath = path.resolve( path.dirname(url.fileURLToPath(import.meta.url)),