Skip to content

Commit

Permalink
Unstructured in memory loader (#5726)
Browse files Browse the repository at this point in the history
* feat: add in-memory option

* feat: add in memory community

* fix: object destructure fix

* Fix lint

* Rename args and add to docstring

* Export types

---------

Co-authored-by: andrewdoro <[email protected]>
  • Loading branch information
jacoblee93 and andrewdoro authored Jun 11, 2024
1 parent a550a7f commit 030607b
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 27 deletions.
46 changes: 36 additions & 10 deletions langchain/src/document_loaders/fs/unstructured.ts
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
unknown?: UnknownHandling;
};

type UnstructuredMemoryLoaderOptions = {
buffer: Buffer;
fileName: string;
};

/**
* @deprecated - Import from "@langchain/community/document_loaders/fs/unstructured" instead. This entrypoint will be removed in 0.3.0.
*
Expand All @@ -139,6 +144,10 @@ type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
export class UnstructuredLoader extends BaseDocumentLoader {
public filePath: string;

private buffer?: Buffer;

private fileName?: string;

private apiUrl = "https://api.unstructured.io/general/v0/general";

private apiKey?: string;
Expand Down Expand Up @@ -175,19 +184,30 @@ export class UnstructuredLoader extends BaseDocumentLoader {
private maxCharacters?: number;

constructor(
filePathOrLegacyApiUrl: string,
filePathOrLegacyApiUrlOrMemoryBuffer:
| string
| UnstructuredMemoryLoaderOptions,
optionsOrLegacyFilePath: UnstructuredLoaderOptions | string = {}
) {
super();

// Temporary shim to avoid breaking existing users
// Remove when API keys are enforced by Unstructured and existing code will break anyway
const isLegacySyntax = typeof optionsOrLegacyFilePath === "string";
if (isLegacySyntax) {
const isMemorySyntax =
typeof filePathOrLegacyApiUrlOrMemoryBuffer === "object";

if (isMemorySyntax) {
this.buffer = filePathOrLegacyApiUrlOrMemoryBuffer.buffer;
this.fileName = filePathOrLegacyApiUrlOrMemoryBuffer.fileName;
} else if (isLegacySyntax) {
this.filePath = optionsOrLegacyFilePath;
this.apiUrl = filePathOrLegacyApiUrl;
this.apiUrl = filePathOrLegacyApiUrlOrMemoryBuffer;
} else {
this.filePath = filePathOrLegacyApiUrl;
this.filePath = filePathOrLegacyApiUrlOrMemoryBuffer;
}

if (!isLegacySyntax) {
const options = optionsOrLegacyFilePath;
this.apiKey = options.apiKey;
this.apiUrl = options.apiUrl ?? this.apiUrl;
Expand All @@ -209,14 +229,20 @@ export class UnstructuredLoader extends BaseDocumentLoader {
}

async _partition() {
const { readFile, basename } = await this.imports();
let { buffer } = this;
let { fileName } = this;

if (!buffer) {
const { readFile, basename } = await this.imports();

const buffer = await readFile(this.filePath);
const fileName = basename(this.filePath);
buffer = await readFile(this.filePath);
fileName = basename(this.filePath);

// I'm aware this reads the file into memory first, but we have lots of work
// to do on then consuming Documents in a streaming fashion anyway, so not
// worried about this for now.
}

// I'm aware this reads the file into memory first, but we have lots of work
// to do on then consuming Documents in a streaming fashion anyway, so not
// worried about this for now.
const formData = new FormData();
formData.append("files", new Blob([buffer]), fileName);
formData.append("strategy", this.strategy);
Expand Down
29 changes: 29 additions & 0 deletions langchain/src/document_loaders/tests/unstructured.int.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import * as url from "node:url";
import * as path from "node:path";
import { readFile } from "node:fs/promises";
import { test, expect } from "@jest/globals";
import {
UnstructuredDirectoryLoader,
Expand All @@ -29,6 +30,34 @@ test.skip("Test Unstructured base loader", async () => {
}
});

test.skip("Test Unstructured base loader with buffer", async () => {
const filePath = path.resolve(
path.dirname(url.fileURLToPath(import.meta.url)),
"./example_data/example.txt"
);

const options = {
apiKey: process.env.UNSTRUCTURED_API_KEY!,
};

const buffer = await readFile(filePath);
const fileName = "example.txt";

const loader = new UnstructuredLoader(
{
buffer,
fileName,
},
options
);
const docs = await loader.load();

expect(docs.length).toBe(3);
for (const doc of docs) {
expect(typeof doc.pageContent).toBe("string");
}
});

test.skip("Test Unstructured base loader with fast strategy", async () => {
const filePath = path.resolve(
path.dirname(url.fileURLToPath(import.meta.url)),
Expand Down
1 change: 1 addition & 0 deletions libs/langchain-community/.eslintrc.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ module.exports = {
"prefer-rest-params": 0,
"new-cap": ["error", { properties: false, capIsNew: false }],
"arrow-body-style": 0,
"prefer-destructuring": 0
},
overrides: [
{
Expand Down
60 changes: 43 additions & 17 deletions libs/langchain-community/src/document_loaders/fs/unstructured.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import {
} from "langchain/document_loaders/fs/directory";
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";

const UNSTRUCTURED_API_FILETYPES = [
export const UNSTRUCTURED_API_FILETYPES = [
".txt",
".text",
".pdf",
Expand Down Expand Up @@ -94,7 +94,7 @@ export type SkipInferTableTypes =
/**
* Set the chunking_strategy to chunk text into larger or smaller elements. Defaults to None with optional arg of by_title
*/
type ChunkingStrategy = "None" | "by_title";
export type ChunkingStrategy = "None" | "by_title";

export type UnstructuredLoaderOptions = {
apiKey?: string;
Expand All @@ -115,22 +115,34 @@ export type UnstructuredLoaderOptions = {
maxCharacters?: number;
};

type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
export type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
recursive?: boolean;
unknown?: UnknownHandling;
};

export type UnstructuredMemoryLoaderOptions = {
buffer: Buffer;
fileName: string;
};

/**
* A document loader that uses the Unstructured API to load unstructured
* documents. It supports both the new syntax with options object and the
* legacy syntax for backward compatibility. The load() method sends a
* partitioning request to the Unstructured API and retrieves the
* partitioned elements. It creates a Document instance for each element
* and returns an array of Document instances.
*
* It accepts either a filepath or an object containing a buffer and a filename
* as input.
*/
export class UnstructuredLoader extends BaseDocumentLoader {
public filePath: string;

private buffer?: Buffer;

private fileName?: string;

private apiUrl = "https://api.unstructured.io/general/v0/general";

private apiKey?: string;
Expand Down Expand Up @@ -167,20 +179,28 @@ export class UnstructuredLoader extends BaseDocumentLoader {
private maxCharacters?: number;

constructor(
filePathOrLegacyApiUrl: string,
optionsOrLegacyFilePath: UnstructuredLoaderOptions | string = {}
filepathOrBufferOptions: string | UnstructuredMemoryLoaderOptions,
unstructuredOptions: UnstructuredLoaderOptions | string = {}
) {
super();

// Temporary shim to avoid breaking existing users
// Remove when API keys are enforced by Unstructured and existing code will break anyway
const isLegacySyntax = typeof optionsOrLegacyFilePath === "string";
if (isLegacySyntax) {
this.filePath = optionsOrLegacyFilePath;
this.apiUrl = filePathOrLegacyApiUrl;
const isLegacySyntax = typeof unstructuredOptions === "string";
const isMemorySyntax = typeof filepathOrBufferOptions === "object";

if (isMemorySyntax) {
this.buffer = filepathOrBufferOptions.buffer;
this.fileName = filepathOrBufferOptions.fileName;
} else if (isLegacySyntax) {
this.filePath = unstructuredOptions;
this.apiUrl = filepathOrBufferOptions;
} else {
this.filePath = filePathOrLegacyApiUrl;
const options = optionsOrLegacyFilePath;
this.filePath = filepathOrBufferOptions;
}

if (!isLegacySyntax) {
const options = unstructuredOptions;
this.apiKey =
options.apiKey ?? getEnvironmentVariable("UNSTRUCTURED_API_KEY");
this.apiUrl =
Expand All @@ -205,14 +225,20 @@ export class UnstructuredLoader extends BaseDocumentLoader {
}

async _partition() {
const { readFile, basename } = await this.imports();
let buffer = this.buffer;
let fileName = this.fileName;

if (!buffer) {
const { readFile, basename } = await this.imports();

const buffer = await readFile(this.filePath);
const fileName = basename(this.filePath);
buffer = await readFile(this.filePath);
fileName = basename(this.filePath);

// I'm aware this reads the file into memory first, but we have lots of work
// to do on then consuming Documents in a streaming fashion anyway, so not
// worried about this for now.
}

// I'm aware this reads the file into memory first, but we have lots of work
// to do on then consuming Documents in a streaming fashion anyway, so not
// worried about this for now.
const formData = new FormData();
formData.append("files", new Blob([buffer]), fileName);
formData.append("strategy", this.strategy);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import * as url from "node:url";
import * as path from "node:path";
import { readFile } from "node:fs/promises";
import { test, expect } from "@jest/globals";
import {
UnstructuredDirectoryLoader,
Expand All @@ -29,6 +30,34 @@ test.skip("Test Unstructured base loader", async () => {
}
});

test.skip("Test Unstructured base loader with buffer", async () => {
const filePath = path.resolve(
path.dirname(url.fileURLToPath(import.meta.url)),
"./example_data/example.txt"
);

const options = {
apiKey: process.env.UNSTRUCTURED_API_KEY!,
};

const buffer = await readFile(filePath);
const fileName = "example.txt";

const loader = new UnstructuredLoader(
{
buffer,
fileName,
},
options
);
const docs = await loader.load();

expect(docs.length).toBe(3);
for (const doc of docs) {
expect(typeof doc.pageContent).toBe("string");
}
});

test.skip("Test Unstructured base loader with fast strategy", async () => {
const filePath = path.resolve(
path.dirname(url.fileURLToPath(import.meta.url)),
Expand Down

0 comments on commit 030607b

Please sign in to comment.