-
Notifications
You must be signed in to change notification settings - Fork 2.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Unstructured in memory loader #5726
Changes from all commits
cd53eb2
9f479d7
dd2d707
e204c3e
55e51e0
5ba8819
e2699c6
52807fa
4a3bdc7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,7 +10,7 @@ import { | |
} from "langchain/document_loaders/fs/directory"; | ||
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base"; | ||
|
||
const UNSTRUCTURED_API_FILETYPES = [ | ||
export const UNSTRUCTURED_API_FILETYPES = [ | ||
".txt", | ||
".text", | ||
".pdf", | ||
|
@@ -94,7 +94,7 @@ export type SkipInferTableTypes = | |
/** | ||
* Set the chunking_strategy to chunk text into larger or smaller elements. Defaults to None with optional arg of by_title | ||
*/ | ||
type ChunkingStrategy = "None" | "by_title"; | ||
export type ChunkingStrategy = "None" | "by_title"; | ||
|
||
export type UnstructuredLoaderOptions = { | ||
apiKey?: string; | ||
|
@@ -115,22 +115,34 @@ export type UnstructuredLoaderOptions = { | |
maxCharacters?: number; | ||
}; | ||
|
||
type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & { | ||
export type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & { | ||
recursive?: boolean; | ||
unknown?: UnknownHandling; | ||
}; | ||
|
||
export type UnstructuredMemoryLoaderOptions = { | ||
buffer: Buffer; | ||
fileName: string; | ||
}; | ||
|
||
/** | ||
* A document loader that uses the Unstructured API to load unstructured | ||
* documents. It supports both the new syntax with options object and the | ||
* legacy syntax for backward compatibility. The load() method sends a | ||
* partitioning request to the Unstructured API and retrieves the | ||
* partitioned elements. It creates a Document instance for each element | ||
* and returns an array of Document instances. | ||
* | ||
* It accepts either a filepath or an object containing a buffer and a filename | ||
* as input. | ||
*/ | ||
export class UnstructuredLoader extends BaseDocumentLoader { | ||
public filePath: string; | ||
|
||
private buffer?: Buffer; | ||
|
||
private fileName?: string; | ||
|
||
private apiUrl = "https://api.unstructured.io/general/v0/general"; | ||
|
||
private apiKey?: string; | ||
|
@@ -167,20 +179,28 @@ export class UnstructuredLoader extends BaseDocumentLoader { | |
private maxCharacters?: number; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hey there! I've reviewed the code and noticed that the recent change in the |
||
|
||
constructor( | ||
filePathOrLegacyApiUrl: string, | ||
optionsOrLegacyFilePath: UnstructuredLoaderOptions | string = {} | ||
filepathOrBufferOptions: string | UnstructuredMemoryLoaderOptions, | ||
unstructuredOptions: UnstructuredLoaderOptions | string = {} | ||
) { | ||
super(); | ||
|
||
// Temporary shim to avoid breaking existing users | ||
// Remove when API keys are enforced by Unstructured and existing code will break anyway | ||
const isLegacySyntax = typeof optionsOrLegacyFilePath === "string"; | ||
if (isLegacySyntax) { | ||
this.filePath = optionsOrLegacyFilePath; | ||
this.apiUrl = filePathOrLegacyApiUrl; | ||
const isLegacySyntax = typeof unstructuredOptions === "string"; | ||
const isMemorySyntax = typeof filepathOrBufferOptions === "object"; | ||
|
||
if (isMemorySyntax) { | ||
this.buffer = filepathOrBufferOptions.buffer; | ||
this.fileName = filepathOrBufferOptions.fileName; | ||
} else if (isLegacySyntax) { | ||
this.filePath = unstructuredOptions; | ||
this.apiUrl = filepathOrBufferOptions; | ||
} else { | ||
this.filePath = filePathOrLegacyApiUrl; | ||
const options = optionsOrLegacyFilePath; | ||
this.filePath = filepathOrBufferOptions; | ||
} | ||
|
||
if (!isLegacySyntax) { | ||
const options = unstructuredOptions; | ||
this.apiKey = | ||
options.apiKey ?? getEnvironmentVariable("UNSTRUCTURED_API_KEY"); | ||
this.apiUrl = | ||
|
@@ -205,14 +225,20 @@ export class UnstructuredLoader extends BaseDocumentLoader { | |
} | ||
|
||
async _partition() { | ||
const { readFile, basename } = await this.imports(); | ||
let buffer = this.buffer; | ||
let fileName = this.fileName; | ||
|
||
if (!buffer) { | ||
const { readFile, basename } = await this.imports(); | ||
|
||
const buffer = await readFile(this.filePath); | ||
const fileName = basename(this.filePath); | ||
buffer = await readFile(this.filePath); | ||
fileName = basename(this.filePath); | ||
|
||
// I'm aware this reads the file into memory first, but we have lots of work | ||
// to do on then consuming Documents in a streaming fashion anyway, so not | ||
// worried about this for now. | ||
} | ||
|
||
// I'm aware this reads the file into memory first, but we have lots of work | ||
// to do on then consuming Documents in a streaming fashion anyway, so not | ||
// worried about this for now. | ||
const formData = new FormData(); | ||
formData.append("files", new Blob([buffer]), fileName); | ||
formData.append("strategy", this.strategy); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
|
||
import * as url from "node:url"; | ||
import * as path from "node:path"; | ||
import { readFile } from "node:fs/promises"; | ||
import { test, expect } from "@jest/globals"; | ||
import { | ||
UnstructuredDirectoryLoader, | ||
|
@@ -29,6 +30,34 @@ test.skip("Test Unstructured base loader", async () => { | |
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hey team, I've flagged this PR for your review as it adds code that explicitly accesses an environment variable via |
||
}); | ||
|
||
test.skip("Test Unstructured base loader with buffer", async () => { | ||
const filePath = path.resolve( | ||
path.dirname(url.fileURLToPath(import.meta.url)), | ||
"./example_data/example.txt" | ||
); | ||
|
||
const options = { | ||
apiKey: process.env.UNSTRUCTURED_API_KEY!, | ||
}; | ||
|
||
const buffer = await readFile(filePath); | ||
const fileName = "example.txt"; | ||
|
||
const loader = new UnstructuredLoader( | ||
{ | ||
buffer, | ||
fileName, | ||
}, | ||
options | ||
); | ||
const docs = await loader.load(); | ||
|
||
expect(docs.length).toBe(3); | ||
for (const doc of docs) { | ||
expect(typeof doc.pageContent).toBe("string"); | ||
} | ||
}); | ||
|
||
test.skip("Test Unstructured base loader with fast strategy", async () => { | ||
const filePath = path.resolve( | ||
path.dirname(url.fileURLToPath(import.meta.url)), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hey team, just wanted to flag this PR for review as it includes changes that access an environment variable via
process.env
to obtain the API key for the options object. Please take a look and ensure everything is handled securely. Thanks!