-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(community): Adds an HTML loader for URLS (#7184)
Co-authored-by: Jacob Lee <[email protected]>
- Loading branch information
1 parent
c30ae29
commit 54decfe
Showing
10 changed files
with
146 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
24 changes: 24 additions & 0 deletions
24
libs/langchain-community/src/document_loaders/tests/html.int.test.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import { expect, test } from "@jest/globals"; | ||
import { HTMLWebBaseLoader } from "../web/html.js"; | ||
|
||
// Integration test: fetches a live page and checks that the raw HTML
// returned by the loader contains the expected headline.
test("Test HTML web scraper loader", async () => {
  const pageUrl = "https://news.ycombinator.com/item?id=34817881";
  const htmlLoader = new HTMLWebBaseLoader(pageUrl);

  const loadedDocs = await htmlLoader.load();
  const firstPageContent = loadedDocs[0].pageContent;

  expect(firstPageContent).toEqual(
    expect.stringContaining("What Lights the Universe’s Standard Candles?")
  );
});
|
||
// Integration test: fetches a live page and decodes the response bytes
// with a caller-supplied "gbk" TextDecoder, then checks that the
// expected Chinese text appears in the decoded content.
test("Test HTML web scraper loader with textDecoder", async () => {
  const pageUrl = "https://corp.163.com/gb/about/management.html";
  const gbkDecoder = new TextDecoder("gbk");
  const htmlLoader = new HTMLWebBaseLoader(pageUrl, {
    textDecoder: gbkDecoder,
  });

  const loadedDocs = await htmlLoader.load();
  const trimmedContent = loadedDocs[0].pageContent.trim();

  expect(trimmedContent).toEqual(expect.stringContaining("网易"));
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
import { | ||
AsyncCaller, | ||
AsyncCallerParams, | ||
} from "@langchain/core/utils/async_caller"; | ||
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base"; | ||
import { Document } from "@langchain/core/documents"; | ||
import type { DocumentLoader } from "@langchain/core/document_loaders/base"; | ||
|
||
/**
 * Configuration accepted by web-based loaders. Every option of
 * AsyncCallerParams is inherited; the fields declared here add the
 * settings that are specific to fetching a page over HTTP.
 */
export interface WebBaseLoaderParams extends AsyncCallerParams {
  /**
   * Timeout for the fetch request, in milliseconds. Defaults to 10s.
   */
  timeout?: number;

  /**
   * Decoder applied to the response body. Defaults to UTF-8.
   */
  textDecoder?: TextDecoder;

  /**
   * Headers sent with the fetch request.
   */
  headers?: HeadersInit;

  /**
   * Selector used to extract the text from the document.
   * Defaults to "body".
   * @deprecated Use CheerioWebBaseLoaderParams from @langchain/community/document_loaders/web/cheerio
   * instead.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  selector?: any;
}
|
||
/**
 * Shape shared by web-based document loaders: a DocumentLoader that
 * also exposes its request settings.
 */
export interface WebBaseLoader extends DocumentLoader {
  /** Timeout for the fetch request, in milliseconds. */
  timeout: number;

  /** AsyncCaller instance used to issue requests. */
  caller: AsyncCaller;

  /** Optional decoder for the response body. */
  textDecoder?: TextDecoder;

  /** Optional headers for the fetch request. */
  headers?: HeadersInit;
}
|
||
/**
 * Document loader that fetches a single URL and returns the response
 * body, unparsed, as the page content of one Document.
 */
export class HTMLWebBaseLoader
  extends BaseDocumentLoader
  implements WebBaseLoader
{
  /** Timeout for the fetch request, in milliseconds. */
  timeout: number;

  /** AsyncCaller through which the fetch is issued. */
  caller: AsyncCaller;

  /** Decoder for the response bytes; when unset the body is read as text. */
  textDecoder?: TextDecoder;

  /** Headers sent with the fetch request. */
  headers?: HeadersInit;

  /**
   * @param webPath URL of the page to fetch.
   * @param fields Optional settings. `timeout`, `textDecoder` and
   * `headers` are stored on the loader; every remaining field is handed
   * to the AsyncCaller constructor.
   */
  constructor(public webPath: string, fields?: WebBaseLoaderParams) {
    super();
    const options: WebBaseLoaderParams = fields ?? {};
    const { timeout, textDecoder, headers, ...callerParams } = options;
    this.headers = headers;
    this.textDecoder = textDecoder;
    this.caller = new AsyncCaller(callerParams);
    // Fall back to ten seconds when no timeout was supplied.
    this.timeout = timeout ?? 10000;
  }

  /**
   * Fetches `webPath` and wraps the response body in a single Document.
   * @returns An array holding exactly one Document whose `pageContent`
   * is the fetched body.
   */
  async load(): Promise<Document[]> {
    // A falsy timeout (e.g. 0) means the request gets no abort signal.
    const requestInit: RequestInit = {
      signal: this.timeout ? AbortSignal.timeout(this.timeout) : undefined,
      headers: this.headers,
    };
    const response = await this.caller.call(fetch, this.webPath, requestInit);

    let html: string;
    if (this.textDecoder) {
      // Decode the raw bytes with the caller-supplied decoder.
      html = this.textDecoder.decode(await response.arrayBuffer());
    } else {
      html = await response.text();
    }

    return [new Document({ pageContent: html })];
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters