From 54decfe9b33469437bfd17822fcf77cea7e72c75 Mon Sep 17 00:00:00 2001 From: Phil Nash Date: Sun, 17 Nov 2024 12:13:23 +1100 Subject: [PATCH] feat(community): Adds an HTML loader for URLS (#7184) Co-authored-by: Jacob Lee --- .../mozilla_readability.ts | 6 +- libs/langchain-community/.gitignore | 4 + libs/langchain-community/langchain.config.js | 1 + libs/langchain-community/package.json | 13 +++ .../document_loaders/tests/html.int.test.ts | 24 ++++++ .../src/document_loaders/web/cheerio.ts | 39 ++++----- .../src/document_loaders/web/html.ts | 81 +++++++++++++++++++ .../src/document_loaders/web/sitemap.ts | 6 +- .../mozilla_readability.ts | 4 +- .../src/load/import_map.ts | 1 + 10 files changed, 146 insertions(+), 33 deletions(-) create mode 100644 libs/langchain-community/src/document_loaders/tests/html.int.test.ts create mode 100644 libs/langchain-community/src/document_loaders/web/html.ts diff --git a/examples/src/document_transformers/mozilla_readability.ts b/examples/src/document_transformers/mozilla_readability.ts index b3ac3c2b155a..22b19e463a92 100644 --- a/examples/src/document_transformers/mozilla_readability.ts +++ b/examples/src/document_transformers/mozilla_readability.ts @@ -1,8 +1,8 @@ -import { CheerioWebBaseLoader } from "@langchain/community/document_loaders/web/cheerio"; +import { HTMLWebBaseLoader } from "@langchain/community/document_loaders/web/html"; import { MozillaReadabilityTransformer } from "@langchain/community/document_transformers/mozilla_readability"; import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters"; -const loader = new CheerioWebBaseLoader( +const loader = new HTMLWebBaseLoader( "https://news.ycombinator.com/item?id=34817881" ); @@ -11,7 +11,7 @@ const docs = await loader.load(); const splitter = RecursiveCharacterTextSplitter.fromLanguage("html"); const transformer = new MozillaReadabilityTransformer(); -const sequence = splitter.pipe(transformer); +const sequence = transformer.pipe(splitter); const 
newDocuments = await sequence.invoke(docs); diff --git a/libs/langchain-community/.gitignore b/libs/langchain-community/.gitignore index 24abb6e79bc7..b554afb6f1ec 100644 --- a/libs/langchain-community/.gitignore +++ b/libs/langchain-community/.gitignore @@ -866,6 +866,10 @@ document_loaders/web/cheerio.cjs document_loaders/web/cheerio.js document_loaders/web/cheerio.d.ts document_loaders/web/cheerio.d.cts +document_loaders/web/html.cjs +document_loaders/web/html.js +document_loaders/web/html.d.ts +document_loaders/web/html.d.cts document_loaders/web/puppeteer.cjs document_loaders/web/puppeteer.js document_loaders/web/puppeteer.d.ts diff --git a/libs/langchain-community/langchain.config.js b/libs/langchain-community/langchain.config.js index e4d9102124de..631c2a12879e 100644 --- a/libs/langchain-community/langchain.config.js +++ b/libs/langchain-community/langchain.config.js @@ -270,6 +270,7 @@ export const config = { "document_loaders/web/azure_blob_storage_file", "document_loaders/web/browserbase": "document_loaders/web/browserbase", "document_loaders/web/cheerio": "document_loaders/web/cheerio", + "document_loaders/web/html": "document_loaders/web/html", "document_loaders/web/puppeteer": "document_loaders/web/puppeteer", "document_loaders/web/playwright": "document_loaders/web/playwright", "document_loaders/web/college_confidential": diff --git a/libs/langchain-community/package.json b/libs/langchain-community/package.json index 8ca1c034abd9..654a00fd51dc 100644 --- a/libs/langchain-community/package.json +++ b/libs/langchain-community/package.json @@ -2661,6 +2661,15 @@ "import": "./document_loaders/web/cheerio.js", "require": "./document_loaders/web/cheerio.cjs" }, + "./document_loaders/web/html": { + "types": { + "import": "./document_loaders/web/html.d.ts", + "require": "./document_loaders/web/html.d.cts", + "default": "./document_loaders/web/html.d.ts" + }, + "import": "./document_loaders/web/html.js", + "require": "./document_loaders/web/html.cjs" + }, 
"./document_loaders/web/puppeteer": { "types": { "import": "./document_loaders/web/puppeteer.d.ts", @@ -3938,6 +3947,10 @@ "document_loaders/web/cheerio.js", "document_loaders/web/cheerio.d.ts", "document_loaders/web/cheerio.d.cts", + "document_loaders/web/html.cjs", + "document_loaders/web/html.js", + "document_loaders/web/html.d.ts", + "document_loaders/web/html.d.cts", "document_loaders/web/puppeteer.cjs", "document_loaders/web/puppeteer.js", "document_loaders/web/puppeteer.d.ts", diff --git a/libs/langchain-community/src/document_loaders/tests/html.int.test.ts b/libs/langchain-community/src/document_loaders/tests/html.int.test.ts new file mode 100644 index 000000000000..afd308a19b4f --- /dev/null +++ b/libs/langchain-community/src/document_loaders/tests/html.int.test.ts @@ -0,0 +1,24 @@ +import { expect, test } from "@jest/globals"; +import { HTMLWebBaseLoader } from "../web/html.js"; + +test("Test HTML web scraper loader", async () => { + const loader = new HTMLWebBaseLoader( + "https://news.ycombinator.com/item?id=34817881" + ); + const docs = await loader.load(); + expect(docs[0].pageContent).toEqual( + expect.stringContaining("What Lights the Universe’s Standard Candles?") + ); +}); + +test("Test HTML web scraper loader with textDecoder", async () => { + const loader = new HTMLWebBaseLoader( + "https://corp.163.com/gb/about/management.html", + { + textDecoder: new TextDecoder("gbk"), + } + ); + + const docs = await loader.load(); + expect(docs[0].pageContent.trim()).toEqual(expect.stringContaining("网易")); +}); diff --git a/libs/langchain-community/src/document_loaders/web/cheerio.ts b/libs/langchain-community/src/document_loaders/web/cheerio.ts index abdd5a7e15b2..106b1ffe9d33 100644 --- a/libs/langchain-community/src/document_loaders/web/cheerio.ts +++ b/libs/langchain-community/src/document_loaders/web/cheerio.ts @@ -5,38 +5,27 @@ import type { SelectorType, } from "cheerio"; import { Document } from "@langchain/core/documents"; -import { - AsyncCaller, - 
AsyncCallerParams, -} from "@langchain/core/utils/async_caller"; +import { AsyncCaller } from "@langchain/core/utils/async_caller"; import { BaseDocumentLoader } from "@langchain/core/document_loaders/base"; -import type { DocumentLoader } from "@langchain/core/document_loaders/base"; +import type { WebBaseLoaderParams, WebBaseLoader } from "./html.js"; /** - * Represents the parameters for configuring the CheerioWebBaseLoader. It - * extends the AsyncCallerParams interface and adds additional parameters - * specific to web-based loaders. + * @deprecated Either import the CheerioWebBaseLoaderParams from @langchain/community/document_loaders/web/cheerio + * or use the WebBaseLoaderParams from @langchain/community/document_loaders/web/html. */ -export interface WebBaseLoaderParams extends AsyncCallerParams { - /** - * The timeout in milliseconds for the fetch request. Defaults to 10s. - */ - timeout?: number; +export { WebBaseLoaderParams }; +/** + * Represents the parameters for configuring the CheerioWebBaseLoader. It + * extends the WebBaseLoaderParams interface and adds additional parameters + * specific to loading with Cheerio. + */ +export interface CheerioWebBaseLoaderParams extends WebBaseLoaderParams { /** * The selector to use to extract the text from the document. Defaults to * "body". */ selector?: SelectorType; - - /** - * The text decoder to use to decode the response. Defaults to UTF-8. - */ - textDecoder?: TextDecoder; - /** - * The headers to use in the fetch request. - */ - headers?: HeadersInit; } /** @@ -45,14 +34,14 @@ export interface WebBaseLoaderParams extends AsyncCallerParams { * web-based documents using Cheerio. 
* @example * ```typescript - * const loader = new CheerioWebBaseLoader("https:exampleurl.com"); + * const loader = new CheerioWebBaseLoader("https://exampleurl.com"); * const docs = await loader.load(); * console.log({ docs }); * ``` */ export class CheerioWebBaseLoader extends BaseDocumentLoader - implements DocumentLoader + implements WebBaseLoader { timeout: number; @@ -64,7 +53,7 @@ export class CheerioWebBaseLoader headers?: HeadersInit; - constructor(public webPath: string, fields?: WebBaseLoaderParams) { + constructor(public webPath: string, fields?: CheerioWebBaseLoaderParams) { super(); const { timeout, selector, textDecoder, headers, ...rest } = fields ?? {}; this.timeout = timeout ?? 10000; diff --git a/libs/langchain-community/src/document_loaders/web/html.ts b/libs/langchain-community/src/document_loaders/web/html.ts new file mode 100644 index 000000000000..b07d61912c13 --- /dev/null +++ b/libs/langchain-community/src/document_loaders/web/html.ts @@ -0,0 +1,81 @@ +import { + AsyncCaller, + AsyncCallerParams, +} from "@langchain/core/utils/async_caller"; +import { BaseDocumentLoader } from "@langchain/core/document_loaders/base"; +import { Document } from "@langchain/core/documents"; +import type { DocumentLoader } from "@langchain/core/document_loaders/base"; + +/** + * Represents the parameters for configuring WebBaseLoaders. It extends the + * AsyncCallerParams interface and adds additional parameters specific to + * web-based loaders. + */ +export interface WebBaseLoaderParams extends AsyncCallerParams { + /** + * The timeout in milliseconds for the fetch request. Defaults to 10s. + */ + timeout?: number; + + /** + * The text decoder to use to decode the response. Defaults to UTF-8. + */ + textDecoder?: TextDecoder; + /** + * The headers to use in the fetch request. + */ + headers?: HeadersInit; + /** + * The selector to use to extract the text from the document. + * Defaults to "body". 
+ * @deprecated Use CheerioWebBaseLoaderParams from @langchain/community/document_loaders/web/cheerio + * instead. + */ + // eslint-disable-next-line @typescript-eslint/no-explicit-any + selector?: any; +} + +export interface WebBaseLoader extends DocumentLoader { + timeout: number; + + caller: AsyncCaller; + + textDecoder?: TextDecoder; + + headers?: HeadersInit; +} + +export class HTMLWebBaseLoader + extends BaseDocumentLoader + implements WebBaseLoader +{ + timeout: number; + + caller: AsyncCaller; + + textDecoder?: TextDecoder; + + headers?: HeadersInit; + + constructor(public webPath: string, fields?: WebBaseLoaderParams) { + super(); + const { timeout, textDecoder, headers, ...rest } = fields ?? {}; + this.timeout = timeout ?? 10000; + this.caller = new AsyncCaller(rest); + this.textDecoder = textDecoder; + this.headers = headers; + } + + async load(): Promise<Document[]> { + const response = await this.caller.call(fetch, this.webPath, { + signal: this.timeout ? AbortSignal.timeout(this.timeout) : undefined, + headers: this.headers, + }); + + const html = + this.textDecoder?.decode(await response.arrayBuffer()) ?? + (await response.text()); + + return [new Document({ pageContent: html })]; + } +} diff --git a/libs/langchain-community/src/document_loaders/web/sitemap.ts b/libs/langchain-community/src/document_loaders/web/sitemap.ts index aa6a6e41cb33..1cf5efcf75ba 100644 --- a/libs/langchain-community/src/document_loaders/web/sitemap.ts +++ b/libs/langchain-community/src/document_loaders/web/sitemap.ts @@ -1,13 +1,13 @@ import { Document, DocumentInterface } from "@langchain/core/documents"; import { chunkArray } from "@langchain/core/utils/chunk_array"; -import { CheerioWebBaseLoader, WebBaseLoaderParams } from "./cheerio.js"; +import { CheerioWebBaseLoader, CheerioWebBaseLoaderParams } from "./cheerio.js"; /** * Interface representing the parameters for initializing a SitemapLoader. 
* @interface SitemapLoaderParams - * @extends WebBaseLoaderParams + * @extends CheerioWebBaseLoaderParams */ -export interface SitemapLoaderParams extends WebBaseLoaderParams { +export interface SitemapLoaderParams extends CheerioWebBaseLoaderParams { /** * @property {(string | RegExp)[] | undefined} filterUrls - A list of regexes. Only URLs that match one of the filter URLs will be loaded. * WARNING: The filter URLs are interpreted as regular expressions. Escape special characters if needed. diff --git a/libs/langchain-community/src/document_transformers/mozilla_readability.ts b/libs/langchain-community/src/document_transformers/mozilla_readability.ts index a26b42a6d6c7..e8003c3d0514 100644 --- a/libs/langchain-community/src/document_transformers/mozilla_readability.ts +++ b/libs/langchain-community/src/document_transformers/mozilla_readability.ts @@ -11,7 +11,7 @@ import { * main content from a web page. * @example * ```typescript - * const loader = new CheerioWebBaseLoader("https://example.com/article"); + * const loader = new HTMLWebBaseLoader("https://example.com/article"); * const docs = await loader.load(); * * const splitter = new RecursiveCharacterTextSplitter({ @@ -20,7 +20,7 @@ import { * const transformer = new MozillaReadabilityTransformer(); * - * // The sequence processes the loaded documents through the splitter and then the transformer. - * const sequence = splitter.pipe(transformer); + * // The sequence processes the loaded documents through the transformer and then the splitter. + * const sequence = transformer.pipe(splitter); * * // Invoke the sequence to transform the documents into a more readable format. 
* const newDocuments = await sequence.invoke(docs); diff --git a/libs/langchain-community/src/load/import_map.ts b/libs/langchain-community/src/load/import_map.ts index 7425c4331ded..8b3b734a82c1 100644 --- a/libs/langchain-community/src/load/import_map.ts +++ b/libs/langchain-community/src/load/import_map.ts @@ -72,6 +72,7 @@ export * as memory__chat_memory from "../memory/chat_memory.js"; export * as indexes__base from "../indexes/base.js"; export * as indexes__memory from "../indexes/memory.js"; export * as document_loaders__web__airtable from "../document_loaders/web/airtable.js"; +export * as document_loaders__web__html from "../document_loaders/web/html.js"; export * as document_loaders__web__searchapi from "../document_loaders/web/searchapi.js"; export * as document_loaders__web__serpapi from "../document_loaders/web/serpapi.js"; export * as document_loaders__web__sort_xyz_blockchain from "../document_loaders/web/sort_xyz_blockchain.js";