Skip to content

Commit

Permalink
feat(community): Adds an HTML loader for URLS (#7184)
Browse files Browse the repository at this point in the history
Co-authored-by: Jacob Lee <[email protected]>
  • Loading branch information
philnash and jacoblee93 authored Nov 17, 2024
1 parent c30ae29 commit 54decfe
Show file tree
Hide file tree
Showing 10 changed files with 146 additions and 33 deletions.
6 changes: 3 additions & 3 deletions examples/src/document_transformers/mozilla_readability.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import { CheerioWebBaseLoader } from "@langchain/community/document_loaders/web/cheerio";
import { HTMLWebBaseLoader } from "@langchain/community/document_loaders/web/html";
import { MozillaReadabilityTransformer } from "@langchain/community/document_transformers/mozilla_readability";
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";

const loader = new CheerioWebBaseLoader(
const loader = new HTMLWebBaseLoader(
"https://news.ycombinator.com/item?id=34817881"
);

Expand All @@ -11,7 +11,7 @@ const docs = await loader.load();
const splitter = RecursiveCharacterTextSplitter.fromLanguage("html");
const transformer = new MozillaReadabilityTransformer();

const sequence = splitter.pipe(transformer);
const sequence = transformer.pipe(splitter);

const newDocuments = await sequence.invoke(docs);

Expand Down
4 changes: 4 additions & 0 deletions libs/langchain-community/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -866,6 +866,10 @@ document_loaders/web/cheerio.cjs
document_loaders/web/cheerio.js
document_loaders/web/cheerio.d.ts
document_loaders/web/cheerio.d.cts
document_loaders/web/html.cjs
document_loaders/web/html.js
document_loaders/web/html.d.ts
document_loaders/web/html.d.cts
document_loaders/web/puppeteer.cjs
document_loaders/web/puppeteer.js
document_loaders/web/puppeteer.d.ts
Expand Down
1 change: 1 addition & 0 deletions libs/langchain-community/langchain.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ export const config = {
"document_loaders/web/azure_blob_storage_file",
"document_loaders/web/browserbase": "document_loaders/web/browserbase",
"document_loaders/web/cheerio": "document_loaders/web/cheerio",
"document_loaders/web/html": "document_loaders/web/html",
"document_loaders/web/puppeteer": "document_loaders/web/puppeteer",
"document_loaders/web/playwright": "document_loaders/web/playwright",
"document_loaders/web/college_confidential":
Expand Down
13 changes: 13 additions & 0 deletions libs/langchain-community/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -2661,6 +2661,15 @@
"import": "./document_loaders/web/cheerio.js",
"require": "./document_loaders/web/cheerio.cjs"
},
"./document_loaders/web/html": {
"types": {
"import": "./document_loaders/web/html.d.ts",
"require": "./document_loaders/web/html.d.cts",
"default": "./document_loaders/web/html.d.ts"
},
"import": "./document_loaders/web/html.js",
"require": "./document_loaders/web/html.cjs"
},
"./document_loaders/web/puppeteer": {
"types": {
"import": "./document_loaders/web/puppeteer.d.ts",
Expand Down Expand Up @@ -3938,6 +3947,10 @@
"document_loaders/web/cheerio.js",
"document_loaders/web/cheerio.d.ts",
"document_loaders/web/cheerio.d.cts",
"document_loaders/web/html.cjs",
"document_loaders/web/html.js",
"document_loaders/web/html.d.ts",
"document_loaders/web/html.d.cts",
"document_loaders/web/puppeteer.cjs",
"document_loaders/web/puppeteer.js",
"document_loaders/web/puppeteer.d.ts",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import { expect, test } from "@jest/globals";
import { HTMLWebBaseLoader } from "../web/html.js";

// Integration test: performs a live HTTP request and checks that the returned
// page body contains a known headline.
test("Test HTML web scraper loader", async () => {
  const targetUrl = "https://news.ycombinator.com/item?id=34817881";
  const [firstDoc] = await new HTMLWebBaseLoader(targetUrl).load();

  expect(firstDoc.pageContent).toContain(
    "What Lights the Universe’s Standard Candles?"
  );
});

// Integration test: performs a live HTTP request and decodes the response
// with a caller-supplied GBK TextDecoder instead of the default decoding.
// NOTE(review): presumably the target page is served GBK-encoded — confirm.
test("Test HTML web scraper loader with textDecoder", async () => {
  const gbkDecoder = new TextDecoder("gbk");
  const loader = new HTMLWebBaseLoader(
    "https://corp.163.com/gb/about/management.html",
    { textDecoder: gbkDecoder }
  );

  const [firstDoc] = await loader.load();
  expect(firstDoc.pageContent.trim()).toContain("网易");
});
39 changes: 14 additions & 25 deletions libs/langchain-community/src/document_loaders/web/cheerio.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,38 +5,27 @@ import type {
SelectorType,
} from "cheerio";
import { Document } from "@langchain/core/documents";
import {
AsyncCaller,
AsyncCallerParams,
} from "@langchain/core/utils/async_caller";
import { AsyncCaller } from "@langchain/core/utils/async_caller";
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
import type { DocumentLoader } from "@langchain/core/document_loaders/base";
import type { WebBaseLoaderParams, WebBaseLoader } from "./html.js";

/**
* Represents the parameters for configuring the CheerioWebBaseLoader. It
* extends the AsyncCallerParams interface and adds additional parameters
* specific to web-based loaders.
* @deprecated Either import the CheerioWebBaseLoaderParams from @langchain/community/document_loaders/web/cheerio
* or use the WebBaseLoaderParams from @langchain/community/document_loaders/web/html.
*/
export interface WebBaseLoaderParams extends AsyncCallerParams {
/**
* The timeout in milliseconds for the fetch request. Defaults to 10s.
*/
timeout?: number;
export { WebBaseLoaderParams };

/**
* Represents the parameters for configuring the CheerioWebBaseLoader. It
* extends the WebBaseLoaderParams interface and adds additional parameters
* specific to loading with Cheerio.
*/
export interface CheerioWebBaseLoaderParams extends WebBaseLoaderParams {
/**
* The selector to use to extract the text from the document. Defaults to
* "body".
*/
selector?: SelectorType;

/**
* The text decoder to use to decode the response. Defaults to UTF-8.
*/
textDecoder?: TextDecoder;
/**
* The headers to use in the fetch request.
*/
headers?: HeadersInit;
}

/**
Expand All @@ -45,14 +34,14 @@ export interface WebBaseLoaderParams extends AsyncCallerParams {
* web-based documents using Cheerio.
* @example
* ```typescript
* const loader = new CheerioWebBaseLoader("https:exampleurl.com");
* const loader = new CheerioWebBaseLoader("https://exampleurl.com");
* const docs = await loader.load();
* console.log({ docs });
* ```
*/
export class CheerioWebBaseLoader
extends BaseDocumentLoader
implements DocumentLoader
implements WebBaseLoader
{
timeout: number;

Expand All @@ -64,7 +53,7 @@ export class CheerioWebBaseLoader

headers?: HeadersInit;

constructor(public webPath: string, fields?: WebBaseLoaderParams) {
constructor(public webPath: string, fields?: CheerioWebBaseLoaderParams) {
super();
const { timeout, selector, textDecoder, headers, ...rest } = fields ?? {};
this.timeout = timeout ?? 10000;
Expand Down
81 changes: 81 additions & 0 deletions libs/langchain-community/src/document_loaders/web/html.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import {
AsyncCaller,
AsyncCallerParams,
} from "@langchain/core/utils/async_caller";
import { BaseDocumentLoader } from "@langchain/core/document_loaders/base";
import { Document } from "@langchain/core/documents";
import type { DocumentLoader } from "@langchain/core/document_loaders/base";

/**
* Represents the parameters for configuring WebBaseLoaders. It extends the
* AsyncCallerParams interface and adds additional parameters specific to
* web-based loaders.
*
* HTMLWebBaseLoader reads `timeout`, `textDecoder` and `headers` itself and
* passes every remaining field to the AsyncCaller constructor.
*/
export interface WebBaseLoaderParams extends AsyncCallerParams {
/**
* The timeout in milliseconds for the fetch request. Defaults to 10s.
* In HTMLWebBaseLoader a value of 0 means no abort signal is attached to
* the request.
*/
timeout?: number;

/**
* The text decoder to use to decode the response. Defaults to UTF-8.
* When provided, HTMLWebBaseLoader decodes the response's array buffer
* with it; otherwise it falls back to `response.text()`.
*/
textDecoder?: TextDecoder;
/**
* The headers to use in the fetch request.
*/
headers?: HeadersInit;
/**
* The selector to use to extract the text from the document.
* Defaults to "body".
* HTMLWebBaseLoader does not read this field.
* @deprecated Use CheerioWebBaseLoaderParams from @langchain/community/document_loaders/web/cheerio
* instead.
*/
// eslint-disable-next-line @typescript-eslint/no-explicit-any
selector?: any;
}

/**
* Common shape of web-based document loaders: a DocumentLoader that also
* exposes the settings used for its fetch request.
*/
export interface WebBaseLoader extends DocumentLoader {
/** The timeout in milliseconds for the fetch request. */
timeout: number;

/** The AsyncCaller through which the fetch request is issued. */
caller: AsyncCaller;

/** Optional text decoder used to decode the response body. */
textDecoder?: TextDecoder;

/** Optional headers sent with the fetch request. */
headers?: HeadersInit;
}

/**
 * Document loader that fetches a single URL and returns the response body,
 * unparsed, as the `pageContent` of one Document.
 * @example
 * ```typescript
 * const loader = new HTMLWebBaseLoader("https://example.com");
 * const docs = await loader.load();
 * ```
 */
export class HTMLWebBaseLoader
  extends BaseDocumentLoader
  implements WebBaseLoader
{
  timeout: number;

  caller: AsyncCaller;

  textDecoder?: TextDecoder;

  headers?: HeadersInit;

  /**
   * @param webPath The URL to fetch.
   * @param fields Optional loader settings. `timeout`, `textDecoder` and
   * `headers` are kept on the instance; all other fields are handed to the
   * AsyncCaller constructor.
   */
  constructor(public webPath: string, fields?: WebBaseLoaderParams) {
    super();
    const options = fields ?? {};
    const { timeout, textDecoder, headers, ...callerParams } = options;
    this.headers = headers;
    this.textDecoder = textDecoder;
    this.caller = new AsyncCaller(callerParams);
    // Fall back to 10 seconds only when no timeout was given; an explicit 0 is kept.
    this.timeout = timeout ?? 10000;
  }

  /**
   * Fetches `webPath` through the AsyncCaller and wraps the body in a Document.
   * @returns An array containing exactly one Document whose `pageContent` is
   * the response body.
   */
  async load(): Promise<Document[]> {
    // A falsy timeout (0) means the request is sent without an abort signal.
    const abortSignal = this.timeout
      ? AbortSignal.timeout(this.timeout)
      : undefined;
    const requestInit: RequestInit = {
      signal: abortSignal,
      headers: this.headers,
    };
    const response = await this.caller.call(fetch, this.webPath, requestInit);

    // Use the custom decoder on the raw bytes when one is configured;
    // otherwise let the response decode itself as text.
    const decoder = this.textDecoder;
    const pageContent =
      decoder?.decode(await response.arrayBuffer()) ??
      (await response.text());

    return [new Document({ pageContent })];
  }
}
6 changes: 3 additions & 3 deletions libs/langchain-community/src/document_loaders/web/sitemap.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import { Document, DocumentInterface } from "@langchain/core/documents";
import { chunkArray } from "@langchain/core/utils/chunk_array";
import { CheerioWebBaseLoader, WebBaseLoaderParams } from "./cheerio.js";
import { CheerioWebBaseLoader, CheerioWebBaseLoaderParams } from "./cheerio.js";

/**
* Interface representing the parameters for initializing a SitemapLoader.
* @interface SitemapLoaderParams
* @extends WebBaseLoaderParams
* @extends CheerioWebBaseLoaderParams
*/
export interface SitemapLoaderParams extends WebBaseLoaderParams {
export interface SitemapLoaderParams extends CheerioWebBaseLoaderParams {
/**
* @property {(string | RegExp)[] | undefined} filterUrls - A list of regexes. Only URLs that match one of the filter URLs will be loaded.
* WARNING: The filter URLs are interpreted as regular expressions. Escape special characters if needed.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import {
* main content from a web page.
* @example
* ```typescript
* const loader = new CheerioWebBaseLoader("https://example.com/article");
* const loader = new HTMLWebBaseLoader("https://example.com/article");
* const docs = await loader.load();
*
* const splitter = new RecursiveCharacterTextSplitter({
Expand All @@ -20,7 +20,7 @@ import {
* const transformer = new MozillaReadabilityTransformer();
*
* // The sequence processes the loaded documents through the transformer and then the splitter.
* const sequence = splitter.pipe(transformer);
* const sequence = transformer.pipe(splitter);
*
* // Invoke the sequence to transform the documents into a more readable format.
* const newDocuments = await sequence.invoke(docs);
Expand Down
1 change: 1 addition & 0 deletions libs/langchain-community/src/load/import_map.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ export * as memory__chat_memory from "../memory/chat_memory.js";
export * as indexes__base from "../indexes/base.js";
export * as indexes__memory from "../indexes/memory.js";
export * as document_loaders__web__airtable from "../document_loaders/web/airtable.js";
export * as document_loaders__web__html from "../document_loaders/web/html.js";
export * as document_loaders__web__searchapi from "../document_loaders/web/searchapi.js";
export * as document_loaders__web__serpapi from "../document_loaders/web/serpapi.js";
export * as document_loaders__web__sort_xyz_blockchain from "../document_loaders/web/sort_xyz_blockchain.js";
Expand Down

0 comments on commit 54decfe

Please sign in to comment.