From c3fb92fc7be9df3041f4d5d7c975cb97560d5db1 Mon Sep 17 00:00:00 2001
From: bracesproul
Date: Thu, 1 Aug 2024 13:34:38 -0700
Subject: [PATCH] delete old page

---
 .../web_loaders/recursive_url_loader.mdx      | 67 -------------------
 1 file changed, 67 deletions(-)
 delete mode 100644 docs/core_docs/docs/integrations/document_loaders/web_loaders/recursive_url_loader.mdx

diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/recursive_url_loader.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/recursive_url_loader.mdx
deleted file mode 100644
index ddcb358c3056..000000000000
--- a/docs/core_docs/docs/integrations/document_loaders/web_loaders/recursive_url_loader.mdx
+++ /dev/null
@@ -1,67 +0,0 @@
----
-sidebar_class_name: node-only
-hide_table_of_contents: true
----
-
-# Recursive URL Loader
-
-When loading content from a website, we may want to load all of the URLs linked from a page.
-
-For example, let's look at the [LangChain.js introduction](/docs/introduction) docs.
-
-This has many interesting child pages that we may want to load, split, and later retrieve in bulk.
-
-The challenge is traversing the tree of child pages and assembling a list!
-
-We do this using the `RecursiveUrlLoader`.
-
-This also gives us the flexibility to exclude some children, customize the extractor, and more.
-
-## Setup
-
-To get started, you'll need to install the [`jsdom`](https://www.npmjs.com/package/jsdom) package:
-
-```bash npm2yarn
-npm i jsdom
-```
-
-We also suggest adding a package like [`html-to-text`](https://www.npmjs.com/package/html-to-text) or
-[`@mozilla/readability`](https://www.npmjs.com/package/@mozilla/readability) for extracting the raw text from the page.
-
-```bash npm2yarn
-npm i html-to-text
-```
-
-## Usage
-
-```typescript
-import { compile } from "html-to-text";
-import { RecursiveUrlLoader } from "@langchain/community/document_loaders/web/recursive_url";
-
-const url = "/docs/introduction";
-
-const compiledConvert = compile({ wordwrap: 130 }); // returns (text: string) => string;
-
-const loader = new RecursiveUrlLoader(url, {
-  extractor: compiledConvert,
-  maxDepth: 1,
-  excludeDirs: ["/docs/api/"],
-});
-
-const docs = await loader.load();
-```
-
-## Options
-
-```typescript
-interface Options {
-  excludeDirs?: string[]; // webpage directories to exclude from the crawl.
-  extractor?: (text: string) => string; // a function to extract the text of the document from the fetched page. Defaults to returning the page as-is; using a tool like html-to-text to extract plain text is recommended.
-  maxDepth?: number; // the maximum depth to crawl. Defaults to 2. To crawl an entire website, set this to a sufficiently large number.
-  timeout?: number; // the timeout for each request, in milliseconds. Defaults to 10000 (10 seconds).
-  preventOutside?: boolean; // whether to prevent crawling outside the root URL. Defaults to true.
-  callerOptions?: AsyncCallerConstructorParams; // options passed to the underlying AsyncCaller, e.g. to set max concurrency (default is 64).
-}
-```
-
-However, since it's hard to perform a perfect filter, you may still see some irrelevant pages in the results. You can filter the returned documents yourself if needed. Most of the time, the returned results are good enough.
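The setup section of the removed page points to `@mozilla/readability` as an alternative to `html-to-text` for text extraction. Below is a minimal sketch of plugging it in as the `extractor`, assuming the extractor receives each fetched page's raw HTML as a string; the `htmlToArticleText` helper name and the crawl options are illustrative, not part of the original page.

```typescript
import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom";
import { RecursiveUrlLoader } from "@langchain/community/document_loaders/web/recursive_url";

// Illustrative helper: parse the raw HTML with jsdom, then let Readability
// pull out the main article text. Falls back to an empty string when
// Readability cannot find a readable article on the page.
const htmlToArticleText = (html: string): string => {
  const dom = new JSDOM(html);
  const article = new Readability(dom.window.document).parse();
  return article?.textContent ?? "";
};

const loader = new RecursiveUrlLoader("/docs/introduction", {
  extractor: htmlToArticleText,
  maxDepth: 1,
  excludeDirs: ["/docs/api/"],
});

const docs = await loader.load();
```

Readability tends to work well for article-style pages; for navigation-heavy documentation pages, `html-to-text` may retain more of the useful content.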
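The closing paragraph of the removed page notes that crawl filters are imperfect and that you can filter the returned documents yourself. Here is a small sketch of such a post-filter, building on the `docs` array from the snippet above and assuming each returned document records its URL in `metadata.source`; the prefix list and length threshold are illustrative.

```typescript
// Keep only documents whose source URL sits under the sections we care about,
// and drop pages whose extracted text is too short to be useful.
const allowedPrefixes = ["/docs/introduction", "/docs/tutorials"];

const filteredDocs = docs.filter((doc) => {
  const source: string = doc.metadata.source ?? "";
  const longEnough = doc.pageContent.trim().length > 100;
  return longEnough && allowedPrefixes.some((prefix) => source.startsWith(prefix));
});
```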