diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/firecrawl.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/firecrawl.mdx new file mode 100644 index 000000000000..685e78726c71 --- /dev/null +++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/firecrawl.mdx @@ -0,0 +1,38 @@ +--- +hide_table_of_contents: true +--- + +# Firecrawl + +This guide shows how to use [Firecrawl](https://firecrawl.dev) with LangChain to load web data into an LLM-ready format. + +## Overview + +[Firecrawl](https://firecrawl.dev) crawls and converts any website into LLM-ready data. It crawls all accessible subpages and gives you clean markdown and metadata for each. No sitemap required. + +Firecrawl handles complex tasks such as reverse proxies, caching, rate limits, and content blocked by JavaScript. Built by the [mendable.ai](https://mendable.ai) team. + +This guide shows how to scrape and crawl entire websites and load them using the `FireCrawlLoader` in LangChain. + +## Setup + +Sign up and get your free [Firecrawl API key](https://firecrawl.dev) to start. Firecrawl offers 100 free credits to get you started, and it's [open-source](https://github.com/mendableai/firecrawl) in case you want to self-host. + +## Usage + +Here's an example of how to use the `FireCrawlLoader` to load web pages: + +Firecrawl offers 2 modes: `scrape` and `crawl`. In `scrape` mode, Firecrawl will only scrape the page you provide. In `crawl` mode, Firecrawl will crawl the entire website. + +import CodeBlock from "@theme/CodeBlock"; +import Example from "@examples/document_loaders/firecrawl.ts"; + +```bash npm2yarn +npm install @mendable/firecrawl-js +``` + +<CodeBlock language="typescript">{Example}</CodeBlock> + +### Additional Parameters + +For `params` you can pass any of the params according to the [Firecrawl documentation](https://docs.firecrawl.dev). 
diff --git a/examples/src/document_loaders/firecrawl.ts b/examples/src/document_loaders/firecrawl.ts new file mode 100644 index 000000000000..d8524e1b4b6c --- /dev/null +++ b/examples/src/document_loaders/firecrawl.ts @@ -0,0 +1,13 @@ +import { FireCrawlLoader } from "langchain/document_loaders/web/firecrawl"; + +const loader = new FireCrawlLoader({ + url: "https://firecrawl.dev", // The URL to scrape, or the starting URL when mode is "crawl" + apiKey: process.env.FIRECRAWL_API_KEY, // Optional, defaults to `FIRECRAWL_API_KEY` in your env. + mode: "scrape", // The mode to run the crawler in. Can be "scrape" for single urls or "crawl" for all accessible subpages. Defaults to "crawl" when omitted. + params: { + // Optional parameters, passed through to the Firecrawl API call. + // For API documentation, visit https://docs.firecrawl.dev + }, +}); + +const docs = await loader.load(); diff --git a/langchain/.gitignore b/langchain/.gitignore index ec6142a2e774..aa56ad5db796 100644 --- a/langchain/.gitignore +++ b/langchain/.gitignore @@ -566,6 +566,10 @@ document_loaders/web/figma.cjs document_loaders/web/figma.js document_loaders/web/figma.d.ts document_loaders/web/figma.d.cts +document_loaders/web/firecrawl.cjs +document_loaders/web/firecrawl.js +document_loaders/web/firecrawl.d.ts +document_loaders/web/firecrawl.d.cts document_loaders/web/github.cjs document_loaders/web/github.js document_loaders/web/github.d.ts diff --git a/langchain/langchain.config.js b/langchain/langchain.config.js index b286f40db4a8..e25e9fecd913 100644 --- a/langchain/langchain.config.js +++ b/langchain/langchain.config.js @@ -191,6 +191,7 @@ export const config = { "document_loaders/web/hn": "document_loaders/web/hn", "document_loaders/web/imsdb": "document_loaders/web/imsdb", "document_loaders/web/figma": "document_loaders/web/figma", + "document_loaders/web/firecrawl": "document_loaders/web/firecrawl", "document_loaders/web/github": "document_loaders/web/github", "document_loaders/web/notiondb": "document_loaders/web/notiondb", "document_loaders/web/notionapi": 
"document_loaders/web/notionapi", @@ -637,6 +638,7 @@ export const config = { "document_loaders/web/hn", "document_loaders/web/imsdb", "document_loaders/web/figma", + "document_loaders/web/firecrawl", "document_loaders/web/github", "document_loaders/web/pdf", "document_loaders/web/notiondb", diff --git a/langchain/package.json b/langchain/package.json index 9503688aef95..816721dccbd4 100644 --- a/langchain/package.json +++ b/langchain/package.json @@ -578,6 +578,10 @@ "document_loaders/web/figma.js", "document_loaders/web/figma.d.ts", "document_loaders/web/figma.d.cts", + "document_loaders/web/firecrawl.cjs", + "document_loaders/web/firecrawl.js", + "document_loaders/web/firecrawl.d.ts", + "document_loaders/web/firecrawl.d.cts", "document_loaders/web/github.cjs", "document_loaders/web/github.js", "document_loaders/web/github.d.ts", @@ -1230,6 +1234,7 @@ "@google-cloud/storage": "^7.7.0", "@jest/globals": "^29.5.0", "@langchain/scripts": "~0.0", + "@mendable/firecrawl-js": "^0.0.13", "@notionhq/client": "^2.2.10", "@pinecone-database/pinecone": "^1.1.0", "@supabase/supabase-js": "^2.10.0", @@ -1314,6 +1319,7 @@ "@gomomento/sdk-web": "^1.51.1", "@google-ai/generativelanguage": "^0.2.1", "@google-cloud/storage": "^6.10.1 || ^7.7.0", + "@mendable/firecrawl-js": "^0.0.13", "@notionhq/client": "^2.2.10", "@pinecone-database/pinecone": "*", "@supabase/supabase-js": "^2.10.0", @@ -1386,6 +1392,9 @@ "@google-cloud/storage": { "optional": true }, + "@mendable/firecrawl-js": { + "optional": true + }, "@notionhq/client": { "optional": true }, @@ -2826,6 +2835,15 @@ "import": "./document_loaders/web/figma.js", "require": "./document_loaders/web/figma.cjs" }, + "./document_loaders/web/firecrawl": { + "types": { + "import": "./document_loaders/web/firecrawl.d.ts", + "require": "./document_loaders/web/firecrawl.d.cts", + "default": "./document_loaders/web/firecrawl.d.ts" + }, + "import": "./document_loaders/web/firecrawl.js", + "require": "./document_loaders/web/firecrawl.cjs" + 
}, "./document_loaders/web/github": { "types": { "import": "./document_loaders/web/github.d.ts", diff --git a/langchain/src/document_loaders/tests/firecrawl.int.test.ts b/langchain/src/document_loaders/tests/firecrawl.int.test.ts new file mode 100644 index 000000000000..18ac838c3fb3 --- /dev/null +++ b/langchain/src/document_loaders/tests/firecrawl.int.test.ts @@ -0,0 +1,34 @@ +/* eslint-disable no-process-env */ +/* eslint-disable @typescript-eslint/no-non-null-assertion */ +import { test } from "@jest/globals"; +import { Document } from "@langchain/core/documents"; +import { FireCrawlLoader } from "../web/firecrawl.js"; + +test("Test FireCrawlLoader load method with scrape mode", async () => { + const loader = new FireCrawlLoader({ + url: "https://firecrawl.dev", + apiKey: process.env.FIRECRAWL_API_KEY, + mode: "scrape", + }); + + const documents = await loader.load(); + expect(documents).toHaveLength(1); + const document = documents[0]; + expect(document).toBeInstanceOf(Document); + expect(document.pageContent).toBeTruthy(); + expect(document.metadata).toBeTruthy(); +}); + +test("Test FireCrawlLoader load method with crawl mode", async () => { + const loader = new FireCrawlLoader({ + url: "https://firecrawl.dev", + apiKey: process.env.FIRECRAWL_API_KEY, + mode: "crawl", + }); + + const documents = await loader.load(); + const document = documents[0]; + expect(document).toBeInstanceOf(Document); + expect(document.pageContent).toBeTruthy(); + expect(document.metadata).toBeTruthy(); +}, 15000); diff --git a/langchain/src/document_loaders/web/firecrawl.ts b/langchain/src/document_loaders/web/firecrawl.ts new file mode 100644 index 000000000000..9477d984023a --- /dev/null +++ b/langchain/src/document_loaders/web/firecrawl.ts @@ -0,0 +1,127 @@ +import FirecrawlApp from "@mendable/firecrawl-js"; +import { Document, type DocumentInterface } from "@langchain/core/documents"; +import { getEnvironmentVariable } from "@langchain/core/utils/env"; +import { BaseDocumentLoader 
} from "../base.js"; + +/** + * Interface representing the parameters for the Firecrawl loader. It + * includes properties such as the URL to scrape or crawl and the API key. + */ +interface FirecrawlLoaderParameters { + /** + * URL to scrape or crawl + */ + url: string; + + /** + * API key for Firecrawl. If not provided, the default value is the value of the FIRECRAWL_API_KEY environment variable. + */ + apiKey?: string; + + /** + * Mode of operation. Can be either "crawl" or "scrape". If not provided, the default value is "crawl". + */ + mode?: "crawl" | "scrape"; + + /** + * Optional parameters forwarded unchanged to the Firecrawl client's scrape or crawl call. + */ + params?: Record<string, unknown>; +} + +/** + * Subset of a Firecrawl result that the loader reads: the markdown + * content and the accompanying metadata object. + */ +interface FirecrawlDocument { + markdown: string; + metadata: Record<string, unknown>; +} + +/** + * Class representing a document loader for loading data from + * Firecrawl (firecrawl.dev). It extends the BaseDocumentLoader class. + * @example + * ```typescript + * const loader = new FireCrawlLoader({ + * url: "{url}", + * apiKey: "{apiKey}", + * mode: "crawl" + * }); + * const docs = await loader.load(); + * ``` + */ +export class FireCrawlLoader extends BaseDocumentLoader { + private apiKey: string; + + private url: string; + + private mode: "crawl" | "scrape"; + + private params?: Record<string, unknown>; + + constructor(loaderParams: FirecrawlLoaderParameters) { + super(); + const { + apiKey = getEnvironmentVariable("FIRECRAWL_API_KEY"), + url, + mode = "crawl", + params, + } = loaderParams; + if (!apiKey) { + throw new Error( + "Firecrawl API key not set. You can set it as FIRECRAWL_API_KEY in your .env file, or pass it to Firecrawl." + ); + } + + this.apiKey = apiKey; + this.url = url; + this.mode = mode; + this.params = params; + } + + /** + * Loads the data from Firecrawl. In "scrape" mode one document is produced + * for the URL; in "crawl" mode every document returned by the crawl is loaded. + * @returns An array of Documents representing the retrieved data. + * @throws An error if the scrape is unsuccessful, if no usable data is + * returned, or if the mode is not recognized. 
+ */ + public async load(): Promise<DocumentInterface[]> { + const app = new FirecrawlApp({ apiKey: this.apiKey }); + let firecrawlDocs: FirecrawlDocument[]; + + if (this.mode === "scrape") { + const response = await app.scrapeUrl(this.url, this.params); + if (!response.success) { + throw new Error( + `Firecrawl: Failed to scrape URL. Error: ${response.error}` + ); + } + if (!response.data) { + throw new Error("Firecrawl: Scrape succeeded but returned no data."); + } + firecrawlDocs = [response.data as FirecrawlDocument]; + } else if (this.mode === "crawl") { + // NOTE(review): the trailing `true` presumably makes the client wait for + // the crawl to finish before resolving - confirm against firecrawl-js. + const response = await app.crawlUrl(this.url, this.params, true); + if (!Array.isArray(response)) { + throw new Error("Firecrawl: Crawl did not return a list of documents."); + } + firecrawlDocs = response as FirecrawlDocument[]; + } else { + throw new Error( + `Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.` + ); + } + + return firecrawlDocs.map( + (doc) => + new Document({ + pageContent: doc.markdown || "", + metadata: doc.metadata || {}, + }) + ); + } +} diff --git a/langchain/src/load/import_constants.ts b/langchain/src/load/import_constants.ts index 81b09d272a05..3119a6164a92 100644 --- a/langchain/src/load/import_constants.ts +++ b/langchain/src/load/import_constants.ts @@ -91,6 +91,7 @@ export const optionalImportEntrypoints: string[] = [ "langchain/document_loaders/web/hn", "langchain/document_loaders/web/imsdb", "langchain/document_loaders/web/figma", + "langchain/document_loaders/web/firecrawl", "langchain/document_loaders/web/github", "langchain/document_loaders/web/notiondb", "langchain/document_loaders/web/notionapi", diff --git a/yarn.lock b/yarn.lock index d9a9dccb787f..6dfb086f7690 100644 --- a/yarn.lock +++ b/yarn.lock @@ -10092,6 +10092,16 @@ __metadata: languageName: node linkType: hard +"@mendable/firecrawl-js@npm:^0.0.13": + version: 0.0.13 + resolution: "@mendable/firecrawl-js@npm:0.0.13" + dependencies: + axios: ^1.6.8 + dotenv: ^16.4.5 + checksum: 9dbc0b6e5d300bb9ef9f45cebd5c0026ac468863984cdc73a57ed6fdf888eaead5f9e2325c6848d03897c72cab195fffb4ce7d832e39696a11216bc53b417b6d + languageName: node + linkType: hard + "@mistralai/mistralai@npm:^0.1.3": version: 0.1.3 resolution: 
"@mistralai/mistralai@npm:0.1.3" @@ -16954,7 +16964,7 @@ __metadata: languageName: node linkType: hard -"axios@npm:^1.6.2": +"axios@npm:^1.6.2, axios@npm:^1.6.8": version: 1.6.8 resolution: "axios@npm:1.6.8" dependencies: @@ -26670,6 +26680,7 @@ __metadata: "@langchain/openai": ~0.0.28 "@langchain/scripts": ~0.0 "@langchain/textsplitters": ~0.0.0 + "@mendable/firecrawl-js": ^0.0.13 "@notionhq/client": ^2.2.10 "@pinecone-database/pinecone": ^1.1.0 "@supabase/supabase-js": ^2.10.0 @@ -26766,6 +26777,7 @@ __metadata: "@gomomento/sdk-web": ^1.51.1 "@google-ai/generativelanguage": ^0.2.1 "@google-cloud/storage": ^6.10.1 || ^7.7.0 + "@mendable/firecrawl-js": ^0.0.13 "@notionhq/client": ^2.2.10 "@pinecone-database/pinecone": "*" "@supabase/supabase-js": ^2.10.0 @@ -26827,6 +26839,8 @@ __metadata: optional: true "@google-cloud/storage": optional: true + "@mendable/firecrawl-js": + optional: true "@notionhq/client": optional: true "@pinecone-database/pinecone":