diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/firecrawl.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/firecrawl.mdx
new file mode 100644
index 000000000000..685e78726c71
--- /dev/null
+++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/firecrawl.mdx
@@ -0,0 +1,38 @@
+---
+hide_table_of_contents: true
+---
+
+# Firecrawl
+
+This guide shows how to use [Firecrawl](https://firecrawl.dev) with LangChain to load web data into an LLM-ready format.
+
+## Overview
+
+[FireCrawl](https://firecrawl.dev) crawls and converts any website into LLM-ready data. It crawls all accessible subpages and gives you clean markdown and metadata for each. No sitemap required.
+
+FireCrawl handles complex tasks such as reverse proxies, caching, rate limits, and content blocked by JavaScript. Built by the [mendable.ai](https://mendable.ai) team.
+
+This guide shows how to scrape and crawl entire websites and load them using the `FireCrawlLoader` in LangChain.
+
+## Setup
+
+Sign up and get your free [FireCrawl API key](https://firecrawl.dev) to start. FireCrawl offers 100 free credits to get you started, and it's [open-source](https://github.com/mendableai/firecrawl) in case you want to self-host.
+
+## Usage
+
+Here's an example of how to use the `FireCrawlLoader` to load web pages as documents:
+
+Firecrawl offers 2 modes: `scrape` and `crawl`. In `scrape` mode, Firecrawl will only scrape the page you provide. In `crawl` mode, Firecrawl will crawl the entire website.
+
+import CodeBlock from "@theme/CodeBlock";
+import Example from "@examples/document_loaders/firecrawl.ts";
+
+```bash npm2yarn
+npm install @mendable/firecrawl-js
+```
+
+<CodeBlock language="typescript">{Example}</CodeBlock>
+
+### Additional Parameters
+
+For `params` you can pass any of the params according to the [Firecrawl documentation](https://docs.firecrawl.dev).
diff --git a/examples/src/document_loaders/firecrawl.ts b/examples/src/document_loaders/firecrawl.ts
new file mode 100644
index 000000000000..d8524e1b4b6c
--- /dev/null
+++ b/examples/src/document_loaders/firecrawl.ts
@@ -0,0 +1,13 @@
+import { FireCrawlLoader } from "langchain/document_loaders/web/firecrawl";
+
+const loader = new FireCrawlLoader({
+ url: "https://firecrawl.dev", // The URL to scrape, or the starting URL when crawling
+ apiKey: process.env.FIRECRAWL_API_KEY, // Optional, defaults to `FIRECRAWL_API_KEY` in your env.
+ mode: "scrape", // "scrape" loads only this URL; "crawl" loads all accessible subpages. Defaults to "crawl".
+ params: {
+ // Optional parameters passed through to Firecrawl.
+ // For API documentation, visit https://docs.firecrawl.dev
+ },
+});
+
+const docs = await loader.load(); // One Document per returned page
diff --git a/langchain/.gitignore b/langchain/.gitignore
index ec6142a2e774..aa56ad5db796 100644
--- a/langchain/.gitignore
+++ b/langchain/.gitignore
@@ -566,6 +566,10 @@ document_loaders/web/figma.cjs
document_loaders/web/figma.js
document_loaders/web/figma.d.ts
document_loaders/web/figma.d.cts
+document_loaders/web/firecrawl.cjs
+document_loaders/web/firecrawl.js
+document_loaders/web/firecrawl.d.ts
+document_loaders/web/firecrawl.d.cts
document_loaders/web/github.cjs
document_loaders/web/github.js
document_loaders/web/github.d.ts
diff --git a/langchain/langchain.config.js b/langchain/langchain.config.js
index b286f40db4a8..e25e9fecd913 100644
--- a/langchain/langchain.config.js
+++ b/langchain/langchain.config.js
@@ -191,6 +191,7 @@ export const config = {
"document_loaders/web/hn": "document_loaders/web/hn",
"document_loaders/web/imsdb": "document_loaders/web/imsdb",
"document_loaders/web/figma": "document_loaders/web/figma",
+ "document_loaders/web/firecrawl": "document_loaders/web/firecrawl",
"document_loaders/web/github": "document_loaders/web/github",
"document_loaders/web/notiondb": "document_loaders/web/notiondb",
"document_loaders/web/notionapi": "document_loaders/web/notionapi",
@@ -637,6 +638,7 @@ export const config = {
"document_loaders/web/hn",
"document_loaders/web/imsdb",
"document_loaders/web/figma",
+ "document_loaders/web/firecrawl",
"document_loaders/web/github",
"document_loaders/web/pdf",
"document_loaders/web/notiondb",
diff --git a/langchain/package.json b/langchain/package.json
index 9503688aef95..816721dccbd4 100644
--- a/langchain/package.json
+++ b/langchain/package.json
@@ -578,6 +578,10 @@
"document_loaders/web/figma.js",
"document_loaders/web/figma.d.ts",
"document_loaders/web/figma.d.cts",
+ "document_loaders/web/firecrawl.cjs",
+ "document_loaders/web/firecrawl.js",
+ "document_loaders/web/firecrawl.d.ts",
+ "document_loaders/web/firecrawl.d.cts",
"document_loaders/web/github.cjs",
"document_loaders/web/github.js",
"document_loaders/web/github.d.ts",
@@ -1230,6 +1234,7 @@
"@google-cloud/storage": "^7.7.0",
"@jest/globals": "^29.5.0",
"@langchain/scripts": "~0.0",
+ "@mendable/firecrawl-js": "^0.0.13",
"@notionhq/client": "^2.2.10",
"@pinecone-database/pinecone": "^1.1.0",
"@supabase/supabase-js": "^2.10.0",
@@ -1314,6 +1319,7 @@
"@gomomento/sdk-web": "^1.51.1",
"@google-ai/generativelanguage": "^0.2.1",
"@google-cloud/storage": "^6.10.1 || ^7.7.0",
+ "@mendable/firecrawl-js": "^0.0.13",
"@notionhq/client": "^2.2.10",
"@pinecone-database/pinecone": "*",
"@supabase/supabase-js": "^2.10.0",
@@ -1386,6 +1392,9 @@
"@google-cloud/storage": {
"optional": true
},
+ "@mendable/firecrawl-js": {
+ "optional": true
+ },
"@notionhq/client": {
"optional": true
},
@@ -2826,6 +2835,15 @@
"import": "./document_loaders/web/figma.js",
"require": "./document_loaders/web/figma.cjs"
},
+ "./document_loaders/web/firecrawl": {
+ "types": {
+ "import": "./document_loaders/web/firecrawl.d.ts",
+ "require": "./document_loaders/web/firecrawl.d.cts",
+ "default": "./document_loaders/web/firecrawl.d.ts"
+ },
+ "import": "./document_loaders/web/firecrawl.js",
+ "require": "./document_loaders/web/firecrawl.cjs"
+ },
"./document_loaders/web/github": {
"types": {
"import": "./document_loaders/web/github.d.ts",
diff --git a/langchain/src/document_loaders/tests/firecrawl.int.test.ts b/langchain/src/document_loaders/tests/firecrawl.int.test.ts
new file mode 100644
index 000000000000..18ac838c3fb3
--- /dev/null
+++ b/langchain/src/document_loaders/tests/firecrawl.int.test.ts
@@ -0,0 +1,34 @@
+/* eslint-disable no-process-env */
+/* eslint-disable @typescript-eslint/no-non-null-assertion */
+import { test } from "@jest/globals";
+import { Document } from "@langchain/core/documents";
+import { FireCrawlLoader } from "../web/firecrawl.js";
+
+test("Test FireCrawlLoader load method with scrape mode", async () => {
+  // Scrape mode wraps the single scrape response, so one Document is expected.
+  const scrapeLoader = new FireCrawlLoader({
+    mode: "scrape",
+    url: "https://firecrawl.dev",
+    apiKey: process.env.FIRECRAWL_API_KEY,
+  });
+  const loaded = await scrapeLoader.load();
+  expect(loaded).toHaveLength(1);
+  const [page] = loaded;
+  expect(page).toBeInstanceOf(Document);
+  expect(page.pageContent).toBeTruthy();
+  expect(page.metadata).toBeTruthy();
+});
+
+test("Test FireCrawlLoader load method with crawl mode", async () => {
+  const loader = new FireCrawlLoader({
+    url: "https://firecrawl.dev",
+    apiKey: process.env.FIRECRAWL_API_KEY,
+    mode: "crawl",
+  });
+  const documents = await loader.load();
+  expect(documents.length).toBeGreaterThan(0); // empty crawl must fail clearly
+  const document = documents[0];
+  expect(document).toBeInstanceOf(Document);
+  expect(document.pageContent).toBeTruthy();
+  expect(document.metadata).toBeTruthy();
+}, 15000); // Jest per-test timeout in milliseconds
diff --git a/langchain/src/document_loaders/web/firecrawl.ts b/langchain/src/document_loaders/web/firecrawl.ts
new file mode 100644
index 000000000000..9477d984023a
--- /dev/null
+++ b/langchain/src/document_loaders/web/firecrawl.ts
@@ -0,0 +1,108 @@
+import FirecrawlApp from "@mendable/firecrawl-js";
+import { Document, type DocumentInterface } from "@langchain/core/documents";
+import { getEnvironmentVariable } from "@langchain/core/utils/env";
+import { BaseDocumentLoader } from "../base.js";
+
+/**
+ * Options accepted by the `FireCrawlLoader` constructor.
+ */
+interface FirecrawlLoaderParameters {
+  /** URL to scrape or crawl. */
+  url: string;
+
+  /**
+   * API key for Firecrawl. If not provided, the value of the
+   * FIRECRAWL_API_KEY environment variable is used.
+   */
+  apiKey?: string;
+
+  /** Mode of operation: "crawl" (the default) or "scrape". */
+  mode?: "crawl" | "scrape";
+
+  /**
+   * Extra options handed as-is to the Firecrawl scrape/crawl call.
+   */
+  params?: Record<string, unknown>;
+}
+interface FirecrawlDocument {
+  markdown: string; // becomes the Document's pageContent
+  metadata: Record<string, unknown>; // becomes the Document's metadata
+}
+
+/**
+ * Document loader that retrieves web content through Firecrawl
+ * (firecrawl.dev). It extends the BaseDocumentLoader class.
+ * @example
+ * ```typescript
+ * const loader = new FireCrawlLoader({
+ *   url: "{url}",
+ *   apiKey: "{apiKey}",
+ *   mode: "crawl"
+ * });
+ * const docs = await loader.load();
+ * ```
+ */
+export class FireCrawlLoader extends BaseDocumentLoader {
+  private apiKey: string;
+
+  private url: string;
+
+  private mode: "crawl" | "scrape";
+
+  private params?: Record<string, unknown>;
+
+  /** @throws Error if no API key is passed and FIRECRAWL_API_KEY yields none. */
+  constructor(loaderParams: FirecrawlLoaderParameters) {
+    super();
+    const {
+      apiKey = getEnvironmentVariable("FIRECRAWL_API_KEY"),
+      url,
+      mode = "crawl",
+      params,
+    } = loaderParams;
+    if (!apiKey) {
+      throw new Error(
+        "Firecrawl API key not set. You can set it as FIRECRAWL_API_KEY in your .env file, or pass it to Firecrawl."
+      );
+    }
+    this.apiKey = apiKey;
+    this.url = url;
+    this.mode = mode;
+    this.params = params;
+  }
+
+  /**
+   * Fetches the configured URL through Firecrawl in the selected mode.
+   * @returns One Document per Firecrawl result (markdown as pageContent).
+   * @throws An error if a scrape is unsuccessful or the mode is unrecognized.
+   */
+  public async load(): Promise<DocumentInterface[]> {
+    const app = new FirecrawlApp({ apiKey: this.apiKey });
+    let firecrawlDocs: FirecrawlDocument[];
+    if (this.mode === "scrape") {
+      const response = await app.scrapeUrl(this.url, this.params);
+      if (!response.success) {
+        throw new Error(
+          `Firecrawl: Failed to scrape URL. Error: ${response.error}`
+        );
+      }
+      firecrawlDocs = [response.data as FirecrawlDocument];
+    } else if (this.mode === "crawl") {
+      // NOTE(review): `true` is presumably waitUntilDone — confirm with SDK.
+      const response = await app.crawlUrl(this.url, this.params, true);
+      firecrawlDocs = response as FirecrawlDocument[];
+    } else {
+      throw new Error(
+        `Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`
+      );
+    }
+
+    return firecrawlDocs.map(
+      (doc) =>
+        new Document({
+          pageContent: doc.markdown || "",
+          metadata: doc.metadata || {},
+        })
+    );
+  }
+}
diff --git a/langchain/src/load/import_constants.ts b/langchain/src/load/import_constants.ts
index 81b09d272a05..3119a6164a92 100644
--- a/langchain/src/load/import_constants.ts
+++ b/langchain/src/load/import_constants.ts
@@ -91,6 +91,7 @@ export const optionalImportEntrypoints: string[] = [
"langchain/document_loaders/web/hn",
"langchain/document_loaders/web/imsdb",
"langchain/document_loaders/web/figma",
+ "langchain/document_loaders/web/firecrawl",
"langchain/document_loaders/web/github",
"langchain/document_loaders/web/notiondb",
"langchain/document_loaders/web/notionapi",
diff --git a/yarn.lock b/yarn.lock
index d9a9dccb787f..6dfb086f7690 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -10092,6 +10092,16 @@ __metadata:
languageName: node
linkType: hard
+"@mendable/firecrawl-js@npm:^0.0.13":
+ version: 0.0.13
+ resolution: "@mendable/firecrawl-js@npm:0.0.13"
+ dependencies:
+ axios: ^1.6.8
+ dotenv: ^16.4.5
+ checksum: 9dbc0b6e5d300bb9ef9f45cebd5c0026ac468863984cdc73a57ed6fdf888eaead5f9e2325c6848d03897c72cab195fffb4ce7d832e39696a11216bc53b417b6d
+ languageName: node
+ linkType: hard
+
"@mistralai/mistralai@npm:^0.1.3":
version: 0.1.3
resolution: "@mistralai/mistralai@npm:0.1.3"
@@ -16954,7 +16964,7 @@ __metadata:
languageName: node
linkType: hard
-"axios@npm:^1.6.2":
+"axios@npm:^1.6.2, axios@npm:^1.6.8":
version: 1.6.8
resolution: "axios@npm:1.6.8"
dependencies:
@@ -26670,6 +26680,7 @@ __metadata:
"@langchain/openai": ~0.0.28
"@langchain/scripts": ~0.0
"@langchain/textsplitters": ~0.0.0
+ "@mendable/firecrawl-js": ^0.0.13
"@notionhq/client": ^2.2.10
"@pinecone-database/pinecone": ^1.1.0
"@supabase/supabase-js": ^2.10.0
@@ -26766,6 +26777,7 @@ __metadata:
"@gomomento/sdk-web": ^1.51.1
"@google-ai/generativelanguage": ^0.2.1
"@google-cloud/storage": ^6.10.1 || ^7.7.0
+ "@mendable/firecrawl-js": ^0.0.13
"@notionhq/client": ^2.2.10
"@pinecone-database/pinecone": "*"
"@supabase/supabase-js": ^2.10.0
@@ -26827,6 +26839,8 @@ __metadata:
optional: true
"@google-cloud/storage":
optional: true
+ "@mendable/firecrawl-js":
+ optional: true
"@notionhq/client":
optional: true
"@pinecone-database/pinecone":