langchain-ai · jacoblee93 · Apr 24, 2024 · Apr 22, 2024 · Apr 22, 2024 · Apr 22, 2024
diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/firecrawl.mdx b/docs/core_docs/docs/integrations/document_loaders/web_loaders/firecrawl.mdx
@@ -0,0 +1,38 @@
+---
+hide_table_of_contents: true
+---
+
+# Firecrawl
+
+This guide shows how to use [Firecrawl](https://firecrawl.dev) with LangChain to load web data into an LLM-ready format.
+
+## Overview
+
+[FireCrawl](https://firecrawl.dev) crawls and converts any website into LLM-ready data. It crawls all accessible subpages and gives you clean markdown and metadata for each. No sitemap required.
+
+FireCrawl handles complex tasks such as reverse proxies, caching, rate limits, and content blocked by JavaScript. Built by the [mendable.ai](https://mendable.ai) team.
+
+This guide shows how to scrape and crawl entire websites and load them using the `FireCrawlLoader` in LangChain.
+
+## Setup
+
+Sign up and get your free [FireCrawl API key](https://firecrawl.dev) to start. FireCrawl offers 100 free credits to get you started, and it's [open-source](https://github.com/mendableai/firecrawl) in case you want to self-host.
+
+## Usage
+
+Here's an example of how to use the `FireCrawlLoader` to load web pages:
+
+Firecrawl offers 2 modes: `scrape` and `crawl`. In `scrape` mode, Firecrawl will only scrape the page you provide. In `crawl` mode, Firecrawl will crawl the entire website.
+
+import CodeBlock from "@theme/CodeBlock";
+import Example from "@examples/document_loaders/firecrawl.ts";
+
+```bash npm2yarn
+npm install @mendable/firecrawl-js
+```
+
+<CodeBlock language="typescript">{Example}</CodeBlock>
+
+### Additional Parameters
+
+For `params` you can pass any of the params according to the [Firecrawl documentation](https://docs.firecrawl.dev).
diff --git a/examples/src/document_loaders/firecrawl.ts b/examples/src/document_loaders/firecrawl.ts
@@ -0,0 +1,13 @@
+import { FireCrawlLoader } from "langchain/document_loaders/web/firecrawl";
+
+// Configure the loader. Only `url` is required; `mode` defaults to "crawl" when omitted.
+const loader = new FireCrawlLoader({
+  url: "https://firecrawl.dev", // The URL to scrape or crawl
+  apiKey: process.env.FIRECRAWL_API_KEY, // Optional, defaults to `FIRECRAWL_API_KEY` in your env.
+  mode: "scrape", // The mode to run the crawler in. Can be "scrape" for single urls or "crawl" for all accessible subpages
+  params: {
+    // optional parameters based on Firecrawl API docs
+    // For API documentation, visit https://docs.firecrawl.dev
+  },
+});
+
+// Resolves to an array of documents built from the markdown and metadata Firecrawl returns.
+const docs = await loader.load();
diff --git a/langchain/.gitignore b/langchain/.gitignore
@@ -566,6 +566,10 @@ document_loaders/web/figma.cjs
 document_loaders/web/figma.js
 document_loaders/web/figma.d.ts
 document_loaders/web/figma.d.cts
+document_loaders/web/firecrawl.cjs
+document_loaders/web/firecrawl.js
+document_loaders/web/firecrawl.d.ts
+document_loaders/web/firecrawl.d.cts
 document_loaders/web/github.cjs
 document_loaders/web/github.js
 document_loaders/web/github.d.ts

diff --git a/langchain/langchain.config.js b/langchain/langchain.config.js
@@ -191,6 +191,7 @@ export const config = {
     "document_loaders/web/hn": "document_loaders/web/hn",
     "document_loaders/web/imsdb": "document_loaders/web/imsdb",
     "document_loaders/web/figma": "document_loaders/web/figma",
+    "document_loaders/web/firecrawl": "document_loaders/web/firecrawl",
     "document_loaders/web/github": "document_loaders/web/github",
     "document_loaders/web/notiondb": "document_loaders/web/notiondb",
     "document_loaders/web/notionapi": "document_loaders/web/notionapi",
@@ -637,6 +638,7 @@ export const config = {
     "document_loaders/web/hn",
     "document_loaders/web/imsdb",
     "document_loaders/web/figma",
+    "document_loaders/web/firecrawl",
     "document_loaders/web/github",
     "document_loaders/web/pdf",
     "document_loaders/web/notiondb",

diff --git a/langchain/package.json b/langchain/package.json
@@ -578,6 +578,10 @@
     "document_loaders/web/figma.js",
     "document_loaders/web/figma.d.ts",
     "document_loaders/web/figma.d.cts",
+    "document_loaders/web/firecrawl.cjs",
+    "document_loaders/web/firecrawl.js",
+    "document_loaders/web/firecrawl.d.ts",
+    "document_loaders/web/firecrawl.d.cts",
     "document_loaders/web/github.cjs",
     "document_loaders/web/github.js",
     "document_loaders/web/github.d.ts",
@@ -1230,6 +1234,7 @@
     "@google-cloud/storage": "^7.7.0",
     "@jest/globals": "^29.5.0",
     "@langchain/scripts": "~0.0",
+    "@mendable/firecrawl-js": "^0.0.13",
     "@notionhq/client": "^2.2.10",
     "@pinecone-database/pinecone": "^1.1.0",
     "@supabase/supabase-js": "^2.10.0",
@@ -1314,6 +1319,7 @@
     "@gomomento/sdk-web": "^1.51.1",
     "@google-ai/generativelanguage": "^0.2.1",
     "@google-cloud/storage": "^6.10.1 || ^7.7.0",
+    "@mendable/firecrawl-js": "^0.0.13",
     "@notionhq/client": "^2.2.10",
     "@pinecone-database/pinecone": "*",
     "@supabase/supabase-js": "^2.10.0",
@@ -1386,6 +1392,9 @@
     "@google-cloud/storage": {
       "optional": true
     },
+    "@mendable/firecrawl-js": {
+      "optional": true
+    },
     "@notionhq/client": {
       "optional": true
     },
@@ -2826,6 +2835,15 @@
       "import": "./document_loaders/web/figma.js",
       "require": "./document_loaders/web/figma.cjs"
     },
+    "./document_loaders/web/firecrawl": {
+      "types": {
+        "import": "./document_loaders/web/firecrawl.d.ts",
+        "require": "./document_loaders/web/firecrawl.d.cts",
+        "default": "./document_loaders/web/firecrawl.d.ts"
+      },
+      "import": "./document_loaders/web/firecrawl.js",
+      "require": "./document_loaders/web/firecrawl.cjs"
+    },
     "./document_loaders/web/github": {
       "types": {
         "import": "./document_loaders/web/github.d.ts",

diff --git a/langchain/src/document_loaders/tests/firecrawl.int.test.ts b/langchain/src/document_loaders/tests/firecrawl.int.test.ts
@@ -0,0 +1,34 @@
+/* eslint-disable no-process-env */
+/* eslint-disable @typescript-eslint/no-non-null-assertion */
+import { test } from "@jest/globals";
+import { Document } from "@langchain/core/documents";
+import { FireCrawlLoader } from "../web/firecrawl.js";
+
+test("Test FireCrawlLoader load method with scrape mode", async () => {
+  // Scrape mode targets only the given page, so exactly one document is expected.
+  const scrapeLoader = new FireCrawlLoader({
+    mode: "scrape",
+    url: "https://firecrawl.dev",
+    apiKey: process.env.FIRECRAWL_API_KEY,
+  });
+
+  const loaded = await scrapeLoader.load();
+  expect(loaded).toHaveLength(1);
+
+  // The single result must be a populated Document.
+  const [firstDoc] = loaded;
+  expect(firstDoc).toBeInstanceOf(Document);
+  expect(firstDoc.pageContent).toBeTruthy();
+  expect(firstDoc.metadata).toBeTruthy();
+});
+
+test("Test FireCrawlLoader load method with crawl mode", async () => {
+  const loader = new FireCrawlLoader({
+    url: "https://firecrawl.dev",
+    apiKey: process.env.FIRECRAWL_API_KEY,
+    mode: "crawl",
+  });
+
+  const documents = await loader.load();
+  // Assert the crawl returned something before indexing into it, so an
+  // empty result fails with a clear message rather than an instance check
+  // against `undefined`.
+  expect(documents.length).toBeGreaterThan(0);
+  const document = documents[0];
+  expect(document).toBeInstanceOf(Document);
+  expect(document.pageContent).toBeTruthy();
+  expect(document.metadata).toBeTruthy();
+  // The trailing argument is the per-test timeout in milliseconds.
+}, 15000);
diff --git a/langchain/src/document_loaders/web/firecrawl.ts b/langchain/src/document_loaders/web/firecrawl.ts
@@ -0,0 +1,108 @@
+import FirecrawlApp from "@mendable/firecrawl-js";
+import { Document, type DocumentInterface } from "@langchain/core/documents";
+import { getEnvironmentVariable } from "@langchain/core/utils/env";
+import { BaseDocumentLoader } from "../base.js";
+
+/**
+ * Interface representing the parameters for the Firecrawl loader. It
+ * includes properties such as the URL to scrape or crawl and the API key.
+ * Only `url` is required; every other field is optional.
+ */
+interface FirecrawlLoaderParameters {
+  /**
+   * URL to scrape or crawl
+   */
+  url: string;
+
+  /**
+   * API key for Firecrawl. If not provided, the default value is the value of the FIRECRAWL_API_KEY environment variable.
+   */
+  apiKey?: string;
+
+  /**
+   * Mode of operation. Can be either "crawl" or "scrape". If not provided, the default value is "crawl".
+   */
+  mode?: "crawl" | "scrape";
+  /**
+   * Additional options passed through unchanged as the second argument of
+   * the Firecrawl client's `scrapeUrl` / `crawlUrl` call.
+   */
+  params?: Record<string, unknown>;
+}
+/**
+ * The subset of a Firecrawl result that this loader reads: `markdown`
+ * becomes the document's page content and `metadata` becomes its metadata.
+ */
+interface FirecrawlDocument {
+  markdown: string;
+  metadata: Record<string, unknown>;
+}
+
+/**
+ * Class representing a document loader for loading data from
+ * Firecrawl (firecrawl.dev). It extends the BaseDocumentLoader class.
+ *
+ * In "scrape" mode the loader calls the client's `scrapeUrl` for the
+ * configured URL; in "crawl" mode (the default) it calls `crawlUrl`.
+ * @example
+ * ```typescript
+ * const loader = new FireCrawlLoader({
+ *   url: "{url}",
+ *   apiKey: "{apiKey}",
+ *   mode: "crawl"
+ * });
+ * const docs = await loader.load();
+ * ```
+ */
+export class FireCrawlLoader extends BaseDocumentLoader {
+  private apiKey: string;
+
+  private url: string;
+
+  private mode: "crawl" | "scrape";
+
+  private params?: Record<string, unknown>;
+
+  /**
+   * @param loaderParams The URL plus optional API key, mode and extra
+   * Firecrawl parameters. The API key falls back to the FIRECRAWL_API_KEY
+   * environment variable and the mode falls back to "crawl".
+   * @throws An error if no API key is passed and none is found in the environment.
+   */
+  constructor(loaderParams: FirecrawlLoaderParameters) {
+    super();
+    const {
+      apiKey = getEnvironmentVariable("FIRECRAWL_API_KEY"),
+      url,
+      mode = "crawl",
+      params,
+    } = loaderParams;
+    if (!apiKey) {
+      throw new Error(
+        "Firecrawl API key not set. You can set it as FIRECRAWL_API_KEY in your .env file, or pass it to Firecrawl."
+      );
+    }
+
+    this.apiKey = apiKey;
+    this.url = url;
+    this.mode = mode;
+    this.params = params;
+  }
+
+  /**
+   * Loads the data from Firecrawl and converts each result into a Document.
+   * @returns An array of Documents representing the retrieved data.
+   * @throws An error if the scrape reports failure, if Firecrawl returns no
+   * usable payload, or if the configured mode is not recognized.
+   */
+  public async load(): Promise<DocumentInterface[]> {
+    const app = new FirecrawlApp({ apiKey: this.apiKey });
+    let firecrawlDocs: FirecrawlDocument[];
+
+    if (this.mode === "scrape") {
+      const response = await app.scrapeUrl(this.url, this.params);
+      if (!response.success) {
+        throw new Error(
+          `Firecrawl: Failed to scrape URL. Error: ${response.error}`
+        );
+      }
+      // A "successful" response without a payload would otherwise surface
+      // as a TypeError when reading `markdown` below.
+      if (!response.data) {
+        throw new Error(
+          "Firecrawl: Scrape reported success but returned no data."
+        );
+      }
+      firecrawlDocs = [response.data as FirecrawlDocument];
+    } else if (this.mode === "crawl") {
+      // NOTE(review): the third argument presumably tells the client to wait
+      // for the crawl to finish and return the documents; confirm against
+      // the firecrawl-js version in use.
+      const response = await app.crawlUrl(this.url, this.params, true);
+      // Guard the cast: anything other than an array cannot be mapped.
+      if (!Array.isArray(response)) {
+        throw new Error(
+          "Firecrawl: Crawl did not return a list of documents."
+        );
+      }
+      firecrawlDocs = response as FirecrawlDocument[];
+    } else {
+      // Unreachable for typed callers, but protects untyped JavaScript callers.
+      throw new Error(
+        `Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`
+      );
+    }
+
+    // Missing markdown or metadata fall back to empty values.
+    return firecrawlDocs.map(
+      (doc) =>
+        new Document({
+          pageContent: doc.markdown || "",
+          metadata: doc.metadata || {},
+        })
+    );
+  }
+}
diff --git a/langchain/src/load/import_constants.ts b/langchain/src/load/import_constants.ts
@@ -91,6 +91,7 @@ export const optionalImportEntrypoints: string[] = [
   "langchain/document_loaders/web/hn",
   "langchain/document_loaders/web/imsdb",
   "langchain/document_loaders/web/figma",
+  "langchain/document_loaders/web/firecrawl",
   "langchain/document_loaders/web/github",
   "langchain/document_loaders/web/notiondb",
   "langchain/document_loaders/web/notionapi",

diff --git a/yarn.lock b/yarn.lock
@@ -10092,6 +10092,16 @@ __metadata:
   languageName: node
   linkType: hard
 
+"@mendable/firecrawl-js@npm:^0.0.13":
+  version: 0.0.13
+  resolution: "@mendable/firecrawl-js@npm:0.0.13"
+  dependencies:
+    axios: ^1.6.8
+    dotenv: ^16.4.5
+  checksum: 9dbc0b6e5d300bb9ef9f45cebd5c0026ac468863984cdc73a57ed6fdf888eaead5f9e2325c6848d03897c72cab195fffb4ce7d832e39696a11216bc53b417b6d
+  languageName: node
+  linkType: hard
+
 "@mistralai/mistralai@npm:^0.1.3":
   version: 0.1.3
   resolution: "@mistralai/mistralai@npm:0.1.3"
@@ -16954,7 +16964,7 @@ __metadata:
   languageName: node
   linkType: hard
 
-"axios@npm:^1.6.2":
+"axios@npm:^1.6.2, axios@npm:^1.6.8":
   version: 1.6.8
   resolution: "axios@npm:1.6.8"
   dependencies:
@@ -26670,6 +26680,7 @@ __metadata:
     "@langchain/openai": ~0.0.28
     "@langchain/scripts": ~0.0
     "@langchain/textsplitters": ~0.0.0
+    "@mendable/firecrawl-js": ^0.0.13
     "@notionhq/client": ^2.2.10
     "@pinecone-database/pinecone": ^1.1.0
     "@supabase/supabase-js": ^2.10.0
@@ -26766,6 +26777,7 @@ __metadata:
     "@gomomento/sdk-web": ^1.51.1
     "@google-ai/generativelanguage": ^0.2.1
     "@google-cloud/storage": ^6.10.1 || ^7.7.0
+    "@mendable/firecrawl-js": ^0.0.13
     "@notionhq/client": ^2.2.10
     "@pinecone-database/pinecone": "*"
     "@supabase/supabase-js": ^2.10.0
@@ -26827,6 +26839,8 @@ __metadata:
       optional: true
     "@google-cloud/storage":
       optional: true
+    "@mendable/firecrawl-js":
+      optional: true
     "@notionhq/client":
       optional: true
     "@pinecone-database/pinecone":