-
Notifications
You must be signed in to change notification settings - Fork 2.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
langchain[minor]: Firecrawl Document Loader #5180
Changes from all commits
602ef9a
6aa8895
da7be5b
8782cc1
192a563
9d30f99
3980dc5
6a7bc20
29108ca
3b5c75b
4327d2a
f9a09ec
bed72a3
5744672
68edbd0
e28ef50
ac50398
94bad49
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
--- | ||
hide_table_of_contents: true | ||
--- | ||
|
||
# Firecrawl | ||
|
||
This guide shows how to use [Firecrawl](https://firecrawl.dev) with LangChain to load web data into an LLM-ready format. | ||
|
||
## Overview | ||
|
||
[FireCrawl](https://firecrawl.dev) crawls and converts any website into LLM-ready data. It crawls all accessible subpages and gives you clean markdown and metadata for each. No sitemap required. | ||
|
||
FireCrawl handles complex tasks such as reverse proxies, caching, rate limits, and content blocked by JavaScript. Built by the [mendable.ai](https://mendable.ai) team. | ||
|
||
This guide shows how to scrape and crawl entire websites and load them using the `FireCrawlLoader` in LangChain. | ||
|
||
## Setup | ||
|
||
Sign up and get your free [FireCrawl API key](https://firecrawl.dev) to start. FireCrawl offers 100 free credits to get you started, and it's [open-source](https://github.com/mendableai/firecrawl) in case you want to self-host. | ||
|
||
## Usage | ||
|
||
Here's an example of how to use the `FireCrawlLoader` to load web pages: | ||
|
||
Firecrawl offers 2 modes: `scrape` and `crawl`. In `scrape` mode, Firecrawl will only scrape the page you provide. In `crawl` mode, Firecrawl will crawl the entire website. | ||
|
||
import CodeBlock from "@theme/CodeBlock"; | ||
import Example from "@examples/document_loaders/firecrawl.ts"; | ||
|
||
```bash npm2yarn | ||
npm install @mendable/firecrawl-js | ||
``` | ||
|
||
<CodeBlock language="typescript">{Example}</CodeBlock> | ||
|
||
### Additional Parameters | ||
|
||
The `params` option accepts any parameter described in the [Firecrawl documentation](https://docs.firecrawl.dev). |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
import { FireCrawlLoader } from "langchain/document_loaders/web/firecrawl"; | ||
|
||
const loader = new FireCrawlLoader({
  // The URL to scrape
  url: "https://firecrawl.dev",
  // Optional, defaults to `FIRECRAWL_API_KEY` in your env.
  apiKey: process.env.FIRECRAWL_API_KEY,
  // The mode to run the crawler in.
  // Can be "scrape" for single urls or "crawl" for all accessible subpages
  mode: "scrape",
  params: {
    // optional parameters based on Firecrawl API docs
    // For API documentation, visit https://docs.firecrawl.dev
  },
});

// Fetch the page(s) and convert them into LangChain documents.
const docs = await loader.load();
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -578,6 +578,10 @@ | |
"document_loaders/web/figma.js", | ||
"document_loaders/web/figma.d.ts", | ||
"document_loaders/web/figma.d.cts", | ||
"document_loaders/web/firecrawl.cjs", | ||
"document_loaders/web/firecrawl.js", | ||
"document_loaders/web/firecrawl.d.ts", | ||
"document_loaders/web/firecrawl.d.cts", | ||
"document_loaders/web/github.cjs", | ||
"document_loaders/web/github.js", | ||
"document_loaders/web/github.d.ts", | ||
|
@@ -1230,6 +1234,7 @@ | |
"@google-cloud/storage": "^7.7.0", | ||
"@jest/globals": "^29.5.0", | ||
"@langchain/scripts": "~0.0", | ||
"@mendable/firecrawl-js": "^0.0.13", | ||
"@notionhq/client": "^2.2.10", | ||
"@pinecone-database/pinecone": "^1.1.0", | ||
"@supabase/supabase-js": "^2.10.0", | ||
|
@@ -1314,6 +1319,7 @@ | |
"@gomomento/sdk-web": "^1.51.1", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hey there! I noticed that this PR adds a new hard dependency with the addition of "@mendable/firecrawl-js". I've flagged this for your review as it's important to ensure the impact of this change on the project's dependencies. Keep up the great work! |
||
"@google-ai/generativelanguage": "^0.2.1", | ||
"@google-cloud/storage": "^6.10.1 || ^7.7.0", | ||
"@mendable/firecrawl-js": "^0.0.13", | ||
"@notionhq/client": "^2.2.10", | ||
"@pinecone-database/pinecone": "*", | ||
"@supabase/supabase-js": "^2.10.0", | ||
|
@@ -1386,6 +1392,9 @@ | |
"@google-cloud/storage": { | ||
"optional": true | ||
}, | ||
"@mendable/firecrawl-js": { | ||
"optional": true | ||
}, | ||
"@notionhq/client": { | ||
"optional": true | ||
}, | ||
|
@@ -2826,6 +2835,15 @@ | |
"import": "./document_loaders/web/figma.js", | ||
"require": "./document_loaders/web/figma.cjs" | ||
}, | ||
"./document_loaders/web/firecrawl": { | ||
"types": { | ||
"import": "./document_loaders/web/firecrawl.d.ts", | ||
"require": "./document_loaders/web/firecrawl.d.cts", | ||
"default": "./document_loaders/web/firecrawl.d.ts" | ||
}, | ||
"import": "./document_loaders/web/firecrawl.js", | ||
"require": "./document_loaders/web/firecrawl.cjs" | ||
}, | ||
"./document_loaders/web/github": { | ||
"types": { | ||
"import": "./document_loaders/web/github.d.ts", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
/* eslint-disable no-process-env */ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hey there! I noticed that the recent changes in the FireCrawlLoader tests are accessing environment variables directly. I've flagged this for your review to ensure proper handling of sensitive information. Let me know if you need any further assistance with this. |
||
/* eslint-disable @typescript-eslint/no-non-null-assertion */ | ||
import { test } from "@jest/globals"; | ||
import { Document } from "@langchain/core/documents"; | ||
import { FireCrawlLoader } from "../web/firecrawl.js"; | ||
|
||
// Integration test: scraping a single URL should yield exactly one
// populated Document. Requires FIRECRAWL_API_KEY and network access.
test("Test FireCrawlLoader load method with scrape mode", async () => {
  const scrapeLoader = new FireCrawlLoader({
    url: "https://firecrawl.dev",
    apiKey: process.env.FIRECRAWL_API_KEY,
    mode: "scrape",
  });

  const loaded = await scrapeLoader.load();
  expect(loaded).toHaveLength(1);

  const [firstDoc] = loaded;
  expect(firstDoc).toBeInstanceOf(Document);
  expect(firstDoc.pageContent).toBeTruthy();
  expect(firstDoc.metadata).toBeTruthy();
});
|
||
// Integration test: crawling a site should yield at least one populated
// Document. Requires FIRECRAWL_API_KEY and network access; the crawl is
// given a 15 second timeout.
test("Test FireCrawlLoader load method with crawl mode", async () => {
  const loader = new FireCrawlLoader({
    url: "https://firecrawl.dev",
    apiKey: process.env.FIRECRAWL_API_KEY,
    mode: "crawl",
  });

  const documents = await loader.load();
  // Fail with a clear message when the crawl returns nothing, instead of a
  // confusing "expected undefined to be an instance of Document" below.
  expect(documents.length).toBeGreaterThan(0);
  const document = documents[0];
  expect(document).toBeInstanceOf(Document);
  expect(document.pageContent).toBeTruthy();
  expect(document.metadata).toBeTruthy();
}, 15000);
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
import FirecrawlApp from "@mendable/firecrawl-js"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hey there! I noticed that the recent code changes explicitly access an environment variable using |
||
import { Document, type DocumentInterface } from "@langchain/core/documents"; | ||
import { getEnvironmentVariable } from "@langchain/core/utils/env"; | ||
import { BaseDocumentLoader } from "../base.js"; | ||
|
||
/**
 * Options accepted by the `FireCrawlLoader` constructor.
 */
interface FirecrawlLoaderParameters {
  /** The page to scrape, or the starting point of the crawl. */
  url: string;

  /**
   * Firecrawl API key. When omitted, the loader falls back to the
   * `FIRECRAWL_API_KEY` environment variable.
   */
  apiKey?: string;

  /**
   * How the URL is processed: either "scrape" or "crawl".
   * Defaults to "crawl" when omitted.
   */
  mode?: "scrape" | "crawl";

  /**
   * Extra options handed unchanged to the Firecrawl client call
   * (see https://docs.firecrawl.dev for the supported keys).
   */
  params?: Record<string, unknown>;
}
/**
 * Shape of a single page result returned by Firecrawl, as consumed by the
 * loader. Both fields are optional because the loader substitutes an empty
 * string / empty object when either one is missing from the response.
 */
interface FirecrawlDocument {
  /** Page content rendered as markdown. */
  markdown?: string;
  /** Arbitrary metadata attached to the page by Firecrawl. */
  metadata?: Record<string, unknown>;
}
|
||
/**
 * Document loader that retrieves web content through Firecrawl
 * (firecrawl.dev) and converts it into LangChain documents. It extends
 * the BaseDocumentLoader class.
 *
 * In "scrape" mode a single URL is fetched; in "crawl" mode the Firecrawl
 * client's crawl endpoint is used and every returned page becomes a document.
 * @example
 * ```typescript
 * const loader = new FireCrawlLoader({
 *   url: "{url}",
 *   apiKey: "{apiKey}",
 *   mode: "crawl"
 * });
 * const docs = await loader.load();
 * ```
 */
export class FireCrawlLoader extends BaseDocumentLoader {
  private apiKey: string;

  private url: string;

  private mode: "crawl" | "scrape";

  private params?: Record<string, unknown>;

  /**
   * @param loaderParams URL, optional API key, mode and extra Firecrawl
   * parameters. `apiKey` falls back to the `FIRECRAWL_API_KEY` environment
   * variable and `mode` defaults to "crawl".
   * @throws An error if no API key is supplied and none is found in the
   * environment.
   */
  constructor(loaderParams: FirecrawlLoaderParameters) {
    super();
    const {
      apiKey = getEnvironmentVariable("FIRECRAWL_API_KEY"),
      url,
      mode = "crawl",
      params,
    } = loaderParams;
    if (!apiKey) {
      throw new Error(
        "Firecrawl API key not set. You can set it as FIRECRAWL_API_KEY in your .env file, or pass it to Firecrawl."
      );
    }

    this.apiKey = apiKey;
    this.url = url;
    this.mode = mode;
    this.params = params;
  }

  /**
   * Loads the data from Firecrawl.
   * @returns An array of Documents, one per page returned by Firecrawl. A
   * missing `markdown` becomes an empty `pageContent` and a missing
   * `metadata` becomes an empty object.
   * @throws An error if the scrape is reported as unsuccessful, if the
   * response does not carry the expected payload, or if the mode is not
   * recognized.
   */
  public async load(): Promise<DocumentInterface[]> {
    const app = new FirecrawlApp({ apiKey: this.apiKey });
    let firecrawlDocs: FirecrawlDocument[];

    if (this.mode === "scrape") {
      const response = await app.scrapeUrl(this.url, this.params);
      if (!response.success) {
        throw new Error(
          `Firecrawl: Failed to scrape URL. Error: ${response.error}`
        );
      }
      // Guard against a "successful" response without a payload; otherwise
      // the mapping below would fail with an opaque TypeError.
      if (!response.data) {
        throw new Error(
          "Firecrawl: Scrape reported success but returned no data."
        );
      }
      firecrawlDocs = [response.data as FirecrawlDocument];
    } else if (this.mode === "crawl") {
      // NOTE(review): the third argument is presumably the client's
      // "wait until done" flag, so the call resolves with the crawled
      // pages rather than a job handle; confirm against the pinned
      // @mendable/firecrawl-js version.
      const response = await app.crawlUrl(this.url, this.params, true);
      if (!Array.isArray(response)) {
        throw new Error(
          "Firecrawl: Crawl did not return a list of documents."
        );
      }
      firecrawlDocs = response as FirecrawlDocument[];
    } else {
      // Unreachable for type-checked callers; protects untyped JS callers.
      throw new Error(
        `Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`
      );
    }

    return firecrawlDocs.map(
      (doc) =>
        new Document({
          pageContent: doc.markdown ?? "",
          metadata: doc.metadata ?? {},
        })
    );
  }
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hey there! 👋 I've reviewed the code and noticed that the added lines explicitly access an environment variable using
process.env
. I've flagged this for your review to ensure it aligns with our best practices for handling environment variables. Let me know if you need further assistance with this.