Skip to content

Commit

Permalink
chore(community): Updated Firecrawl Document Loaders to v1 (#6818)
Browse files Browse the repository at this point in the history
Co-authored-by: jacoblee93 <[email protected]>
  • Loading branch information
rafaelsideguide and jacoblee93 authored Oct 23, 2024
1 parent 59217da commit fb3633f
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,9 @@
"\n",
"Here's an example of how to use the `FireCrawlLoader` to load web search results:\n",
"\n",
"Firecrawl offers 2 modes: `scrape` and `crawl`. In `scrape` mode, Firecrawl will only scrape the page you provide. In `crawl` mode, Firecrawl will crawl the entire website.\n",
"Firecrawl offers 3 modes: `scrape`, `crawl`, and `map`. In `scrape` mode, Firecrawl will only scrape the page you provide. In `crawl` mode, Firecrawl will crawl the entire website. In `map` mode, Firecrawl will return semantic links related to the website.\n",
"\n",
"The `formats` parameter (`scrapeOptions.formats` in `crawl` mode) accepts any of `\"markdown\"`, `\"html\"`, and `\"rawHtml\"`. However, each loaded document contains content in only one format, chosen in this order of priority: `markdown`, then `html`, and finally `rawHtml`.\n",
"\n",
"Now we can instantiate our model object and load documents:"
]
Expand Down
4 changes: 2 additions & 2 deletions libs/langchain-community/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@
"@langchain/standard-tests": "0.0.0",
"@layerup/layerup-security": "^1.5.12",
"@libsql/client": "^0.14.0",
"@mendable/firecrawl-js": "^0.0.36",
"@mendable/firecrawl-js": "^1.4.3",
"@mlc-ai/web-llm": ">=0.2.62 <0.3.0",
"@mozilla/readability": "^0.4.4",
"@neondatabase/serverless": "^0.9.1",
Expand Down Expand Up @@ -249,7 +249,7 @@
"@langchain/core": ">=0.2.21 <0.4.0",
"@layerup/layerup-security": "^1.5.12",
"@libsql/client": "^0.14.0",
"@mendable/firecrawl-js": "^0.0.13",
"@mendable/firecrawl-js": "^1.4.3",
"@mlc-ai/web-llm": "*",
"@mozilla/readability": "*",
"@neondatabase/serverless": "*",
Expand Down
43 changes: 33 additions & 10 deletions libs/langchain-community/src/document_loaders/web/firecrawl.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,17 @@ interface FirecrawlLoaderParameters {
*/
apiUrl?: string;
/**
* Mode of operation. Can be either "crawl" or "scrape". If not provided, the default value is "crawl".
* Mode of operation. Can be "crawl", "scrape", or "map". If not provided, the default value is "crawl".
*/
mode?: "crawl" | "scrape";
mode?: "crawl" | "scrape" | "map";
params?: Record<string, unknown>;
}

interface FirecrawlDocument {
markdown: string;
metadata: Record<string, unknown>;
markdown?: string;
html?: string;
rawHtml?: string;
metadata?: Record<string, unknown>;
}

/**
Expand All @@ -54,7 +56,7 @@ export class FireCrawlLoader extends BaseDocumentLoader {

private url: string;

private mode: "crawl" | "scrape";
private mode: "crawl" | "scrape" | "map";

private params?: Record<string, unknown>;

Expand Down Expand Up @@ -96,16 +98,37 @@ export class FireCrawlLoader extends BaseDocumentLoader {
let firecrawlDocs: FirecrawlDocument[];

if (this.mode === "scrape") {
const response = await app.scrapeUrl(this.url, this.params);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const response = await app.scrapeUrl(this.url, this.params as any);
if (!response.success) {
throw new Error(
`Firecrawl: Failed to scrape URL. Error: ${response.error}`
);
}
firecrawlDocs = [response.data as FirecrawlDocument];
firecrawlDocs = [response] as FirecrawlDocument[];
} else if (this.mode === "crawl") {
const response = await app.crawlUrl(this.url, this.params, true);
firecrawlDocs = response as FirecrawlDocument[];
const response = await app.crawlUrl(this.url, this.params);
if (!response.success) {
throw new Error(
`Firecrawl: Failed to crawl URL. Error: ${response.error}`
);
}
firecrawlDocs = response.data as FirecrawlDocument[];
} else if (this.mode === "map") {
const response = await app.mapUrl(this.url, this.params);
if (!response.success) {
throw new Error(
`Firecrawl: Failed to map URL. Error: ${response.error}`
);
}
firecrawlDocs = response.links as FirecrawlDocument[];

return firecrawlDocs.map(
(doc) =>
new Document({
pageContent: JSON.stringify(doc),
})
);
} else {
throw new Error(
`Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`
Expand All @@ -115,7 +138,7 @@ export class FireCrawlLoader extends BaseDocumentLoader {
return firecrawlDocs.map(
(doc) =>
new Document({
pageContent: doc.markdown || "",
pageContent: doc.markdown || doc.html || doc.rawHtml || "",
metadata: doc.metadata || {},
})
);
Expand Down
32 changes: 24 additions & 8 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -11482,7 +11482,7 @@ __metadata:
"@langchain/standard-tests": 0.0.0
"@layerup/layerup-security": ^1.5.12
"@libsql/client": ^0.14.0
"@mendable/firecrawl-js": ^0.0.36
"@mendable/firecrawl-js": ^1.4.3
"@mlc-ai/web-llm": ">=0.2.62 <0.3.0"
"@mozilla/readability": ^0.4.4
"@neondatabase/serverless": ^0.9.1
Expand Down Expand Up @@ -11655,7 +11655,7 @@ __metadata:
"@langchain/core": ">=0.2.21 <0.4.0"
"@layerup/layerup-security": ^1.5.12
"@libsql/client": ^0.14.0
"@mendable/firecrawl-js": ^0.0.13
"@mendable/firecrawl-js": ^1.4.3
"@mlc-ai/web-llm": "*"
"@mozilla/readability": "*"
"@neondatabase/serverless": "*"
Expand Down Expand Up @@ -13069,16 +13069,16 @@ __metadata:
languageName: node
linkType: hard

"@mendable/firecrawl-js@npm:^0.0.36":
version: 0.0.36
resolution: "@mendable/firecrawl-js@npm:0.0.36"
"@mendable/firecrawl-js@npm:^1.4.3":
version: 1.4.3
resolution: "@mendable/firecrawl-js@npm:1.4.3"
dependencies:
axios: ^1.6.8
dotenv: ^16.4.5
uuid: ^9.0.1
isows: ^1.0.4
typescript-event-target: ^1.1.1
zod: ^3.23.8
zod-to-json-schema: ^3.23.0
checksum: 93ac8a7d9d25c04d4f618e282c136af06cf7712ec3402922531094c3cdab0e59d6f484a7f583022032eb58f914a0494193f2fd22986edd0f6712a29545edf95a
checksum: ee36a4ceaca326d1ae86a714500dd0698060a63e84e0d5c83fb14967ac36755cd4b0b42a260c5e7b63914551a94ead2f4c712a76b9e58a6580dd5ca8628e851a
languageName: node
linkType: hard

Expand Down Expand Up @@ -31332,6 +31332,15 @@ __metadata:
languageName: node
linkType: hard

"isows@npm:^1.0.4":
version: 1.0.4
resolution: "isows@npm:1.0.4"
peerDependencies:
ws: "*"
checksum: a3ee62e3d6216abb3adeeb2a551fe2e7835eac87b05a6ecc3e7739259bf5f8e83290501f49e26137390c8093f207fc3378d4a7653aab76ad7bbab4b2dba9c5b9
languageName: node
linkType: hard

"isstream@npm:0.1.2":
version: 0.1.2
resolution: "isstream@npm:0.1.2"
Expand Down Expand Up @@ -41933,6 +41942,13 @@ __metadata:
languageName: node
linkType: hard

"typescript-event-target@npm:^1.1.1":
version: 1.1.1
resolution: "typescript-event-target@npm:1.1.1"
checksum: ad9eaf0f3c161c4062c33d80ac5235e7c32c5b6f79eabcf23f9c39c7617b9337a4d9d4a2249340a84626fa68abeed38f5973dff547fecd71164f96d0b11af516
languageName: node
linkType: hard

"typescript@npm:<5.2.0, typescript@npm:~5.1.6":
version: 5.1.6
resolution: "typescript@npm:5.1.6"
Expand Down

0 comments on commit fb3633f

Please sign in to comment.