From fc14f675f1af1e8fee23710f208259da9fcfd778 Mon Sep 17 00:00:00 2001 From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 15 Oct 2024 10:13:28 -0300 Subject: [PATCH] Community: Updated Firecrawl Document Loader to v1 (#26548) This PR updates the Firecrawl Document Loader to use the recently released V1 API of Firecrawl. **Key Updates:** **Firecrawl V1 Integration:** Updated the document loader to leverage the new Firecrawl V1 API for improved performance, reliability, and developer experience. **Map Functionality Added:** Introduced the map mode for more flexible document loading options. These updates enhance the integration and provide access to the latest features of Firecrawl. --------- Co-authored-by: Erick Friis Co-authored-by: Harrison Chase --- .../document_loaders/firecrawl.ipynb | 201 ++++++++-------- .../document_loaders/firecrawl.py | 219 ++++++++++++++++-- 2 files changed, 307 insertions(+), 113 deletions(-) diff --git a/docs/docs/integrations/document_loaders/firecrawl.ipynb b/docs/docs/integrations/document_loaders/firecrawl.ipynb index 4abf280da8f92..a6ec4ff418b01 100644 --- a/docs/docs/integrations/document_loaders/firecrawl.ipynb +++ b/docs/docs/integrations/document_loaders/firecrawl.ipynb @@ -26,33 +26,32 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setup\n", - "\n", - "### Credentials \n", - "\n", - "You will need to get your own API key. Go to [this page](https://firecrawl.dev) to learn more." + "## Setup" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": {}, + "execution_count": null, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "import getpass\n", - "import os\n", - "\n", - "if \"FIRECRAWL_API_KEY\" not in os.environ:\n", - " os.environ[\"FIRECRAWL_API_KEY\"] = getpass.getpass(\"Enter your Firecrawl API key: \")" + "pip install firecrawl-py" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Installation\n", - "\n", - "You will need to install both the `langchain_community` and `firecrawl-py` pacakges:" + "## Usage" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You will need to get your own API key. See https://firecrawl.dev" ] }, { @@ -61,151 +60,161 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install -qU firecrawl-py==0.0.20 langchain_community" + "from langchain_community.document_loaders.firecrawl import FireCrawlLoader" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(metadata={'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}, page_content='Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)\\n Join the waitlist to turn any website into an API with AI\\n\\n\\n\\n[šŸ”„ Firecrawl](/)\\n\\n* [Playground](/playground)\\n \\n* [Docs](https://docs.firecrawl.dev)\\n \\n* [Pricing](/pricing)\\n \\n* [Blog](/blog)\\n \\n* Beta Features\\n\\n[Log In](/signin)\\n[Log In](/signin)\\n[Sign Up](/signin/signup)\\n 8.9k\\n\\n[šŸ’„ Get 2 months free with yearly plan](/pricing)\\n\\nTurn websites into \\n_LLM-ready_ data\\n=====================================\\n\\nPower your AI apps with clean data crawled from any website. It\\'s also open-source.\\n\\nStart for free (500 credits)Start for free[Talk to us](https://calendly.com/d/cj83-ngq-knk/meet-firecrawl)\\n\\nA product by\\n\\n[![Mendable Logo](https://www.firecrawl.dev/images/mendable_logo_transparent.png)Mendable](https://mendable.ai)\\n\\n![Example Webpage](https://www.firecrawl.dev/multiple-websites.png)\\n\\nCrawl, Scrape, Clean\\n--------------------\\n\\nWe crawl all accessible subpages and give you clean markdown for each. No sitemap required.\\n\\n \\n [\\\\\\n {\\\\\\n \"url\": \"https://www.firecrawl.dev/\",\\\\\\n \"markdown\": \"## Welcome to Firecrawl\\\\\\n Firecrawl is a web scraper that allows you to extract the content of a webpage.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.firecrawl.dev/features\",\\\\\\n \"markdown\": \"## Features\\\\\\n Discover how Firecrawl\\'s cutting-edge features can \\\\\\n transform your data operations.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.firecrawl.dev/pricing\",\\\\\\n \"markdown\": \"## Pricing Plans\\\\\\n Choose the perfect plan that fits your needs.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.firecrawl.dev/about\",\\\\\\n \"markdown\": \"## About Us\\\\\\n Learn more about Firecrawl\\'s mission and the \\\\\\n team behind our innovative platform.\"\\\\\\n }\\\\\\n ]\\n \\n\\nNote: The markdown has been edited for display purposes.\\n\\nTrusted by Top Companies\\n------------------------\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/zapier.png)](https://www.zapier.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/gamma.svg)](https://gamma.app)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/nvidia-com.png)](https://www.nvidia.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/teller-io.svg)](https://www.teller.io)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/stackai.svg)](https://www.stack-ai.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/palladiumdigital-co-uk.svg)](https://www.palladiumdigital.co.uk)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/worldwide-casting-com.svg)](https://www.worldwide-casting.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/open-gov-sg.png)](https://www.open.gov.sg)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/bain-com.svg)](https://www.bain.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/demand-io.svg)](https://www.demand.io)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/nocodegarden-io.png)](https://www.nocodegarden.io)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/cyberagent-co-jp.svg)](https://www.cyberagent.co.jp)\\n\\nIntegrate today\\n---------------\\n\\nEnhance your applications with top-tier web scraping and crawling capabilities.\\n\\n#### Scrape\\n\\nExtract markdown or structured data from websites quickly and efficiently.\\n\\n#### Crawling\\n\\nNavigate and retrieve data from all accessible subpages, even without a sitemap.\\n\\nNode.js\\n\\nPython\\n\\ncURL\\n\\n1\\n\\n2\\n\\n3\\n\\n4\\n\\n5\\n\\n6\\n\\n7\\n\\n8\\n\\n9\\n\\n10\\n\\n// npm install @mendable/firecrawl-js \\n \\nimport FirecrawlApp from \\'@mendable/firecrawl-js\\'; \\n \\nconst app \\\\= new FirecrawlApp({ apiKey: \"fc-YOUR\\\\_API\\\\_KEY\" }); \\n \\n// Scrape a website: \\nconst scrapeResult \\\\= await app.scrapeUrl(\\'firecrawl.dev\\'); \\n \\nconsole.log(scrapeResult.data.markdown)\\n\\n#### Use well-known tools\\n\\nAlready fully integrated with the greatest existing tools and workflows.\\n\\n[![LlamaIndex](https://www.firecrawl.dev/logos/llamaindex.svg)](https://docs.llamaindex.ai/en/stable/examples/data_connectors/WebPageDemo/#using-firecrawl-reader/)\\n[![Langchain](https://www.firecrawl.dev/integrations/langchain.png)](https://python.langchain.com/docs/integrations/document_loaders/firecrawl/)\\n[![Dify](https://www.firecrawl.dev/logos/dify.png)](https://dify.ai/blog/dify-ai-blog-integrated-with-firecrawl/)\\n[![Dify](https://www.firecrawl.dev/integrations/langflow_2.png)](https://www.langflow.org/)\\n[![Flowise](https://www.firecrawl.dev/integrations/flowise.png)](https://flowiseai.com/)\\n[![CrewAI](https://www.firecrawl.dev/integrations/crewai.png)](https://crewai.com/)\\n\\n#### Start for free, scale easily\\n\\nKick off your journey for free and scale seamlessly as your project expands.\\n\\n[Try it out](/signin/signup)\\n\\n#### Open-source\\n\\nDeveloped transparently and collaboratively. Join our community of contributors.\\n\\n[Check out our repo](https://github.com/mendableai/firecrawl)\\n\\nWe handle the hard stuff\\n------------------------\\n\\nRotating proxies, caching, rate limits, js-blocked content and more\\n\\n#### Crawling\\n\\nFirecrawl crawls all accessible subpages, even without a sitemap.\\n\\n#### Dynamic content\\n\\nFirecrawl gathers data even if a website uses javascript to render content.\\n\\n#### To Markdown\\n\\nFirecrawl returns clean, well formatted markdown - ready for use in LLM applications\\n\\n#### Crawling Orchestration\\n\\nFirecrawl orchestrates the crawling process in parallel for the fastest results.\\n\\n#### Caching\\n\\nFirecrawl caches content, so you don\\'t have to wait for a full scrape unless new content exists.\\n\\n#### Built for AI\\n\\nBuilt by LLM engineers, for LLM engineers. Giving you clean data the way you want it.\\n\\nOur wall of love\\n\\nDon\\'t take our word for it\\n--------------------------\\n\\n![Greg Kamradt](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-02.0afeb750.jpg&w=96&q=75)\\n\\nGreg Kamradt\\n\\n[@GregKamradt](https://twitter.com/GregKamradt/status/1780300642197840307)\\n\\nLLM structured data via API, handling requests, cleaning, and crawling. Enjoyed the early preview.\\n\\n![Amit Naik](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-03.ff5dbe11.jpg&w=96&q=75)\\n\\nAmit Naik\\n\\n[@suprgeek](https://twitter.com/suprgeek/status/1780338213351035254)\\n\\n#llm success with RAG relies on Retrieval. Firecrawl by @mendableai structures web content for processing. šŸ‘\\n\\n![Jerry Liu](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-04.76bef0df.jpg&w=96&q=75)\\n\\nJerry Liu\\n\\n[@jerryjliu0](https://twitter.com/jerryjliu0/status/1781122933349572772)\\n\\nFirecrawl is awesome šŸ”„ Turns web pages into structured markdown for LLM apps, thanks to @mendableai.\\n\\n![Bardia Pourvakil](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-01.025350bc.jpeg&w=96&q=75)\\n\\nBardia Pourvakil\\n\\n[@thepericulum](https://twitter.com/thepericulum/status/1781397799487078874)\\n\\nThese guys ship. I wanted types for their node SDK, and less than an hour later, I got them. Can\\'t recommend them enough.\\n\\n![latentsauce šŸ§˜šŸ½](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-07.c2285d35.jpeg&w=96&q=75)\\n\\nlatentsauce šŸ§˜šŸ½\\n\\n[@latentsauce](https://twitter.com/latentsauce/status/1781738253927735331)\\n\\nFirecrawl simplifies data preparation significantly, exactly what I was hoping for. Thank you for creating Firecrawl ā¤ļøā¤ļøā¤ļø\\n\\n![Greg Kamradt](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-02.0afeb750.jpg&w=96&q=75)\\n\\nGreg Kamradt\\n\\n[@GregKamradt](https://twitter.com/GregKamradt/status/1780300642197840307)\\n\\nLLM structured data via API, handling requests, cleaning, and crawling. Enjoyed the early preview.\\n\\n![Amit Naik](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-03.ff5dbe11.jpg&w=96&q=75)\\n\\nAmit Naik\\n\\n[@suprgeek](https://twitter.com/suprgeek/status/1780338213351035254)\\n\\n#llm success with RAG relies on Retrieval. Firecrawl by @mendableai structures web content for processing. šŸ‘\\n\\n![Jerry Liu](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-04.76bef0df.jpg&w=96&q=75)\\n\\nJerry Liu\\n\\n[@jerryjliu0](https://twitter.com/jerryjliu0/status/1781122933349572772)\\n\\nFirecrawl is awesome šŸ”„ Turns web pages into structured markdown for LLM apps, thanks to @mendableai.\\n\\n![Bardia Pourvakil](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-01.025350bc.jpeg&w=96&q=75)\\n\\nBardia Pourvakil\\n\\n[@thepericulum](https://twitter.com/thepericulum/status/1781397799487078874)\\n\\nThese guys ship. I wanted types for their node SDK, and less than an hour later, I got them. Can\\'t recommend them enough.\\n\\n![latentsauce šŸ§˜šŸ½](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-07.c2285d35.jpeg&w=96&q=75)\\n\\nlatentsauce šŸ§˜šŸ½\\n\\n[@latentsauce](https://twitter.com/latentsauce/status/1781738253927735331)\\n\\nFirecrawl simplifies data preparation significantly, exactly what I was hoping for. Thank you for creating Firecrawl ā¤ļøā¤ļøā¤ļø\\n\\n![Michael Ning](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-05.76d7cd3e.png&w=96&q=75)\\n\\nMichael Ning\\n\\n[](#)\\n\\nFirecrawl is impressive, saving us 2/3 the tokens and allowing gpt3.5turbo use over gpt4. Major savings in time and money.\\n\\n![Alex Reibman šŸ–‡ļø](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-06.4ee7cf5a.jpeg&w=96&q=75)\\n\\nAlex Reibman šŸ–‡ļø\\n\\n[@AlexReibman](https://twitter.com/AlexReibman/status/1780299595484131836)\\n\\nMoved our internal agent\\'s web scraping tool from Apify to Firecrawl because it benchmarked 50x faster with AgentOps.\\n\\n![Michael](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-08.0bed40be.jpeg&w=96&q=75)\\n\\nMichael\\n\\n[@michael\\\\_chomsky](#)\\n\\nI really like some of the design decisions Firecrawl made, so I really want to share with others.\\n\\n![Paul Scott](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-09.d303b5b4.png&w=96&q=75)\\n\\nPaul Scott\\n\\n[@palebluepaul](https://twitter.com/palebluepaul)\\n\\nAppreciating your lean approach, Firecrawl ticks off everything on our list without the cost prohibitive overkill.\\n\\n![Michael Ning](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-05.76d7cd3e.png&w=96&q=75)\\n\\nMichael Ning\\n\\n[](#)\\n\\nFirecrawl is impressive, saving us 2/3 the tokens and allowing gpt3.5turbo use over gpt4. Major savings in time and money.\\n\\n![Alex Reibman šŸ–‡ļø](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-06.4ee7cf5a.jpeg&w=96&q=75)\\n\\nAlex Reibman šŸ–‡ļø\\n\\n[@AlexReibman](https://twitter.com/AlexReibman/status/1780299595484131836)\\n\\nMoved our internal agent\\'s web scraping tool from Apify to Firecrawl because it benchmarked 50x faster with AgentOps.\\n\\n![Michael](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-08.0bed40be.jpeg&w=96&q=75)\\n\\nMichael\\n\\n[@michael\\\\_chomsky](#)\\n\\nI really like some of the design decisions Firecrawl made, so I really want to share with others.\\n\\n![Paul Scott](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-09.d303b5b4.png&w=96&q=75)\\n\\nPaul Scott\\n\\n[@palebluepaul](https://twitter.com/palebluepaul)\\n\\nAppreciating your lean approach, Firecrawl ticks off everything on our list without the cost prohibitive overkill.\\n\\nFlexible Pricing\\n----------------\\n\\nStart for free, then scale as you grow\\n\\nYearly (17% off)Yearly (2 months free)Monthly\\n\\nFree Plan\\n---------\\n\\n500 credits\\n\\n$0/month\\n\\n* Scrape 500 pages\\n* 5 /scrape per min\\n* 1 /crawl per min\\n\\nGet Started\\n\\nHobby\\n-----\\n\\n3,000 credits\\n\\n$16/month\\n\\n* Scrape 3,000 pages\\n* 10 /scrape per min\\n* 3 /crawl per min\\n\\nSubscribe\\n\\nStandardMost Popular\\n--------------------\\n\\n100,000 credits\\n\\n$83/month\\n\\n* Scrape 100,000 pages\\n* 50 /scrape per min\\n* 10 /crawl per min\\n\\nSubscribe\\n\\nGrowth\\n------\\n\\n500,000 credits\\n\\n$333/month\\n\\n* Scrape 500,000 pages\\n* 500 /scrape per min\\n* 50 /crawl per min\\n* Priority Support\\n\\nSubscribe\\n\\nEnterprise Plan\\n---------------\\n\\nUnlimited credits. Custom RPMs.\\n\\nTalk to us\\n\\n* Top priority support\\n* Feature Acceleration\\n* SLAs\\n* Account Manager\\n* Custom rate limits volume\\n* Custom concurrency limits\\n* Beta features access\\n* CEO\\'s number\\n\\n\\\\* a /scrape refers to the [scrape](https://docs.firecrawl.dev/api-reference/endpoint/scrape)\\n API endpoint.\\n\\n\\\\* a /crawl refers to the [crawl](https://docs.firecrawl.dev/api-reference/endpoint/crawl)\\n API endpoint.\\n\\nScrape Credits\\n--------------\\n\\nScrape credits are consumed for each API request, varying by endpoint and feature.\\n\\n| Features | Credits per page |\\n| --- | --- |\\n| Scrape(/scrape) | 1 |\\n| Crawl(/crawl) | 1 |\\n| Search(/search) | 1 |\\n| Scrape + LLM extraction (/scrape) | 50 |\\n\\n[šŸ”„](/)\\n\\nReady to _Build?_\\n-----------------\\n\\nStart scraping web data for your AI apps today. \\nNo credit card needed.\\n\\n[Get Started](/signin)\\n\\n[Talk to us](https://calendly.com/d/cj83-ngq-knk/meet-firecrawl)\\n\\nFAQ\\n---\\n\\nFrequently asked questions about Firecrawl\\n\\n#### General\\n\\nWhat is Firecrawl?\\n\\nFirecrawl turns entire websites into clean, LLM-ready markdown or structured data. Scrape, crawl and extract the web with a single API. Ideal for AI companies looking to empower their LLM applications with web data.\\n\\nWhat sites work?\\n\\nFirecrawl is best suited for business websites, docs and help centers. We currently don\\'t support social media platforms.\\n\\nWho can benefit from using Firecrawl?\\n\\nFirecrawl is tailored for LLM engineers, data scientists, AI researchers, and developers looking to harness web data for training machine learning models, market research, content aggregation, and more. It simplifies the data preparation process, allowing professionals to focus on insights and model development.\\n\\nIs Firecrawl open-source?\\n\\nYes, it is. You can check out the repository on GitHub. Keep in mind that this repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository.\\n\\n#### Scraping & Crawling\\n\\nHow does Firecrawl handle dynamic content on websites?\\n\\nUnlike traditional web scrapers, Firecrawl is equipped to handle dynamic content rendered with JavaScript. It ensures comprehensive data collection from all accessible subpages, making it a reliable tool for scraping websites that rely heavily on JS for content delivery.\\n\\nWhy is it not crawling all the pages?\\n\\nThere are a few reasons why Firecrawl may not be able to crawl all the pages of a website. Some common reasons include rate limiting, and anti-scraping mechanisms, disallowing the crawler from accessing certain pages. If you\\'re experiencing issues with the crawler, please reach out to our support team at hello@firecrawl.com.\\n\\nCan Firecrawl crawl websites without a sitemap?\\n\\nYes, Firecrawl can access and crawl all accessible subpages of a website, even in the absence of a sitemap. This feature enables users to gather data from a wide array of web sources with minimal setup.\\n\\nWhat formats can Firecrawl convert web data into?\\n\\nFirecrawl specializes in converting web data into clean, well-formatted markdown. This format is particularly suited for LLM applications, offering a structured yet flexible way to represent web content.\\n\\nHow does Firecrawl ensure the cleanliness of the data?\\n\\nFirecrawl employs advanced algorithms to clean and structure the scraped data, removing unnecessary elements and formatting the content into readable markdown. This process ensures that the data is ready for use in LLM applications without further preprocessing.\\n\\nIs Firecrawl suitable for large-scale data scraping projects?\\n\\nAbsolutely. Firecrawl offers various pricing plans, including a Scale plan that supports scraping of millions of pages. With features like caching and scheduled syncs, it\\'s designed to efficiently handle large-scale data scraping and continuous updates, making it ideal for enterprises and large projects.\\n\\nDoes it respect robots.txt?\\n\\nYes, Firecrawl crawler respects the rules set in a website\\'s robots.txt file. If you notice any issues with the way Firecrawl interacts with your website, you can adjust the robots.txt file to control the crawler\\'s behavior. Firecrawl user agent name is \\'FirecrawlAgent\\'. If you notice any behavior that is not expected, please let us know at hello@firecrawl.com.\\n\\nWhat measures does Firecrawl take to handle web scraping challenges like rate limits and caching?\\n\\nFirecrawl is built to navigate common web scraping challenges, including reverse proxies, rate limits, and caching. It smartly manages requests and employs caching techniques to minimize bandwidth usage and avoid triggering anti-scraping mechanisms, ensuring reliable data collection.\\n\\nDoes Firecrawl handle captcha or authentication?\\n\\nFirecrawl avoids captcha by using stealth proxyies. When it encounters captcha, it attempts to solve it automatically, but this is not always possible. We are working to add support for more captcha solving methods. Firecrawl can handle authentication by providing auth headers to the API.\\n\\n#### API Related\\n\\nWhere can I find my API key?\\n\\nClick on the dashboard button on the top navigation menu when logged in and you will find your API key in the main screen and under API Keys.\\n\\n#### Billing\\n\\nIs Firecrawl free?\\n\\nFirecrawl is free for the first 500 scraped pages (500 free credits). After that, you can upgrade to our Standard or Scale plans for more credits.\\n\\nIs there a pay per use plan instead of monthly?\\n\\nNo we do not currently offer a pay per use plan, instead you can upgrade to our Standard or Growth plans for more credits and higher rate limits.\\n\\nHow many credit does scraping, crawling, and extraction cost?\\n\\nScraping costs 1 credit per page. Crawling costs 1 credit per page.\\n\\nDo you charge for failed requests (scrape, crawl, extract)?\\n\\nWe do not charge for any failed requests (scrape, crawl, extract). Please contact support at help@firecrawl.dev if you have any questions.\\n\\nWhat payment methods do you accept?\\n\\nWe accept payments through Stripe which accepts most major credit cards, debit cards, and PayPal.\\n\\n[šŸ”„](/)\\n\\nĀ© A product by Mendable.ai - All rights reserved.\\n\\n[StatusStatus](https://firecrawl.betteruptime.com)\\n[Terms of ServiceTerms of Service](/terms-of-service)\\n[Privacy PolicyPrivacy Policy](/privacy-policy)\\n\\n[Twitter](https://twitter.com/mendableai)\\n[GitHub](https://github.com/mendableai)\\n[Discord](https://discord.gg/gSmWdAkdwd)\\n\\n###### Helpful Links\\n\\n* [Status](https://firecrawl.betteruptime.com/)\\n \\n* [Pricing](/pricing)\\n \\n* [Blog](https://www.firecrawl.dev/blog)\\n \\n* [Docs](https://docs.firecrawl.dev)\\n \\n\\nBacked by![Y Combinator Logo](https://www.firecrawl.dev/images/yc.svg)\\n\\n![SOC 2 Type II](https://www.firecrawl.dev/soc2type2badge.png)\\n\\n###### Resources\\n\\n* [Community](#0)\\n \\n* [Terms of service](#0)\\n \\n* [Collaboration features](#0)\\n \\n\\n###### Legals\\n\\n* [Refund policy](#0)\\n \\n* [Terms & Conditions](#0)\\n \\n* [Privacy policy](#0)\\n \\n* [Brand Kit](#0)')" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "## Initialization\n", - "\n", - "### Modes\n", + "loader = FireCrawlLoader(\n", + " api_key=\"YOUR_API_KEY\", url=\"https://firecrawl.dev\", mode=\"scrape\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pages = []\n", + "for doc in loader.lazy_load():\n", + " pages.append(doc)\n", + " if len(pages) >= 10:\n", + " # do some paged operation, e.g.\n", + " # index.upsert(page)\n", "\n", - "- `scrape`: Scrape single url and return the markdown.\n", - "- `crawl`: Crawl the url and all accessible sub pages and return the markdown for each one." + " pages = []" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from langchain_community.document_loaders import FireCrawlLoader\n", + "pages" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modes\n", "\n", - "loader = FireCrawlLoader(url=\"https://firecrawl.dev\", mode=\"crawl\")" + "- `scrape`: Scrape single url and return the markdown.\n", + "- `crawl`: Crawl the url and all accessible sub pages and return the markdown for each one.\n", + "- `map`: Maps the URL and returns a list of semantically related pages." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Load" + "### Crawl\n", + "\n" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Document(metadata={'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}, page_content='Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)\\n Join the waitlist to turn any website into an API with AI\\n\\n\\n\\n[šŸ”„ Firecrawl](/)\\n\\n* [Playground](/playground)\\n \\n* [Docs](https://docs.firecrawl.dev)\\n \\n* [Pricing](/pricing)\\n \\n* [Blog](/blog)\\n \\n* Beta Features\\n\\n[Log In](/signin)\\n[Log In](/signin)\\n[Sign Up](/signin/signup)\\n 8.9k\\n\\n[šŸ’„ Get 2 months free with yearly plan](/pricing)\\n\\nTurn websites into \\n_LLM-ready_ data\\n=====================================\\n\\nPower your AI apps with clean data crawled from any website. It\\'s also open-source.\\n\\nStart for free (500 credits)Start for free[Talk to us](https://calendly.com/d/cj83-ngq-knk/meet-firecrawl)\\n\\nA product by\\n\\n[![Mendable Logo](https://www.firecrawl.dev/images/mendable_logo_transparent.png)Mendable](https://mendable.ai)\\n\\n![Example Webpage](https://www.firecrawl.dev/multiple-websites.png)\\n\\nCrawl, Scrape, Clean\\n--------------------\\n\\nWe crawl all accessible subpages and give you clean markdown for each. No sitemap required.\\n\\n \\n [\\\\\\n {\\\\\\n \"url\": \"https://www.firecrawl.dev/\",\\\\\\n \"markdown\": \"## Welcome to Firecrawl\\\\\\n Firecrawl is a web scraper that allows you to extract the content of a webpage.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.firecrawl.dev/features\",\\\\\\n \"markdown\": \"## Features\\\\\\n Discover how Firecrawl\\'s cutting-edge features can \\\\\\n transform your data operations.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.firecrawl.dev/pricing\",\\\\\\n \"markdown\": \"## Pricing Plans\\\\\\n Choose the perfect plan that fits your needs.\"\\\\\\n },\\\\\\n {\\\\\\n \"url\": \"https://www.firecrawl.dev/about\",\\\\\\n \"markdown\": \"## About Us\\\\\\n Learn more about Firecrawl\\'s mission and the \\\\\\n team behind our innovative platform.\"\\\\\\n }\\\\\\n ]\\n \\n\\nNote: The markdown has been edited for display purposes.\\n\\nTrusted by Top Companies\\n------------------------\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/zapier.png)](https://www.zapier.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/gamma.svg)](https://gamma.app)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/nvidia-com.png)](https://www.nvidia.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/teller-io.svg)](https://www.teller.io)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/stackai.svg)](https://www.stack-ai.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/palladiumdigital-co-uk.svg)](https://www.palladiumdigital.co.uk)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/worldwide-casting-com.svg)](https://www.worldwide-casting.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/open-gov-sg.png)](https://www.open.gov.sg)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/bain-com.svg)](https://www.bain.com)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/demand-io.svg)](https://www.demand.io)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/nocodegarden-io.png)](https://www.nocodegarden.io)\\n\\n[![Customer Logo](https://www.firecrawl.dev/logos/cyberagent-co-jp.svg)](https://www.cyberagent.co.jp)\\n\\nIntegrate today\\n---------------\\n\\nEnhance your applications with top-tier web scraping and crawling capabilities.\\n\\n#### Scrape\\n\\nExtract markdown or structured data from websites quickly and efficiently.\\n\\n#### Crawling\\n\\nNavigate and retrieve data from all accessible subpages, even without a sitemap.\\n\\nNode.js\\n\\nPython\\n\\ncURL\\n\\n1\\n\\n2\\n\\n3\\n\\n4\\n\\n5\\n\\n6\\n\\n7\\n\\n8\\n\\n9\\n\\n10\\n\\n// npm install @mendable/firecrawl-js \\n \\nimport FirecrawlApp from \\'@mendable/firecrawl-js\\'; \\n \\nconst app \\\\= new FirecrawlApp({ apiKey: \"fc-YOUR\\\\_API\\\\_KEY\" }); \\n \\n// Scrape a website: \\nconst scrapeResult \\\\= await app.scrapeUrl(\\'firecrawl.dev\\'); \\n \\nconsole.log(scrapeResult.data.markdown)\\n\\n#### Use well-known tools\\n\\nAlready fully integrated with the greatest existing tools and workflows.\\n\\n[![LlamaIndex](https://www.firecrawl.dev/logos/llamaindex.svg)](https://docs.llamaindex.ai/en/stable/examples/data_connectors/WebPageDemo/#using-firecrawl-reader/)\\n[![Langchain](https://www.firecrawl.dev/integrations/langchain.png)](https://python.langchain.com/docs/integrations/document_loaders/firecrawl/)\\n[![Dify](https://www.firecrawl.dev/logos/dify.png)](https://dify.ai/blog/dify-ai-blog-integrated-with-firecrawl/)\\n[![Dify](https://www.firecrawl.dev/integrations/langflow_2.png)](https://www.langflow.org/)\\n[![Flowise](https://www.firecrawl.dev/integrations/flowise.png)](https://flowiseai.com/)\\n[![CrewAI](https://www.firecrawl.dev/integrations/crewai.png)](https://crewai.com/)\\n\\n#### Start for free, scale easily\\n\\nKick off your journey for free and scale seamlessly as your project expands.\\n\\n[Try it out](/signin/signup)\\n\\n#### Open-source\\n\\nDeveloped transparently and collaboratively. Join our community of contributors.\\n\\n[Check out our repo](https://github.com/mendableai/firecrawl)\\n\\nWe handle the hard stuff\\n------------------------\\n\\nRotating proxies, caching, rate limits, js-blocked content and more\\n\\n#### Crawling\\n\\nFirecrawl crawls all accessible subpages, even without a sitemap.\\n\\n#### Dynamic content\\n\\nFirecrawl gathers data even if a website uses javascript to render content.\\n\\n#### To Markdown\\n\\nFirecrawl returns clean, well formatted markdown - ready for use in LLM applications\\n\\n#### Crawling Orchestration\\n\\nFirecrawl orchestrates the crawling process in parallel for the fastest results.\\n\\n#### Caching\\n\\nFirecrawl caches content, so you don\\'t have to wait for a full scrape unless new content exists.\\n\\n#### Built for AI\\n\\nBuilt by LLM engineers, for LLM engineers. Giving you clean data the way you want it.\\n\\nOur wall of love\\n\\nDon\\'t take our word for it\\n--------------------------\\n\\n![Greg Kamradt](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-02.0afeb750.jpg&w=96&q=75)\\n\\nGreg Kamradt\\n\\n[@GregKamradt](https://twitter.com/GregKamradt/status/1780300642197840307)\\n\\nLLM structured data via API, handling requests, cleaning, and crawling. Enjoyed the early preview.\\n\\n![Amit Naik](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-03.ff5dbe11.jpg&w=96&q=75)\\n\\nAmit Naik\\n\\n[@suprgeek](https://twitter.com/suprgeek/status/1780338213351035254)\\n\\n#llm success with RAG relies on Retrieval. Firecrawl by @mendableai structures web content for processing. šŸ‘\\n\\n![Jerry Liu](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-04.76bef0df.jpg&w=96&q=75)\\n\\nJerry Liu\\n\\n[@jerryjliu0](https://twitter.com/jerryjliu0/status/1781122933349572772)\\n\\nFirecrawl is awesome šŸ”„ Turns web pages into structured markdown for LLM apps, thanks to @mendableai.\\n\\n![Bardia Pourvakil](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-01.025350bc.jpeg&w=96&q=75)\\n\\nBardia Pourvakil\\n\\n[@thepericulum](https://twitter.com/thepericulum/status/1781397799487078874)\\n\\nThese guys ship. I wanted types for their node SDK, and less than an hour later, I got them. Can\\'t recommend them enough.\\n\\n![latentsauce šŸ§˜šŸ½](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-07.c2285d35.jpeg&w=96&q=75)\\n\\nlatentsauce šŸ§˜šŸ½\\n\\n[@latentsauce](https://twitter.com/latentsauce/status/1781738253927735331)\\n\\nFirecrawl simplifies data preparation significantly, exactly what I was hoping for. Thank you for creating Firecrawl ā¤ļøā¤ļøā¤ļø\\n\\n![Greg Kamradt](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-02.0afeb750.jpg&w=96&q=75)\\n\\nGreg Kamradt\\n\\n[@GregKamradt](https://twitter.com/GregKamradt/status/1780300642197840307)\\n\\nLLM structured data via API, handling requests, cleaning, and crawling. Enjoyed the early preview.\\n\\n![Amit Naik](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-03.ff5dbe11.jpg&w=96&q=75)\\n\\nAmit Naik\\n\\n[@suprgeek](https://twitter.com/suprgeek/status/1780338213351035254)\\n\\n#llm success with RAG relies on Retrieval. Firecrawl by @mendableai structures web content for processing. šŸ‘\\n\\n![Jerry Liu](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-04.76bef0df.jpg&w=96&q=75)\\n\\nJerry Liu\\n\\n[@jerryjliu0](https://twitter.com/jerryjliu0/status/1781122933349572772)\\n\\nFirecrawl is awesome šŸ”„ Turns web pages into structured markdown for LLM apps, thanks to @mendableai.\\n\\n![Bardia Pourvakil](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-01.025350bc.jpeg&w=96&q=75)\\n\\nBardia Pourvakil\\n\\n[@thepericulum](https://twitter.com/thepericulum/status/1781397799487078874)\\n\\nThese guys ship. I wanted types for their node SDK, and less than an hour later, I got them. Can\\'t recommend them enough.\\n\\n![latentsauce šŸ§˜šŸ½](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-07.c2285d35.jpeg&w=96&q=75)\\n\\nlatentsauce šŸ§˜šŸ½\\n\\n[@latentsauce](https://twitter.com/latentsauce/status/1781738253927735331)\\n\\nFirecrawl simplifies data preparation significantly, exactly what I was hoping for. Thank you for creating Firecrawl ā¤ļøā¤ļøā¤ļø\\n\\n![Michael Ning](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-05.76d7cd3e.png&w=96&q=75)\\n\\nMichael Ning\\n\\n[](#)\\n\\nFirecrawl is impressive, saving us 2/3 the tokens and allowing gpt3.5turbo use over gpt4. Major savings in time and money.\\n\\n![Alex Reibman šŸ–‡ļø](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-06.4ee7cf5a.jpeg&w=96&q=75)\\n\\nAlex Reibman šŸ–‡ļø\\n\\n[@AlexReibman](https://twitter.com/AlexReibman/status/1780299595484131836)\\n\\nMoved our internal agent\\'s web scraping tool from Apify to Firecrawl because it benchmarked 50x faster with AgentOps.\\n\\n![Michael](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-08.0bed40be.jpeg&w=96&q=75)\\n\\nMichael\\n\\n[@michael\\\\_chomsky](#)\\n\\nI really like some of the design decisions Firecrawl made, so I really want to share with others.\\n\\n![Paul Scott](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-09.d303b5b4.png&w=96&q=75)\\n\\nPaul Scott\\n\\n[@palebluepaul](https://twitter.com/palebluepaul)\\n\\nAppreciating your lean approach, Firecrawl ticks off everything on our list without the cost prohibitive overkill.\\n\\n![Michael Ning](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-05.76d7cd3e.png&w=96&q=75)\\n\\nMichael Ning\\n\\n[](#)\\n\\nFirecrawl is impressive, saving us 2/3 the tokens and allowing gpt3.5turbo use over gpt4. Major savings in time and money.\\n\\n![Alex Reibman šŸ–‡ļø](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-06.4ee7cf5a.jpeg&w=96&q=75)\\n\\nAlex Reibman šŸ–‡ļø\\n\\n[@AlexReibman](https://twitter.com/AlexReibman/status/1780299595484131836)\\n\\nMoved our internal agent\\'s web scraping tool from Apify to Firecrawl because it benchmarked 50x faster with AgentOps.\\n\\n![Michael](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-08.0bed40be.jpeg&w=96&q=75)\\n\\nMichael\\n\\n[@michael\\\\_chomsky](#)\\n\\nI really like some of the design decisions Firecrawl made, so I really want to share with others.\\n\\n![Paul Scott](https://www.firecrawl.dev/_next/image?url=%2F_next%2Fstatic%2Fmedia%2Ftestimonial-09.d303b5b4.png&w=96&q=75)\\n\\nPaul Scott\\n\\n[@palebluepaul](https://twitter.com/palebluepaul)\\n\\nAppreciating your lean approach, Firecrawl ticks off everything on our list without the cost prohibitive overkill.\\n\\nFlexible Pricing\\n----------------\\n\\nStart for free, then scale as you grow\\n\\nYearly (17% off)Yearly (2 months free)Monthly\\n\\nFree Plan\\n---------\\n\\n500 credits\\n\\n$0/month\\n\\n* Scrape 500 pages\\n* 5 /scrape per min\\n* 1 /crawl per min\\n\\nGet Started\\n\\nHobby\\n-----\\n\\n3,000 credits\\n\\n$16/month\\n\\n* Scrape 3,000 pages\\n* 10 /scrape per min\\n* 3 /crawl per min\\n\\nSubscribe\\n\\nStandardMost Popular\\n--------------------\\n\\n100,000 credits\\n\\n$83/month\\n\\n* Scrape 100,000 pages\\n* 50 /scrape per min\\n* 10 /crawl per min\\n\\nSubscribe\\n\\nGrowth\\n------\\n\\n500,000 credits\\n\\n$333/month\\n\\n* Scrape 500,000 pages\\n* 500 /scrape per min\\n* 50 /crawl per min\\n* Priority Support\\n\\nSubscribe\\n\\nEnterprise Plan\\n---------------\\n\\nUnlimited credits. Custom RPMs.\\n\\nTalk to us\\n\\n* Top priority support\\n* Feature Acceleration\\n* SLAs\\n* Account Manager\\n* Custom rate limits volume\\n* Custom concurrency limits\\n* Beta features access\\n* CEO\\'s number\\n\\n\\\\* a /scrape refers to the [scrape](https://docs.firecrawl.dev/api-reference/endpoint/scrape)\\n API endpoint.\\n\\n\\\\* a /crawl refers to the [crawl](https://docs.firecrawl.dev/api-reference/endpoint/crawl)\\n API endpoint.\\n\\nScrape Credits\\n--------------\\n\\nScrape credits are consumed for each API request, varying by endpoint and feature.\\n\\n| Features | Credits per page |\\n| --- | --- |\\n| Scrape(/scrape) | 1 |\\n| Crawl(/crawl) | 1 |\\n| Search(/search) | 1 |\\n| Scrape + LLM extraction (/scrape) | 50 |\\n\\n[šŸ”„](/)\\n\\nReady to _Build?_\\n-----------------\\n\\nStart scraping web data for your AI apps today. \\nNo credit card needed.\\n\\n[Get Started](/signin)\\n\\n[Talk to us](https://calendly.com/d/cj83-ngq-knk/meet-firecrawl)\\n\\nFAQ\\n---\\n\\nFrequently asked questions about Firecrawl\\n\\n#### General\\n\\nWhat is Firecrawl?\\n\\nFirecrawl turns entire websites into clean, LLM-ready markdown or structured data. Scrape, crawl and extract the web with a single API. Ideal for AI companies looking to empower their LLM applications with web data.\\n\\nWhat sites work?\\n\\nFirecrawl is best suited for business websites, docs and help centers. We currently don\\'t support social media platforms.\\n\\nWho can benefit from using Firecrawl?\\n\\nFirecrawl is tailored for LLM engineers, data scientists, AI researchers, and developers looking to harness web data for training machine learning models, market research, content aggregation, and more. It simplifies the data preparation process, allowing professionals to focus on insights and model development.\\n\\nIs Firecrawl open-source?\\n\\nYes, it is. You can check out the repository on GitHub. Keep in mind that this repository is currently in its early stages of development. We are in the process of merging custom modules into this mono repository.\\n\\n#### Scraping & Crawling\\n\\nHow does Firecrawl handle dynamic content on websites?\\n\\nUnlike traditional web scrapers, Firecrawl is equipped to handle dynamic content rendered with JavaScript. It ensures comprehensive data collection from all accessible subpages, making it a reliable tool for scraping websites that rely heavily on JS for content delivery.\\n\\nWhy is it not crawling all the pages?\\n\\nThere are a few reasons why Firecrawl may not be able to crawl all the pages of a website. Some common reasons include rate limiting, and anti-scraping mechanisms, disallowing the crawler from accessing certain pages. If you\\'re experiencing issues with the crawler, please reach out to our support team at hello@firecrawl.com.\\n\\nCan Firecrawl crawl websites without a sitemap?\\n\\nYes, Firecrawl can access and crawl all accessible subpages of a website, even in the absence of a sitemap. This feature enables users to gather data from a wide array of web sources with minimal setup.\\n\\nWhat formats can Firecrawl convert web data into?\\n\\nFirecrawl specializes in converting web data into clean, well-formatted markdown. This format is particularly suited for LLM applications, offering a structured yet flexible way to represent web content.\\n\\nHow does Firecrawl ensure the cleanliness of the data?\\n\\nFirecrawl employs advanced algorithms to clean and structure the scraped data, removing unnecessary elements and formatting the content into readable markdown. This process ensures that the data is ready for use in LLM applications without further preprocessing.\\n\\nIs Firecrawl suitable for large-scale data scraping projects?\\n\\nAbsolutely. Firecrawl offers various pricing plans, including a Scale plan that supports scraping of millions of pages. With features like caching and scheduled syncs, it\\'s designed to efficiently handle large-scale data scraping and continuous updates, making it ideal for enterprises and large projects.\\n\\nDoes it respect robots.txt?\\n\\nYes, Firecrawl crawler respects the rules set in a website\\'s robots.txt file. If you notice any issues with the way Firecrawl interacts with your website, you can adjust the robots.txt file to control the crawler\\'s behavior. Firecrawl user agent name is \\'FirecrawlAgent\\'. If you notice any behavior that is not expected, please let us know at hello@firecrawl.com.\\n\\nWhat measures does Firecrawl take to handle web scraping challenges like rate limits and caching?\\n\\nFirecrawl is built to navigate common web scraping challenges, including reverse proxies, rate limits, and caching. It smartly manages requests and employs caching techniques to minimize bandwidth usage and avoid triggering anti-scraping mechanisms, ensuring reliable data collection.\\n\\nDoes Firecrawl handle captcha or authentication?\\n\\nFirecrawl avoids captcha by using stealth proxyies. When it encounters captcha, it attempts to solve it automatically, but this is not always possible. We are working to add support for more captcha solving methods. Firecrawl can handle authentication by providing auth headers to the API.\\n\\n#### API Related\\n\\nWhere can I find my API key?\\n\\nClick on the dashboard button on the top navigation menu when logged in and you will find your API key in the main screen and under API Keys.\\n\\n#### Billing\\n\\nIs Firecrawl free?\\n\\nFirecrawl is free for the first 500 scraped pages (500 free credits). After that, you can upgrade to our Standard or Scale plans for more credits.\\n\\nIs there a pay per use plan instead of monthly?\\n\\nNo we do not currently offer a pay per use plan, instead you can upgrade to our Standard or Growth plans for more credits and higher rate limits.\\n\\nHow many credit does scraping, crawling, and extraction cost?\\n\\nScraping costs 1 credit per page. Crawling costs 1 credit per page.\\n\\nDo you charge for failed requests (scrape, crawl, extract)?\\n\\nWe do not charge for any failed requests (scrape, crawl, extract). Please contact support at help@firecrawl.dev if you have any questions.\\n\\nWhat payment methods do you accept?\\n\\nWe accept payments through Stripe which accepts most major credit cards, debit cards, and PayPal.\\n\\n[šŸ”„](/)\\n\\nĀ© A product by Mendable.ai - All rights reserved.\\n\\n[StatusStatus](https://firecrawl.betteruptime.com)\\n[Terms of ServiceTerms of Service](/terms-of-service)\\n[Privacy PolicyPrivacy Policy](/privacy-policy)\\n\\n[Twitter](https://twitter.com/mendableai)\\n[GitHub](https://github.com/mendableai)\\n[Discord](https://discord.gg/gSmWdAkdwd)\\n\\n###### Helpful Links\\n\\n* [Status](https://firecrawl.betteruptime.com/)\\n \\n* [Pricing](/pricing)\\n \\n* [Blog](https://www.firecrawl.dev/blog)\\n \\n* [Docs](https://docs.firecrawl.dev)\\n \\n\\nBacked by![Y Combinator Logo](https://www.firecrawl.dev/images/yc.svg)\\n\\n![SOC 2 Type II](https://www.firecrawl.dev/soc2type2badge.png)\\n\\n###### Resources\\n\\n* [Community](#0)\\n \\n* [Terms of service](#0)\\n \\n* [Collaboration features](#0)\\n \\n\\n###### Legals\\n\\n* [Refund policy](#0)\\n \\n* [Terms & Conditions](#0)\\n \\n* [Privacy policy](#0)\\n \\n* [Brand Kit](#0)')" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "docs = loader.load()\n", - "\n", - "docs[0]" + "loader = FireCrawlLoader(\n", + " api_key=\"YOUR_API_KEY\",\n", + " url=\"https://firecrawl.dev\",\n", + " mode=\"crawl\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "data = loader.load()" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}\n" - ] - } - ], + "outputs": [], "source": [ - "print(docs[0].metadata)" + "print(pages[0].page_content[:100])\n", + "print(pages[0].metadata)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Lazy Load\n", + "#### Crawl Options\n", "\n", - "You can use lazy loading to minimize memory requirements." + "You can also pass `params` to the loader. This is a dictionary of options to pass to the crawler. See the [FireCrawl API documentation](https://github.com/mendableai/firecrawl-py) for more information." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "### Map" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "pages = []\n", - "for doc in loader.lazy_load():\n", - " pages.append(doc)\n", - " if len(pages) >= 10:\n", - " # do some paged operation, e.g.\n", - " # index.upsert(page)\n", - "\n", - " pages = []" + "loader = FireCrawlLoader(api_key=\"YOUR_API_KEY\", url=\"firecrawl.dev\", mode=\"map\")" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "8" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "len(pages)" + "docs = loader.load()" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)\n", - " Join the waitlist to turn any web\n", - "{'ogUrl': 'https://www.firecrawl.dev/blog/introducing-fire-engine-for-firecrawl', 'title': 'Introducing Fire Engine for Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/images/blog/fire-engine-launch.png', 'ogTitle': 'Introducing Fire Engine for Firecrawl', 'sitemap': {'lastmod': '2024-08-06T00:00:00.000Z', 'changefreq': 'weekly'}, 'keywords': 'firecrawl,fireengine,web crawling,dashboard,web scraping,LLM,data extraction', 'sourceURL': 'https://www.firecrawl.dev/blog/introducing-fire-engine-for-firecrawl', 'ogSiteName': 'Firecrawl', 'description': 'The most scalable, reliable, and fast way to get web data for Firecrawl.', 'ogDescription': 'The most scalable, reliable, and fast way to get web data for Firecrawl.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}\n" - ] - } - ], + "outputs": [], "source": [ - "print(pages[0].page_content[:100])\n", - "print(pages[0].metadata)" + "docs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Crawler Options\n", + "#### Map Options\n", "\n", - "You can also pass `params` to the loader. This is a dictionary of options to pass to the crawler. See the [FireCrawl API documentation](https://github.com/mendableai/firecrawl-py) for more information.\n", - "\n" + "You can also pass `params` to the loader. This is a dictionary of options to pass to the loader. See the [FireCrawl API documentation](https://github.com/mendableai/firecrawl-py) for more information." ] }, { @@ -220,7 +229,7 @@ ], "metadata": { "kernelspec": { - "display_name": "langchain", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/libs/community/langchain_community/document_loaders/firecrawl.py b/libs/community/langchain_community/document_loaders/firecrawl.py index 467813419ae29..4423881dfef4c 100644 --- a/libs/community/langchain_community/document_loaders/firecrawl.py +++ b/libs/community/langchain_community/document_loaders/firecrawl.py @@ -1,3 +1,4 @@ +import warnings from typing import Iterator, Literal, Optional from langchain_core.document_loaders import BaseLoader @@ -48,7 +49,6 @@ class FireCrawlLoader(BaseLoader): Join the waitlist to turn any web {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []} - Async load: .. code-block:: python @@ -64,13 +64,169 @@ class FireCrawlLoader(BaseLoader): """ # noqa: E501 + def legacy_crawler_options_adapter(self, params: dict) -> dict: + use_legacy_options = False + legacy_keys = [ + "includes", + "excludes", + "allowBackwardCrawling", + "allowExternalContentLinks", + "pageOptions", + ] + for key in legacy_keys: + if params.get(key): + use_legacy_options = True + break + + if use_legacy_options: + warnings.warn( + "Deprecated parameters detected. See Firecrawl v1 docs for updates.", + DeprecationWarning, + ) + if "includes" in params: + if params["includes"] is True: + params["includePaths"] = params["includes"] + del params["includes"] + + if "excludes" in params: + if params["excludes"] is True: + params["excludePaths"] = params["excludes"] + del params["excludes"] + + if "allowBackwardCrawling" in params: + if params["allowBackwardCrawling"] is True: + params["allowBackwardLinks"] = params["allowBackwardCrawling"] + del params["allowBackwardCrawling"] + + if "allowExternalContentLinks" in params: + if params["allowExternalContentLinks"] is True: + params["allowExternalLinks"] = params["allowExternalContentLinks"] + del params["allowExternalContentLinks"] + + if "pageOptions" in params: + if isinstance(params["pageOptions"], dict): + params["scrapeOptions"] = self.legacy_scrape_options_adapter( + params["pageOptions"] + ) + del params["pageOptions"] + + return params + + def legacy_scrape_options_adapter(self, params: dict) -> dict: + use_legacy_options = False + formats = ["markdown"] + + if "extractorOptions" in params: + if "mode" in params["extractorOptions"]: + if ( + params["extractorOptions"]["mode"] == "llm-extraction" + or params["extractorOptions"]["mode"] + == "llm-extraction-from-raw-html" + or params["extractorOptions"]["mode"] + == "llm-extraction-from-markdown" + ): + use_legacy_options = True + if "extractionPrompt" in params["extractorOptions"]: + if params["extractorOptions"]["extractionPrompt"]: + params["prompt"] = params["extractorOptions"][ + "extractionPrompt" + ] + else: + params["prompt"] = params["extractorOptions"].get( + "extractionPrompt", + "Extract page information based on the schema.", + ) + + if "extractionSchema" in params["extractorOptions"]: + if params["extractorOptions"]["extractionSchema"]: + params["schema"] = params["extractorOptions"][ + "extractionSchema" + ] + + if "userPrompt" in params["extractorOptions"]: + if params["extractorOptions"]["userPrompt"]: + params["prompt"] = params["extractorOptions"]["userPrompt"] + + del params["extractorOptions"] + + scrape_keys = [ + "includeMarkdown", + "includeHtml", + "includeRawHtml", + "includeExtract", + "includeLinks", + "screenshot", + "fullPageScreenshot", + "onlyIncludeTags", + "removeTags", + ] + for key in scrape_keys: + if params.get(key): + use_legacy_options = True + break + + if use_legacy_options: + warnings.warn( + "Deprecated parameters detected. See Firecrawl v1 docs for updates.", + DeprecationWarning, + ) + if "includeMarkdown" in params: + if params["includeMarkdown"] is False: + formats.remove("markdown") + del params["includeMarkdown"] + + if "includeHtml" in params: + if params["includeHtml"] is True: + formats.append("html") + del params["includeHtml"] + + if "includeRawHtml" in params: + if params["includeRawHtml"] is True: + formats.append("rawHtml") + del params["includeRawHtml"] + + if "includeExtract" in params: + if params["includeExtract"] is True: + formats.append("extract") + del params["includeExtract"] + + if "includeLinks" in params: + if params["includeLinks"] is True: + formats.append("links") + del params["includeLinks"] + + if "screenshot" in params: + if params["screenshot"] is True: + formats.append("screenshot") + del params["screenshot"] + + if "fullPageScreenshot" in params: + if params["fullPageScreenshot"] is True: + formats.append("screenshot@fullPage") + del params["fullPageScreenshot"] + + if "onlyIncludeTags" in params: + if params["onlyIncludeTags"] is True: + params["includeTags"] = params["onlyIncludeTags"] + del params["onlyIncludeTags"] + + if "removeTags" in params: + if params["removeTags"] is True: + params["excludeTags"] = params["removeTags"] + del params["removeTags"] + + if "formats" not in params: + params["formats"] = formats + + return params + def __init__( self, url: str, *, api_key: Optional[str] = None, api_url: Optional[str] = None, - mode: Literal["crawl", "scrape"] = "crawl", + mode: Literal["crawl", "scrape", "map"] = "crawl", params: Optional[dict] = None, ): """Initialize with API key and url. @@ -82,8 +238,9 @@ def __init__( api_url: The Firecrawl API URL. If not specified will be read from env var FIRECRAWL_API_URL or defaults to https://api.firecrawl.dev. mode: The mode to run the loader in. Default is "crawl". - Options include "scrape" (single url) and - "crawl" (all accessible sub pages). + Options include "scrape" (single url), + "crawl" (all accessible sub pages), + "map" (returns list of links that are semantically related). params: The parameters to pass to the Firecrawl API. Examples include crawlerOptions. For more details, visit: https://github.com/mendableai/firecrawl-py @@ -95,30 +252,58 @@ def __init__( raise ImportError( "`firecrawl` package not found, please run `pip install firecrawl-py`" ) - if mode not in ("crawl", "scrape"): + if mode not in ("crawl", "scrape", "search", "map"): raise ValueError( - f"Unrecognized mode '{mode}'. Expected one of 'crawl', 'scrape'." + f"Invalid mode '{mode}'. Allowed: 'crawl', 'scrape', 'search', 'map'." ) + + if not url: + raise ValueError("Url must be provided") + api_key = api_key or get_from_env("api_key", "FIRECRAWL_API_KEY") self.firecrawl = FirecrawlApp(api_key=api_key, api_url=api_url) self.url = url self.mode = mode - self.params = params + self.params = params or {} def lazy_load(self) -> Iterator[Document]: if self.mode == "scrape": - firecrawl_docs = [self.firecrawl.scrape_url(self.url, params=self.params)] + firecrawl_docs = [ + self.firecrawl.scrape_url( + self.url, params=self.legacy_scrape_options_adapter(self.params) + ) + ] elif self.mode == "crawl": - firecrawl_docs = self.firecrawl.crawl_url(self.url, params=self.params) + if not self.url: + raise ValueError("URL is required for crawl mode") + crawl_response = self.firecrawl.crawl_url( + self.url, params=self.legacy_crawler_options_adapter(self.params) + ) + firecrawl_docs = crawl_response.get("data", []) + elif self.mode == "map": + if not self.url: + raise ValueError("URL is required for map mode") + firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params) + elif self.mode == "search": + raise ValueError( + "Search mode is not supported in this version, please downgrade." + ) else: raise ValueError( - f"Unrecognized mode '{self.mode}'. Expected one of 'crawl', 'scrape'." + f"Invalid mode '{self.mode}'. Allowed: 'crawl', 'scrape', 'map'." ) for doc in firecrawl_docs: - metadata = doc.get("metadata", {}) - if (self.params is not None) and self.params.get( - "extractorOptions", {} - ).get("mode") == "llm-extraction": - metadata["llm_extraction"] = doc.get("llm_extraction") - - yield Document(page_content=doc.get("markdown", ""), metadata=metadata) + if self.mode == "map": + page_content = doc + metadata = {} + else: + page_content = ( + doc.get("markdown") or doc.get("html") or doc.get("rawHtml", "") + ) + metadata = doc.get("metadata", {}) + if not page_content: + continue + yield Document( + page_content=page_content, + metadata=metadata, + )