From 6ccecf23639ef5cbebcbc4eaeda99eb1f7b84deb Mon Sep 17 00:00:00 2001 From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com> Date: Thu, 25 Apr 2024 03:11:03 +0200 Subject: [PATCH] community[minor]: added Browserbase loader (#20478) --- .../document_loaders/browserbase.ipynb | 122 ++++++++++++++++++ .../integrations/providers/browserbase.mdx | 28 ++++ .../document_loaders/__init__.py | 5 + .../document_loaders/browserbase.py | 47 +++++++ .../document_loaders/test_imports.py | 1 + 5 files changed, 203 insertions(+) create mode 100644 docs/docs/integrations/document_loaders/browserbase.ipynb create mode 100644 docs/docs/integrations/providers/browserbase.mdx create mode 100644 libs/community/langchain_community/document_loaders/browserbase.py diff --git a/docs/docs/integrations/document_loaders/browserbase.ipynb b/docs/docs/integrations/document_loaders/browserbase.ipynb new file mode 100644 index 0000000000000..8ed52fd8dedd3 --- /dev/null +++ b/docs/docs/integrations/document_loaders/browserbase.ipynb @@ -0,0 +1,122 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Browserbase\n", + "\n", + "[Browserbase](https://browserbase.com) is a serverless platform for running headless browsers. It offers advanced debugging, session recordings, stealth mode, integrated proxies and captcha solving.\n", + "\n", + "## Installation\n", + "\n", + "- Get an API key from [browserbase.com](https://browserbase.com) and set it as an environment variable (`BROWSERBASE_API_KEY`).\n", + "- Install the [Browserbase SDK](https://github.com/browserbase/python-sdk):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install browserbase" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading documents" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can load webpages into LangChain using `BrowserbaseLoader`. 
Optionally, you can set the `text_content` parameter to convert the pages to a text-only representation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.document_loaders import BrowserbaseLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loader = BrowserbaseLoader(\n", + " urls=[\n", + " \"https://example.com\",\n", + " ],\n", + " # Text mode\n", + " text_content=False,\n", + ")\n", + "\n", + "docs = loader.load()\n", + "print(docs[0].page_content[:61])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading images\n", + "\n", + "You can also load screenshots of webpages (as bytes) for multi-modal models.\n", + "\n", + "Full example using GPT-4V:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from browserbase import Browserbase\n", + "from browserbase.helpers.gpt4 import GPT4VImage, GPT4VImageDetail\n", + "from langchain_core.messages import HumanMessage\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "chat = ChatOpenAI(model=\"gpt-4-vision-preview\", max_tokens=256)\n", + "browser = Browserbase()\n", + "\n", + "screenshot = browser.screenshot(\"https://browserbase.com\")\n", + "\n", + "result = chat.invoke(\n", + " [\n", + " HumanMessage(\n", + " content=[\n", + " {\"type\": \"text\", \"text\": \"What color is the logo?\"},\n", + " GPT4VImage(screenshot, GPT4VImageDetail.auto),\n", + " ]\n", + " )\n", + " ]\n", + ")\n", + "\n", + "print(result.content)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/docs/integrations/providers/browserbase.mdx 
b/docs/docs/integrations/providers/browserbase.mdx new file mode 100644 index 0000000000000..d5ec545a3f162 --- /dev/null +++ b/docs/docs/integrations/providers/browserbase.mdx @@ -0,0 +1,28 @@ +# Browserbase + +>[Browserbase](https://browserbase.com) is a serverless platform for running headless browsers. It offers advanced debugging, session recordings, stealth mode, integrated proxies and captcha solving. + +## Installation and Setup + +- Get an API key from [browserbase.com](https://browserbase.com) and set it as an environment variable (`BROWSERBASE_API_KEY`). +- Install the [Browserbase SDK](https://github.com/browserbase/python-sdk): + +```bash +pip install browserbase +``` + +## Document loader + +See a [usage example](/docs/integrations/document_loaders/browserbase). + +```python +from langchain_community.document_loaders import BrowserbaseLoader +``` + +## Multi-Modal + +See a [usage example](/docs/integrations/document_loaders/browserbase). + +```python +from browserbase.helpers.gpt4 import GPT4VImage, GPT4VImageDetail +``` diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py index 07a83d1168649..fe52b4ff3bf5c 100644 --- a/libs/community/langchain_community/document_loaders/__init__.py +++ b/libs/community/langchain_community/document_loaders/__init__.py @@ -95,6 +95,9 @@ from langchain_community.document_loaders.brave_search import ( BraveSearchLoader, # noqa: F401 ) + from langchain_community.document_loaders.browserbase import ( + BrowserbaseLoader, # noqa: F401 + ) from langchain_community.document_loaders.browserless import ( BrowserlessLoader, # noqa: F401 ) @@ -541,6 +544,7 @@ "BlobLoader", "BlockchainDocumentLoader", "BraveSearchLoader", + "BrowserbaseLoader", "BrowserlessLoader", "CSVLoader", "CassandraLoader", @@ -727,6 +731,7 @@ "BlobLoader": "langchain_community.document_loaders.blob_loaders", "BlockchainDocumentLoader": 
"langchain_community.document_loaders.blockchain", "BraveSearchLoader": "langchain_community.document_loaders.brave_search", + "BrowserbaseLoader": "langchain_community.document_loaders.browserbase", "BrowserlessLoader": "langchain_community.document_loaders.browserless", "CSVLoader": "langchain_community.document_loaders.csv_loader", "CassandraLoader": "langchain_community.document_loaders.cassandra", diff --git a/libs/community/langchain_community/document_loaders/browserbase.py b/libs/community/langchain_community/document_loaders/browserbase.py new file mode 100644 index 0000000000000..888a89107abb8 --- /dev/null +++ b/libs/community/langchain_community/document_loaders/browserbase.py @@ -0,0 +1,47 @@ +from typing import Iterator, List, Optional, Tuple, Union + +from langchain_core.documents import Document + +from langchain_community.document_loaders.base import BaseLoader + + +class BrowserbaseLoader(BaseLoader): + """Load pre-rendered web pages using a headless browser hosted on Browserbase. + + Depends on `browserbase` package. + Get your API key from https://browserbase.com + """ + + def __init__( + self, + urls: Union[List[str], Tuple[str, ...]], + *, + api_key: Optional[str] = None, + text_content: bool = False, + ): + self.urls = urls + self.text_content = text_content + + try: + from browserbase import Browserbase + except ImportError: + raise ImportError( + "You must run " + "`pip install --upgrade " + "browserbase` " + "to use the Browserbase loader." 
+ ) + + self.browserbase = Browserbase(api_key=api_key) + + def lazy_load(self) -> Iterator[Document]: + """Load pages from URLs""" + pages = self.browserbase.load_urls(self.urls, self.text_content) + + for i, page in enumerate(pages): + yield Document( + page_content=page, + metadata={ + "url": self.urls[i], + }, + ) diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py index 7274d432e5c1d..28c68459e76db 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_imports.py +++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py @@ -38,6 +38,7 @@ "BlobLoader", "BlockchainDocumentLoader", "BraveSearchLoader", + "BrowserbaseLoader", "BrowserlessLoader", "CassandraLoader", "CSVLoader",