Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: community: add pull_md tool for converting URLs to Markdown #29006

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
408 changes: 408 additions & 0 deletions docs/docs/integrations/tools/pull_md.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
from langchain_community.tools.metaphor_search.tool import MetaphorSearchResults
from langchain_community.tools.openweathermap.tool import OpenWeatherMapQueryRun
from langchain_community.tools.pubmed.tool import PubmedQueryRun
from langchain_community.tools.pull_md.tool import PullMdQueryRun
from langchain_community.tools.reddit_search.tool import RedditSearchRun
from langchain_community.tools.requests.tool import (
RequestsDeleteTool,
Expand Down Expand Up @@ -90,6 +91,7 @@
from langchain_community.utilities.metaphor_search import MetaphorSearchAPIWrapper
from langchain_community.utilities.openweathermap import OpenWeatherMapAPIWrapper
from langchain_community.utilities.pubmed import PubMedAPIWrapper
from langchain_community.utilities.pull_md import PullMdAPIWrapper
from langchain_community.utilities.reddit_search import RedditSearchAPIWrapper
from langchain_community.utilities.requests import TextRequestsWrapper
from langchain_community.utilities.searchapi import SearchApiAPIWrapper
Expand Down Expand Up @@ -333,6 +335,8 @@ def _get_golden_query(**kwargs: Any) -> BaseTool:
def _get_pubmed(**kwargs: Any) -> BaseTool:
    """Build a ``PubmedQueryRun`` tool whose API wrapper is created from ``kwargs``."""
    wrapper = PubMedAPIWrapper(**kwargs)
    return PubmedQueryRun(api_wrapper=wrapper)

def _get_pull_md(**kwargs: Any) -> BaseTool:
    """Build a ``PullMdQueryRun`` tool whose API wrapper is created from ``kwargs``."""
    wrapper = PullMdAPIWrapper(**kwargs)
    return PullMdQueryRun(api_wrapper=wrapper)

def _get_google_books(**kwargs: Any) -> BaseTool:
from langchain_community.tools.google_books import GoogleBooksQueryRun
Expand Down Expand Up @@ -537,6 +541,7 @@ def _get_reddit_search(**kwargs: Any) -> BaseTool:
),
"golden-query": (_get_golden_query, ["golden_api_key"]),
"pubmed": (_get_pubmed, ["top_k_results"]),
"pull-md": (_get_pull_md, []),
"human": (_get_human_tool, ["prompt_func", "input_func"]),
"awslambda": (
_get_lambda_api,
Expand Down
5 changes: 5 additions & 0 deletions libs/community/langchain_community/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,9 @@
from langchain_community.tools.pubmed.tool import (
PubmedQueryRun,
)
from langchain_community.tools.pull_md.tool import (
PullMdQueryRun,
)
from langchain_community.tools.reddit_search.tool import (
RedditSearchRun,
RedditSearchSchema,
Expand Down Expand Up @@ -451,6 +454,7 @@
"PolygonLastQuote",
"PolygonTickerNews",
"PubmedQueryRun",
"PullMdQueryRun",
"QueryCheckerTool",
"QueryPowerBITool",
"QuerySQLCheckerTool",
Expand Down Expand Up @@ -605,6 +609,7 @@
"PolygonLastQuote": "langchain_community.tools.polygon.last_quote",
"PolygonTickerNews": "langchain_community.tools.polygon.ticker_news",
"PubmedQueryRun": "langchain_community.tools.pubmed.tool",
"PullMdQueryRun": "langchain_community.tools.pull_md.tool",
"QueryCheckerTool": "langchain_community.tools.spark_sql.tool",
"QueryPowerBITool": "langchain_community.tools.powerbi.tool",
"QuerySQLCheckerTool": "langchain_community.tools.sql_database.tool",
Expand Down
3 changes: 3 additions & 0 deletions libs/community/langchain_community/tools/pull_md/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from langchain_community.tools.pull_md.tool import PullMdQueryRun

__all__ = ["PullMdQueryRun"]
33 changes: 33 additions & 0 deletions libs/community/langchain_community/tools/pull_md/tool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""Tool for the Pull.md API."""

from typing import Optional, Type

from langchain_core.callbacks import CallbackManagerForToolRun
from langchain_core.tools import BaseTool
from pydantic import BaseModel, Field

from langchain_community.utilities.pull_md import PullMdAPIWrapper

class PullMdInput(BaseModel):
    """Argument schema for the Pull.md tool: a single required ``url`` string."""

    # ``...`` marks the field as required, same as declaring it with no default.
    url: str = Field(..., description="URL to convert to markdown")

class PullMdQueryRun(BaseTool):
    """Tool that hands a URL to ``PullMdAPIWrapper`` and returns its Markdown."""

    name: str = "pull_md"
    description: str = (
        "A wrapper around Pull.md service. Useful for converting web pages to "
        "Markdown format. Input should be a valid URL."
    )
    # A fresh wrapper is created per tool instance unless one is supplied.
    api_wrapper: PullMdAPIWrapper = Field(default_factory=PullMdAPIWrapper)
    args_schema: Type[BaseModel] = PullMdInput

    def _run(
        self,
        url: str,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Convert ``url`` to Markdown through the configured API wrapper.

        Args:
            url: The URL to convert.
            run_manager: Optional callback manager; not used by this method.

        Returns:
            The string produced by ``api_wrapper.convert_url_to_markdown``.
        """
        markdown = self.api_wrapper.convert_url_to_markdown(url)
        return markdown
5 changes: 5 additions & 0 deletions libs/community/langchain_community/utilities/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@
from langchain_community.utilities.pubmed import (
PubMedAPIWrapper,
)
from langchain_community.utilities.pull_md import (
PullMdAPIWrapper,
)
from langchain_community.utilities.rememberizer import RememberizerAPIWrapper
from langchain_community.utilities.requests import (
Requests,
Expand Down Expand Up @@ -216,6 +219,7 @@
"Portkey",
"PowerBIDataset",
"PubMedAPIWrapper",
"PullMdAPIWrapper",
"RememberizerAPIWrapper",
"Requests",
"RequestsWrapper",
Expand Down Expand Up @@ -280,6 +284,7 @@
"Portkey": "langchain_community.utilities.portkey",
"PowerBIDataset": "langchain_community.utilities.powerbi",
"PubMedAPIWrapper": "langchain_community.utilities.pubmed",
"PullMdAPIWrapper": "langchain_community.utilities.pull_md",
"RememberizerAPIWrapper": "langchain_community.utilities.rememberizer",
"Requests": "langchain_community.utilities.requests",
"RequestsWrapper": "langchain_community.utilities.requests",
Expand Down
40 changes: 40 additions & 0 deletions libs/community/langchain_community/utilities/pull_md.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Utility that calls the Pull.md API for markdown conversion."""

import logging

from pydantic import BaseModel
import requests

logger = logging.getLogger(__name__)

Check failure on line 8 in libs/community/langchain_community/utilities/pull_md.py

View workflow job for this annotation

GitHub Actions / cd libs/community / make lint #3.13

Ruff (I001)

langchain_community/utilities/pull_md.py:3:1: I001 Import block is un-sorted or un-formatted

Check failure on line 8 in libs/community/langchain_community/utilities/pull_md.py

View workflow job for this annotation

GitHub Actions / cd libs/community / make lint #3.9

Ruff (I001)

langchain_community/utilities/pull_md.py:3:1: I001 Import block is un-sorted or un-formatted


class PullMdAPIWrapper(BaseModel):
    """Wrapper around the ``pull_md`` package to convert URLs to Markdown.

    This utility provides a method to convert a given URL to Markdown format using
    the Pull.md service. The ``pull_md`` package is imported lazily, so it is only
    required when a conversion is actually requested.
    """

    def convert_url_to_markdown(self, url: str) -> str:
        """Convert a URL to Markdown using the Pull.md service.

        Args:
            url: A string representing the URL to be converted.

        Returns:
            The value returned by ``pull_md.pull_markdown`` for ``url``.

        Raises:
            ImportError: If the ``pull_md`` package cannot be imported.
            requests.exceptions.RequestException: Logged and re-raised unchanged
                when the conversion call raises it (for example an ``HTTPError``
                for a bad HTTP response).
        """
        # Guard only the import: an ImportError raised while converting must not
        # be misreported as "package not installed".
        try:
            from pull_md import pull_markdown
        except ImportError as e:
            raise ImportError(
                "pull_md package is not installed. "
                "Install it with `pip install pull-md`"
            ) from e

        try:
            return pull_markdown(url)
        except requests.exceptions.RequestException as e:
            # Lazy %-formatting: the message is only built if the record is emitted.
            logger.error("Request to Pull.md API failed: %s", e)
            raise

    class Config:
        # Pydantic setting that permits field types pydantic cannot validate
        # natively; no such fields are declared in this class as shown.
        arbitrary_types_allowed = True
23 changes: 23 additions & 0 deletions libs/community/tests/unit_tests/utilities/test_pull_md.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pytest

from utilities.pull_md import PullMdAPIWrapper


def test_convert_url_to_markdown_success(mocker):
    """The wrapper returns whatever ``pull_md.pull_markdown`` produces."""
    markdown = "# Example Domain"
    # Replace the third-party converter with a stub returning the canned Markdown.
    mocker.patch("pull_md.pull_markdown", return_value=markdown)

    wrapper = PullMdAPIWrapper()

    assert wrapper.convert_url_to_markdown("http://example.com") == markdown


def test_convert_url_to_markdown_failure(mocker):
    """An exception raised by ``pull_md.pull_markdown`` reaches the caller."""
    mocker.patch(
        "pull_md.pull_markdown", side_effect=Exception("Failed to convert")
    )
    wrapper = PullMdAPIWrapper()

    with pytest.raises(Exception) as raised:
        wrapper.convert_url_to_markdown("http://example.com")

    # The original message must survive propagation through the wrapper.
    assert "Failed to convert" in str(raised.value)
Loading