From af74301ab981a0421f84bbff6921b7feca79dbfe Mon Sep 17 00:00:00 2001
From: Bagatur <22008038+baskaryan@users.noreply.github.com>
Date: Wed, 7 Feb 2024 14:15:30 -0800
Subject: [PATCH] core[patch], community[patch]: link extraction continue on failure (#17200)

---
 .../document_loaders/recursive_url_loader.py  | 32 +++++++++++------
 .../langchain_community/vectorstores/kdbai.py |  4 +--
 libs/core/langchain_core/utils/html.py        | 35 +++++++++++++------
 3 files changed, 48 insertions(+), 23 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/recursive_url_loader.py b/libs/community/langchain_community/document_loaders/recursive_url_loader.py
index 6fca4edf86a80..c24ab1730fd35 100644
--- a/libs/community/langchain_community/document_loaders/recursive_url_loader.py
+++ b/libs/community/langchain_community/document_loaders/recursive_url_loader.py
@@ -93,6 +93,7 @@ def __init__(
         link_regex: Union[str, re.Pattern, None] = None,
         headers: Optional[dict] = None,
         check_response_status: bool = False,
+        continue_on_failure: bool = True,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.
 
@@ -117,6 +118,8 @@ def __init__(
             link_regex: Regex for extracting sub-links from the raw html of a web page.
             check_response_status: If True, check HTTP response status and skip
                 URLs with error responses (400-599).
+            continue_on_failure: If True, continue if getting or parsing a link raises
+                an exception. Otherwise, raise the exception.
         """
 
         self.url = url
@@ -142,6 +145,7 @@ def __init__(
         self._lock = asyncio.Lock() if self.use_async else None
         self.headers = headers
         self.check_response_status = check_response_status
+        self.continue_on_failure = continue_on_failure
 
     def _get_child_links_recursive(
         self, url: str, visited: Set[str], *, depth: int = 0
@@ -164,11 +168,14 @@ def _get_child_links_recursive(
             if self.check_response_status and 400 <= response.status_code <= 599:
                 raise ValueError(f"Received HTTP status {response.status_code}")
         except Exception as e:
-            logger.warning(
-                f"Unable to load from {url}. Received error {e} of type "
-                f"{e.__class__.__name__}"
-            )
-            return
+            if self.continue_on_failure:
+                logger.warning(
+                    f"Unable to load from {url}. Received error {e} of type "
+                    f"{e.__class__.__name__}"
+                )
+                return
+            else:
+                raise e
         content = self.extractor(response.text)
         if content:
             yield Document(
@@ -184,6 +191,7 @@ def _get_child_links_recursive(
             pattern=self.link_regex,
             prevent_outside=self.prevent_outside,
             exclude_prefixes=self.exclude_dirs,
+            continue_on_failure=self.continue_on_failure,
         )
         for link in sub_links:
             # Check all unvisited links
@@ -237,13 +245,16 @@ async def _async_get_child_links_recursive(
             if self.check_response_status and 400 <= response.status <= 599:
                 raise ValueError(f"Received HTTP status {response.status}")
         except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
-            logger.warning(
-                f"Unable to load {url}. Received error {e} of type "
-                f"{e.__class__.__name__}"
-            )
             if close_session:
                 await session.close()
-            return []
+            if self.continue_on_failure:
+                logger.warning(
+                    f"Unable to load {url}. Received error {e} of type "
+                    f"{e.__class__.__name__}"
+                )
+                return []
+            else:
+                raise e
         results = []
         content = self.extractor(text)
         if content:
@@ -261,6 +272,7 @@ async def _async_get_child_links_recursive(
             pattern=self.link_regex,
             prevent_outside=self.prevent_outside,
             exclude_prefixes=self.exclude_dirs,
+            continue_on_failure=self.continue_on_failure,
        )
 
         # Recursively call the function to get the children of the children
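A minimal usage sketch of the new loader flag (the crawl target and depth here are illustrative, not part of this patch):

    from langchain_community.document_loaders import RecursiveUrlLoader

    # Default after this patch (continue_on_failure=True): pages that fail to
    # fetch or parse are logged as warnings and skipped; the crawl continues.
    loader = RecursiveUrlLoader("https://docs.python.org/3.9/", max_depth=2)
    docs = loader.load()

    # Strict mode: the first fetch/parse error is re-raised and aborts the crawl.
    strict_loader = RecursiveUrlLoader(
        "https://docs.python.org/3.9/",
        max_depth=2,
        continue_on_failure=False,
    )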
diff --git a/libs/community/langchain_community/vectorstores/kdbai.py b/libs/community/langchain_community/vectorstores/kdbai.py
index 1122b691d439b..9ac1a7d580a82 100644
--- a/libs/community/langchain_community/vectorstores/kdbai.py
+++ b/libs/community/langchain_community/vectorstores/kdbai.py
@@ -14,7 +14,7 @@
 
 
 class KDBAI(VectorStore):
-    """`KDB.AI` vector store [https://kdb.ai](https://kdb.ai)
+    """`KDB.AI` vector store.
 
     To use, you should have the `kdbai_client` python package installed.
 
@@ -25,7 +25,7 @@ class KDBAI(VectorStore):
         distance_strategy: One option from DistanceStrategy.EUCLIDEAN_DISTANCE,
             DistanceStrategy.DOT_PRODUCT or DistanceStrategy.COSINE.
 
-    See the example [notebook](https://github.com/KxSystems/langchain/blob/KDB.AI/docs/docs/integrations/vectorstores/kdbai.ipynb).
+    See the example https://github.com/KxSystems/langchain/blob/KDB.AI/docs/docs/integrations/vectorstores/kdbai.ipynb.
     """
 
     def __init__(
diff --git a/libs/core/langchain_core/utils/html.py b/libs/core/langchain_core/utils/html.py
index bbea15f0fa435..837b19ed101da 100644
--- a/libs/core/langchain_core/utils/html.py
+++ b/libs/core/langchain_core/utils/html.py
@@ -1,7 +1,10 @@
+import logging
 import re
 from typing import List, Optional, Sequence, Union
 from urllib.parse import urljoin, urlparse
 
+logger = logging.getLogger(__name__)
+
 PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
 SUFFIXES_TO_IGNORE = (
     ".css",
@@ -52,6 +55,7 @@ def extract_sub_links(
     pattern: Union[str, re.Pattern, None] = None,
     prevent_outside: bool = True,
     exclude_prefixes: Sequence[str] = (),
+    continue_on_failure: bool = False,
 ) -> List[str]:
     """Extract all links from a raw html string and convert into absolute paths.
 
@@ -63,25 +67,34 @@ def extract_sub_links(
         prevent_outside: If True, ignore external links which are not children
             of the base url.
         exclude_prefixes: Exclude any URLs that start with one of these prefixes.
-
+        continue_on_failure: If True, continue if parsing a specific link raises an
+            exception. Otherwise, raise the exception.
     Returns:
         List[str]: sub links
     """
     base_url_to_use = base_url if base_url is not None else url
     parsed_base_url = urlparse(base_url_to_use)
+    parsed_url = urlparse(url)
     all_links = find_all_links(raw_html, pattern=pattern)
     absolute_paths = set()
     for link in all_links:
-        parsed_link = urlparse(link)
-        # Some may be absolute links like https://to/path
-        if parsed_link.scheme == "http" or parsed_link.scheme == "https":
-            absolute_path = link
-        # Some may have omitted the protocol like //to/path
-        elif link.startswith("//"):
-            absolute_path = f"{urlparse(url).scheme}:{link}"
-        else:
-            absolute_path = urljoin(url, parsed_link.path)
-        absolute_paths.add(absolute_path)
+        try:
+            parsed_link = urlparse(link)
+            # Some may be absolute links like https://to/path
+            if parsed_link.scheme == "http" or parsed_link.scheme == "https":
+                absolute_path = link
+            # Some may have omitted the protocol like //to/path
+            elif link.startswith("//"):
+                absolute_path = f"{parsed_url.scheme}:{link}"
+            else:
+                absolute_path = urljoin(url, parsed_link.path)
+            absolute_paths.add(absolute_path)
+        except Exception as e:
+            if continue_on_failure:
+                logger.warning(f"Unable to load link {link}. Raised exception:\n\n{e}")
+                continue
+            else:
+                raise e
     results = []
     for path in absolute_paths:
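A corresponding sketch for the extended core helper (the sample HTML and URLs are illustrative, not from the patch):

    from langchain_core.utils.html import extract_sub_links

    html = (
        '<a href="/docs/intro">Intro</a>'
        '<a href="https://example.com/docs/api">API</a>'
    )
    # With continue_on_failure=True, a link that raises during parsing is
    # logged and skipped instead of aborting the whole extraction.
    links = extract_sub_links(
        html,
        "https://example.com/docs/",
        continue_on_failure=True,
    )
    # links now holds the absolute child URLs, e.g.
    # https://example.com/docs/intro and https://example.com/docs/api
    # (built from a set, so order may vary).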