From af74301ab981a0421f84bbff6921b7feca79dbfe Mon Sep 17 00:00:00 2001
From: Bagatur <22008038+baskaryan@users.noreply.github.com>
Date: Wed, 7 Feb 2024 14:15:30 -0800
Subject: [PATCH] core[patch], community[patch]: link extraction continue on failure (#17200)

---
 .../document_loaders/recursive_url_loader.py  | 32 +++++++++++------
 .../langchain_community/vectorstores/kdbai.py |  4 +--
 libs/core/langchain_core/utils/html.py        | 35 +++++++++++++------
 3 files changed, 48 insertions(+), 23 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/recursive_url_loader.py b/libs/community/langchain_community/document_loaders/recursive_url_loader.py
index 6fca4edf86a80..c24ab1730fd35 100644
--- a/libs/community/langchain_community/document_loaders/recursive_url_loader.py
+++ b/libs/community/langchain_community/document_loaders/recursive_url_loader.py
@@ -93,6 +93,7 @@ def __init__(
         link_regex: Union[str, re.Pattern, None] = None,
         headers: Optional[dict] = None,
         check_response_status: bool = False,
+        continue_on_failure: bool = True,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.
 
@@ -117,6 +118,8 @@ def __init__(
             link_regex: Regex for extracting sub-links from the raw html of a web page.
             check_response_status: If True, check HTTP response status and skip
                 URLs with error responses (400-599).
+            continue_on_failure: If True, continue if getting or parsing a link raises
+                an exception. Otherwise, raise the exception.
         """
 
         self.url = url
@@ -142,6 +145,7 @@ def __init__(
         self._lock = asyncio.Lock() if self.use_async else None
         self.headers = headers
         self.check_response_status = check_response_status
+        self.continue_on_failure = continue_on_failure
 
     def _get_child_links_recursive(
         self, url: str, visited: Set[str], *, depth: int = 0
@@ -164,11 +168,14 @@ def _get_child_links_recursive(
             if self.check_response_status and 400 <= response.status_code <= 599:
                 raise ValueError(f"Received HTTP status {response.status_code}")
         except Exception as e:
-            logger.warning(
-                f"Unable to load from {url}. Received error {e} of type "
-                f"{e.__class__.__name__}"
-            )
-            return
+            if self.continue_on_failure:
+                logger.warning(
+                    f"Unable to load from {url}. Received error {e} of type "
+                    f"{e.__class__.__name__}"
+                )
+                return
+            else:
+                raise e
         content = self.extractor(response.text)
         if content:
             yield Document(
@@ -184,6 +191,7 @@ def _get_child_links_recursive(
             pattern=self.link_regex,
             prevent_outside=self.prevent_outside,
             exclude_prefixes=self.exclude_dirs,
+            continue_on_failure=self.continue_on_failure,
         )
         for link in sub_links:
             # Check all unvisited links
@@ -237,13 +245,16 @@ async def _async_get_child_links_recursive(
             if self.check_response_status and 400 <= response.status <= 599:
                 raise ValueError(f"Received HTTP status {response.status}")
         except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
-            logger.warning(
-                f"Unable to load {url}. Received error {e} of type "
-                f"{e.__class__.__name__}"
-            )
             if close_session:
                 await session.close()
-            return []
+            if self.continue_on_failure:
+                logger.warning(
+                    f"Unable to load {url}. Received error {e} of type "
+                    f"{e.__class__.__name__}"
+                )
+                return []
+            else:
+                raise e
         results = []
         content = self.extractor(text)
         if content:
@@ -261,6 +272,7 @@ async def _async_get_child_links_recursive(
             pattern=self.link_regex,
             prevent_outside=self.prevent_outside,
             exclude_prefixes=self.exclude_dirs,
+            continue_on_failure=self.continue_on_failure,
        )
 
         # Recursively call the function to get the children of the children
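A minimal usage sketch of the new loader flag (the crawl target and depth here are illustrative, not part of this patch):

    from langchain_community.document_loaders import RecursiveUrlLoader

    # Default after this patch (continue_on_failure=True): pages that fail to
    # fetch or parse are logged as warnings and skipped; the crawl continues.
    loader = RecursiveUrlLoader("https://docs.python.org/3.9/", max_depth=2)
    docs = loader.load()

    # Strict mode: the first fetch/parse error is re-raised and aborts the crawl.
    strict_loader = RecursiveUrlLoader(
        "https://docs.python.org/3.9/",
        max_depth=2,
        continue_on_failure=False,
    )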
diff --git a/libs/community/langchain_community/vectorstores/kdbai.py b/libs/community/langchain_community/vectorstores/kdbai.py
index 1122b691d439b..9ac1a7d580a82 100644
--- a/libs/community/langchain_community/vectorstores/kdbai.py
+++ b/libs/community/langchain_community/vectorstores/kdbai.py
@@ -14,7 +14,7 @@
 
 
 class KDBAI(VectorStore):
-    """`KDB.AI` vector store [https://kdb.ai](https://kdb.ai)
+    """`KDB.AI` vector store.
 
     To use, you should have the `kdbai_client` python package installed.
 
@@ -25,7 +25,7 @@ class KDBAI(VectorStore):
         distance_strategy: One option from DistanceStrategy.EUCLIDEAN_DISTANCE,
             DistanceStrategy.DOT_PRODUCT or DistanceStrategy.COSINE.
 
-    See the example [notebook](https://github.com/KxSystems/langchain/blob/KDB.AI/docs/docs/integrations/vectorstores/kdbai.ipynb).
+    See the example https://github.com/KxSystems/langchain/blob/KDB.AI/docs/docs/integrations/vectorstores/kdbai.ipynb.
     """
 
     def __init__(
diff --git a/libs/core/langchain_core/utils/html.py b/libs/core/langchain_core/utils/html.py
index bbea15f0fa435..837b19ed101da 100644
--- a/libs/core/langchain_core/utils/html.py
+++ b/libs/core/langchain_core/utils/html.py
@@ -1,7 +1,10 @@
+import logging
 import re
 from typing import List, Optional, Sequence, Union
 from urllib.parse import urljoin, urlparse
 
+logger = logging.getLogger(__name__)
+
 PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
 SUFFIXES_TO_IGNORE = (
     ".css",
@@ -52,6 +55,7 @@ def extract_sub_links(
     pattern: Union[str, re.Pattern, None] = None,
     prevent_outside: bool = True,
     exclude_prefixes: Sequence[str] = (),
+    continue_on_failure: bool = False,
 ) -> List[str]:
     """Extract all links from a raw html string and convert into absolute paths.
 
@@ -63,25 +67,34 @@ def extract_sub_links(
         prevent_outside: If True, ignore external links which are not children
             of the base url.
         exclude_prefixes: Exclude any URLs that start with one of these prefixes.
-
+        continue_on_failure: If True, continue if parsing a specific link raises an
+            exception. Otherwise, raise the exception.
     Returns:
         List[str]: sub links
     """
     base_url_to_use = base_url if base_url is not None else url
     parsed_base_url = urlparse(base_url_to_use)
+    parsed_url = urlparse(url)
     all_links = find_all_links(raw_html, pattern=pattern)
     absolute_paths = set()
     for link in all_links:
-        parsed_link = urlparse(link)
-        # Some may be absolute links like https://to/path
-        if parsed_link.scheme == "http" or parsed_link.scheme == "https":
-            absolute_path = link
-        # Some may have omitted the protocol like //to/path
-        elif link.startswith("//"):
-            absolute_path = f"{urlparse(url).scheme}:{link}"
-        else:
-            absolute_path = urljoin(url, parsed_link.path)
-        absolute_paths.add(absolute_path)
+        try:
+            parsed_link = urlparse(link)
+            # Some may be absolute links like https://to/path
+            if parsed_link.scheme == "http" or parsed_link.scheme == "https":
+                absolute_path = link
+            # Some may have omitted the protocol like //to/path
+            elif link.startswith("//"):
+                absolute_path = f"{parsed_url.scheme}:{link}"
+            else:
+                absolute_path = urljoin(url, parsed_link.path)
+            absolute_paths.add(absolute_path)
+        except Exception as e:
+            if continue_on_failure:
+                logger.warning(f"Unable to load link {link}. Raised exception:\n\n{e}")
+                continue
+            else:
+                raise e
     results = []
     for path in absolute_paths:
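A corresponding sketch for the extended core helper (the sample HTML and URLs are illustrative, not from the patch):

    from langchain_core.utils.html import extract_sub_links

    html = (
        '<a href="/docs/intro">Intro</a>'
        '<a href="https://example.com/docs/api">API</a>'
    )
    # With continue_on_failure=True, a link that raises during parsing is
    # logged and skipped instead of aborting the whole extraction.
    links = extract_sub_links(
        html,
        "https://example.com/docs/",
        continue_on_failure=True,
    )
    # links now holds the absolute child URLs, e.g.
    # https://example.com/docs/intro and https://example.com/docs/api
    # (built from a set, so order may vary).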