core[patch], community[patch]: link extraction continue on failure (#…
baskaryan authored Feb 7, 2024
1 parent 2281f00 commit af74301
Showing 3 changed files with 48 additions and 23 deletions.
32 changes: 22 additions & 10 deletions libs/community/langchain_community/document_loaders/recursive_url_loader.py
@@ -93,6 +93,7 @@ def __init__(
         link_regex: Union[str, re.Pattern, None] = None,
         headers: Optional[dict] = None,
         check_response_status: bool = False,
+        continue_on_failure: bool = True,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.
@@ -117,6 +118,8 @@ def __init__(
             link_regex: Regex for extracting sub-links from the raw html of a web page.
             check_response_status: If True, check HTTP response status and skip
                 URLs with error responses (400-599).
+            continue_on_failure: If True, continue if getting or parsing a link raises
+                an exception. Otherwise, raise the exception.
         """

         self.url = url
@@ -142,6 +145,7 @@ def __init__(
         self._lock = asyncio.Lock() if self.use_async else None
         self.headers = headers
         self.check_response_status = check_response_status
+        self.continue_on_failure = continue_on_failure

     def _get_child_links_recursive(
         self, url: str, visited: Set[str], *, depth: int = 0
@@ -164,11 +168,14 @@ def _get_child_links_recursive(
             if self.check_response_status and 400 <= response.status_code <= 599:
                 raise ValueError(f"Received HTTP status {response.status_code}")
         except Exception as e:
-            logger.warning(
-                f"Unable to load from {url}. Received error {e} of type "
-                f"{e.__class__.__name__}"
-            )
-            return
+            if self.continue_on_failure:
+                logger.warning(
+                    f"Unable to load from {url}. Received error {e} of type "
+                    f"{e.__class__.__name__}"
+                )
+                return
+            else:
+                raise e
         content = self.extractor(response.text)
         if content:
             yield Document(
@@ -184,6 +191,7 @@ def _get_child_links_recursive(
                 pattern=self.link_regex,
                 prevent_outside=self.prevent_outside,
                 exclude_prefixes=self.exclude_dirs,
+                continue_on_failure=self.continue_on_failure,
             )
             for link in sub_links:
                 # Check all unvisited links
@@ -237,13 +245,16 @@ async def _async_get_child_links_recursive(
                 if self.check_response_status and 400 <= response.status <= 599:
                     raise ValueError(f"Received HTTP status {response.status}")
         except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
-            logger.warning(
-                f"Unable to load {url}. Received error {e} of type "
-                f"{e.__class__.__name__}"
-            )
             if close_session:
                 await session.close()
-            return []
+            if self.continue_on_failure:
+                logger.warning(
+                    f"Unable to load {url}. Received error {e} of type "
+                    f"{e.__class__.__name__}"
+                )
+                return []
+            else:
+                raise e
         results = []
         content = self.extractor(text)
         if content:
@@ -261,6 +272,7 @@ async def _async_get_child_links_recursive(
                 pattern=self.link_regex,
                 prevent_outside=self.prevent_outside,
                 exclude_prefixes=self.exclude_dirs,
+                continue_on_failure=self.continue_on_failure,
             )

             # Recursively call the function to get the children of the children
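For orientation, here is a minimal sketch of how the new flag plays out from the caller's side, assuming the file above is langchain_community's RecursiveUrlLoader (the URL and max_depth below are illustrative, not part of the diff):

from langchain_community.document_loaders import RecursiveUrlLoader

# Default after this change: continue_on_failure=True, so a link that fails
# to fetch or parse is logged via logger.warning and skipped.
loader = RecursiveUrlLoader(
    "https://docs.python.org/3.9/",  # illustrative URL
    max_depth=2,
    continue_on_failure=True,
)
docs = loader.load()

# Strict mode: the first fetch/parse exception is re-raised to the caller.
strict_loader = RecursiveUrlLoader(
    "https://docs.python.org/3.9/",
    max_depth=2,
    continue_on_failure=False,
)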
4 changes: 2 additions & 2 deletions libs/community/langchain_community/vectorstores/kdbai.py
@@ -14,7 +14,7 @@


 class KDBAI(VectorStore):
-    """`KDB.AI` vector store [https://kdb.ai](https://kdb.ai)
+    """`KDB.AI` vector store.

     To use, you should have the `kdbai_client` python package installed.
@@ -25,7 +25,7 @@ class KDBAI(VectorStore):
         distance_strategy: One option from DistanceStrategy.EUCLIDEAN_DISTANCE,
             DistanceStrategy.DOT_PRODUCT or DistanceStrategy.COSINE.
-    See the example [notebook](https://github.com/KxSystems/langchain/blob/KDB.AI/docs/docs/integrations/vectorstores/kdbai.ipynb).
+    See the example https://github.com/KxSystems/langchain/blob/KDB.AI/docs/docs/integrations/vectorstores/kdbai.ipynb.
     """

     def __init__(
35 changes: 24 additions & 11 deletions libs/core/langchain_core/utils/html.py
@@ -1,7 +1,10 @@
+import logging
 import re
 from typing import List, Optional, Sequence, Union
 from urllib.parse import urljoin, urlparse

+logger = logging.getLogger(__name__)
+
 PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
 SUFFIXES_TO_IGNORE = (
     ".css",
@@ -52,6 +55,7 @@ def extract_sub_links(
     pattern: Union[str, re.Pattern, None] = None,
     prevent_outside: bool = True,
     exclude_prefixes: Sequence[str] = (),
+    continue_on_failure: bool = False,
 ) -> List[str]:
     """Extract all links from a raw html string and convert into absolute paths.
@@ -63,25 +67,34 @@ def extract_sub_links(
         prevent_outside: If True, ignore external links which are not children
             of the base url.
         exclude_prefixes: Exclude any URLs that start with one of these prefixes.
+        continue_on_failure: If True, continue if parsing a specific link raises an
+            exception. Otherwise, raise the exception.

     Returns:
         List[str]: sub links
     """
     base_url_to_use = base_url if base_url is not None else url
+    parsed_base_url = urlparse(base_url_to_use)
+    parsed_url = urlparse(url)
     all_links = find_all_links(raw_html, pattern=pattern)
     absolute_paths = set()
     for link in all_links:
-        parsed_link = urlparse(link)
-        # Some may be absolute links like https://to/path
-        if parsed_link.scheme == "http" or parsed_link.scheme == "https":
-            absolute_path = link
-        # Some may have omitted the protocol like //to/path
-        elif link.startswith("//"):
-            absolute_path = f"{urlparse(url).scheme}:{link}"
-        else:
-            absolute_path = urljoin(url, parsed_link.path)
-        absolute_paths.add(absolute_path)
+        try:
+            parsed_link = urlparse(link)
+            # Some may be absolute links like https://to/path
+            if parsed_link.scheme == "http" or parsed_link.scheme == "https":
+                absolute_path = link
+            # Some may have omitted the protocol like //to/path
+            elif link.startswith("//"):
+                absolute_path = f"{parsed_url.scheme}:{link}"
+            else:
+                absolute_path = urljoin(url, parsed_link.path)
+            absolute_paths.add(absolute_path)
+        except Exception as e:
+            if continue_on_failure:
+                logger.warning(f"Unable to load link {link}. Raised exception:\n\n{e}")
+                continue
+            else:
+                raise e

     results = []
     for path in absolute_paths:
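And a quick sketch of the two behaviors of extract_sub_links itself. The malformed href is an assumption about what makes urlparse raise (an unclosed IPv6-style bracket is one of the few inputs that does); the expected outputs are illustrative:

from langchain_core.utils.html import extract_sub_links

html = '<a href="/docs/intro"></a><a href="http://[::1"></a>'

# Default (continue_on_failure=False): the ValueError from urlparse propagates.
try:
    extract_sub_links(html, "https://example.com")
except ValueError as err:
    print(f"raised: {err}")  # expected: raised: Invalid IPv6 URL

# continue_on_failure=True: the bad link is logged via logger.warning and skipped.
links = extract_sub_links(html, "https://example.com", continue_on_failure=True)
print(links)  # expected: ['https://example.com/docs/intro']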
