core[patch], community[patch]: link extraction continue on failure #17200

Merged (2 commits) on Feb 7, 2024
libs/community/langchain_community/document_loaders/recursive_url_loader.py
@@ -93,6 +93,7 @@ def __init__(
         link_regex: Union[str, re.Pattern, None] = None,
         headers: Optional[dict] = None,
         check_response_status: bool = False,
+        continue_on_failure: bool = True,
     ) -> None:
         """Initialize with URL to crawl and any subdirectories to exclude.

@@ -117,6 +118,8 @@ def __init__(
             link_regex: Regex for extracting sub-links from the raw html of a web page.
             check_response_status: If True, check HTTP response status and skip
                 URLs with error responses (400-599).
+            continue_on_failure: If True, continue if getting or parsing a link raises
+                an exception. Otherwise, raise the exception.
         """

         self.url = url
@@ -142,6 +145,7 @@ def __init__(
         self._lock = asyncio.Lock() if self.use_async else None
         self.headers = headers
         self.check_response_status = check_response_status
+        self.continue_on_failure = continue_on_failure

     def _get_child_links_recursive(
         self, url: str, visited: Set[str], *, depth: int = 0
@@ -164,11 +168,14 @@ def _get_child_links_recursive(
             if self.check_response_status and 400 <= response.status_code <= 599:
                 raise ValueError(f"Received HTTP status {response.status_code}")
         except Exception as e:
-            logger.warning(
-                f"Unable to load from {url}. Received error {e} of type "
-                f"{e.__class__.__name__}"
-            )
-            return
+            if self.continue_on_failure:
+                logger.warning(
+                    f"Unable to load from {url}. Received error {e} of type "
+                    f"{e.__class__.__name__}"
+                )
+                return
+            else:
+                raise e
         content = self.extractor(response.text)
         if content:
             yield Document(
@@ -184,6 +191,7 @@ def _get_child_links_recursive(
             pattern=self.link_regex,
             prevent_outside=self.prevent_outside,
             exclude_prefixes=self.exclude_dirs,
+            continue_on_failure=self.continue_on_failure,
         )
         for link in sub_links:
             # Check all unvisited links
@@ -237,13 +245,16 @@ async def _async_get_child_links_recursive(
                 if self.check_response_status and 400 <= response.status <= 599:
                     raise ValueError(f"Received HTTP status {response.status}")
         except (aiohttp.client_exceptions.InvalidURL, Exception) as e:
-            logger.warning(
-                f"Unable to load {url}. Received error {e} of type "
-                f"{e.__class__.__name__}"
-            )
             if close_session:
                 await session.close()
-            return []
+            if self.continue_on_failure:
+                logger.warning(
+                    f"Unable to load {url}. Received error {e} of type "
+                    f"{e.__class__.__name__}"
+                )
+                return []
+            else:
+                raise e
         results = []
         content = self.extractor(text)
         if content:
@@ -261,6 +272,7 @@ async def _async_get_child_links_recursive(
             pattern=self.link_regex,
             prevent_outside=self.prevent_outside,
             exclude_prefixes=self.exclude_dirs,
+            continue_on_failure=self.continue_on_failure,
         )

         # Recursively call the function to get the children of the children
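Net effect on the loader: with the flag left at its default of `True`, a URL that fails to fetch or parse is still logged and skipped as before, while `False` now surfaces the exception to the caller. A minimal usage sketch of both modes; the target URL and `max_depth` value are illustrative, not taken from this PR:

```python
# Sketch: both modes of the new continue_on_failure flag on RecursiveUrlLoader.
# The URL and max_depth below are invented for illustration.
from langchain_community.document_loaders import RecursiveUrlLoader

# Default (continue_on_failure=True): a page that fails to fetch or parse is
# logged via logger.warning and skipped, and the crawl keeps going.
lenient = RecursiveUrlLoader("https://docs.python.org/3/", max_depth=2)
docs = lenient.load()

# Strict mode: the first fetch/parse error propagates out of load().
strict = RecursiveUrlLoader(
    "https://docs.python.org/3/",
    max_depth=2,
    continue_on_failure=False,
)
try:
    docs = strict.load()
except Exception as e:
    print(f"Crawl aborted: {e.__class__.__name__}: {e}")
```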
4 changes: 2 additions & 2 deletions libs/community/langchain_community/vectorstores/kdbai.py
@@ -14,7 +14,7 @@


 class KDBAI(VectorStore):
-    """`KDB.AI` vector store [https://kdb.ai](https://kdb.ai)
+    """`KDB.AI` vector store.

     To use, you should have the `kdbai_client` python package installed.

@@ -25,7 +25,7 @@ class KDBAI(VectorStore):
         distance_strategy: One option from DistanceStrategy.EUCLIDEAN_DISTANCE,
             DistanceStrategy.DOT_PRODUCT or DistanceStrategy.COSINE.

-    See the example [notebook](https://github.com/KxSystems/langchain/blob/KDB.AI/docs/docs/integrations/vectorstores/kdbai.ipynb).
+    See the example https://github.com/KxSystems/langchain/blob/KDB.AI/docs/docs/integrations/vectorstores/kdbai.ipynb.
     """

     def __init__(
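The kdbai.py change is documentation-only: markdown-style links in the class docstring are replaced with bare text and URLs. For orientation only, a hypothetical instantiation sketch; the `kdbai_client` session/table setup and the embeddings argument are assumptions inferred from the docstring, not anything this PR changes:

```python
# Hypothetical KDB.AI setup; the endpoint, table name, and embedding model are
# invented, and the kdbai_client calls are assumed from its docs, not this PR.
import kdbai_client as kdbai
from langchain_community.vectorstores import KDBAI
from langchain_openai import OpenAIEmbeddings

session = kdbai.Session(endpoint="http://localhost:8082")  # assumed local server
table = session.table("documents")  # assumed pre-created table
store = KDBAI(table, OpenAIEmbeddings())
store.add_texts(["KDB.AI is a vector database."])
```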
35 changes: 24 additions & 11 deletions libs/core/langchain_core/utils/html.py
@@ -1,7 +1,10 @@
+import logging
 import re
 from typing import List, Optional, Sequence, Union
 from urllib.parse import urljoin, urlparse

+logger = logging.getLogger(__name__)
+
 PREFIXES_TO_IGNORE = ("javascript:", "mailto:", "#")
 SUFFIXES_TO_IGNORE = (
     ".css",
@@ -52,6 +55,7 @@ def extract_sub_links(
     pattern: Union[str, re.Pattern, None] = None,
     prevent_outside: bool = True,
     exclude_prefixes: Sequence[str] = (),
+    continue_on_failure: bool = False,
 ) -> List[str]:
     """Extract all links from a raw html string and convert into absolute paths.

@@ -63,25 +67,34 @@ def extract_sub_links(
         prevent_outside: If True, ignore external links which are not children
             of the base url.
         exclude_prefixes: Exclude any URLs that start with one of these prefixes.
-
+        continue_on_failure: If True, continue if parsing a specific link raises an
+            exception. Otherwise, raise the exception.
     Returns:
         List[str]: sub links
     """
     base_url_to_use = base_url if base_url is not None else url
     parsed_base_url = urlparse(base_url_to_use)
+    parsed_url = urlparse(url)
     all_links = find_all_links(raw_html, pattern=pattern)
     absolute_paths = set()
     for link in all_links:
-        parsed_link = urlparse(link)
-        # Some may be absolute links like https://to/path
-        if parsed_link.scheme == "http" or parsed_link.scheme == "https":
-            absolute_path = link
-        # Some may have omitted the protocol like //to/path
-        elif link.startswith("//"):
-            absolute_path = f"{urlparse(url).scheme}:{link}"
-        else:
-            absolute_path = urljoin(url, parsed_link.path)
-        absolute_paths.add(absolute_path)
+        try:
+            parsed_link = urlparse(link)
+            # Some may be absolute links like https://to/path
+            if parsed_link.scheme == "http" or parsed_link.scheme == "https":
+                absolute_path = link
+            # Some may have omitted the protocol like //to/path
+            elif link.startswith("//"):
+                absolute_path = f"{parsed_url.scheme}:{link}"
+            else:
+                absolute_path = urljoin(url, parsed_link.path)
+            absolute_paths.add(absolute_path)
+        except Exception as e:
+            if continue_on_failure:
+                logger.warning(f"Unable to load link {link}. Raised exception:\n\n{e}")
+                continue
+            else:
+                raise e

     results = []
     for path in absolute_paths:
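On the core side, `extract_sub_links` gets the same per-link policy, but note the asymmetry in defaults: `False` here, so existing direct callers keep strict behavior, versus `True` on the loader, which passes its own flag through. A short sketch of the difference; the HTML and URLs are invented, and the failure path relies on the fact that `urlparse` raises `ValueError` ("Invalid IPv6 URL") for a malformed href like `http://[`:

```python
# Sketch: extract_sub_links with and without continue_on_failure.
# The raw_html and URLs are invented for illustration.
from langchain_core.utils.html import extract_sub_links

raw_html = (
    '<a href="/guide">guide</a>'
    '<a href="http://[">broken</a>'  # urlparse raises ValueError on this href
    '<a href="https://example.com/api">api</a>'
)

# Default (continue_on_failure=False): the broken link aborts extraction.
try:
    extract_sub_links(raw_html, "https://example.com/")
except ValueError as e:
    print(f"aborted: {e}")

# Lenient mode: the broken link is logged via logger.warning and skipped,
# while the well-formed links still come back as absolute URLs.
links = extract_sub_links(raw_html, "https://example.com/", continue_on_failure=True)
print(sorted(links))  # ['https://example.com/api', 'https://example.com/guide']
```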