Commit f722539
zilto authored and zilto committed Nov 9, 2023
1 parent 292b1ce
Showing 7 changed files with 400 additions and 0 deletions.
@@ -0,0 +1,20 @@
# Purpose of this module

This module implements a simple web scraper that collects the specified HTML tags and removes undesirable ones. Simply give it a list of URLs.

Timeout and retry logic for HTTP requests is implemented using the `tenacity` package.

# Configuration Options
## Config.when
This module doesn't receive any configuration.

## Inputs
- `urls` (Required): a list of valid URLs to scrape
- `tags_to_extract`: a list of HTML tags to extract
- `tags_to_remove`: a list of HTML tags to remove

## Overrides
- `parsed_html`: if the function doesn't provide enough flexibility, another parser can be provided, as long as it accepts the parameters `url` and `html_page` and returns a `ParsingResult` object, as in the sketch below.
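For example, here is a minimal sketch of a compatible override that keeps only the page title. The function name, the `url` and `html_page` parameters, and the `ParsingResult` return type are the required contract; the body is purely illustrative:

```python
from bs4 import BeautifulSoup

# ParsingResult is the dataclass defined in this module's __init__.py
def parsed_html(url: str, html_page: str) -> ParsingResult:
    """Illustrative override: extract only the <title> text."""
    soup = BeautifulSoup(html_page, features="lxml")
    title = soup.title.get_text(strip=True) if soup.title else ""
    return ParsingResult(url=url, parsed=title)
```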
# Limitations
- The timeout and retry values need to be edited via the decorator of `html_page()`, as sketched below.
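For instance, a hedged sketch of such an edit, with illustrative values; note that the per-request `timeout` argument is an addition and not part of the original function:

```python
import requests
from tenacity import retry, stop_after_attempt, stop_after_delay, wait_random_exponential

@retry(
    wait=wait_random_exponential(min=1, max=60),        # illustrative backoff bounds
    stop=stop_after_attempt(5) | stop_after_delay(30),  # tenacity stop conditions combine with |
)
def html_page(url: str) -> str:
    response = requests.get(url, timeout=10)  # per-request timeout; not in the original
    response.raise_for_status()
    return response.text
```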
96 changes: 96 additions & 0 deletions
contrib/hamilton/contrib/user/zilto/webscraper/__init__.py
@@ -0,0 +1,96 @@
import logging
from typing import Any, List

logger = logging.getLogger(__name__)

from hamilton import contrib

with contrib.catch_import_errors(__name__, __file__, logger):
    from bs4 import BeautifulSoup
    import lxml  # noqa: F401
    import requests
    from tenacity import retry, stop_after_attempt, wait_random_exponential

import dataclasses

from hamilton.htypes import Collect, Parallelizable


@dataclasses.dataclass
class ParsingResult:
    """Result from the parsing function

    :param url: url of the HTML page
    :param parsed: the result of the parsing function
    """

    url: str
    parsed: Any


def url(urls: List[str]) -> Parallelizable[str]:
    """Iterate over the list of urls and create one branch per url

    :param urls: list of urls to scrape and parse
    :return: a single url to scrape and parse
    """
    for url_ in urls:
        yield url_


@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(3))
def html_page(url: str) -> str:
    """Get the HTML page as a string

    The tenacity decorator sets the timeout and retry logic.

    :param url: a single url to request
    :return: the HTML page as a string
    """
    response = requests.get(url)
    response.raise_for_status()
    return response.text


def parsed_html(
    url: str,
    html_page: str,
    tags_to_extract: List[str] = ["p", "li", "div"],
    tags_to_remove: List[str] = ["script", "style"],
) -> ParsingResult:
    """Parse an HTML string using BeautifulSoup

    :param url: the url of the requested page
    :param html_page: the HTML page associated with the url
    :param tags_to_extract: HTML tags to extract and gather
    :param tags_to_remove: HTML tags to remove
    :return: the ParsingResult, which contains the url and the parsing results
    """
    soup = BeautifulSoup(html_page, features="lxml")

    # remove undesirable tags and their children from the tree
    for tag in tags_to_remove:
        for element in soup.find_all(tag):
            element.decompose()

    # collect the text of the desired tags; keep the href for anchor tags
    content = []
    for tag in tags_to_extract:
        for element in soup.find_all(tag):
            if tag == "a":
                href = element.get("href")
                if href:
                    content.append(f"{element.get_text()} ({href})")
                else:
                    content.append(element.get_text(strip=True))
            else:
                content.append(element.get_text(strip=True))
    content = " ".join(content)

    return ParsingResult(url=url, parsed=content)


def parsed_html_collection(parsed_html: Collect[ParsingResult]) -> List[ParsingResult]:
    """Collect parallel branches of `parsed_html`

    :param parsed_html: receives the ParsingResult associated with each url
    :return: list of ParsingResult
    """
    return list(parsed_html)
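A hedged sketch of how this dataflow might be driven; the module import name, target list, and URL are illustrative, and `Parallelizable`/`Collect` require Hamilton's dynamic execution to be enabled:

```python
# illustrative driver script; assumes this file is importable as `webscraper`
from hamilton import driver

import webscraper

dr = (
    driver.Builder()
    .with_modules(webscraper)
    .enable_dynamic_execution(allow_experimental_mode=True)  # required for Parallelizable/Collect
    .build()
)

results = dr.execute(
    ["parsed_html_collection"],
    inputs={"urls": ["https://example.com"]},
)
print(results["parsed_html_collection"])
```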
4 changes: 4 additions & 0 deletions
contrib/hamilton/contrib/user/zilto/webscraper/requirements.txt
@@ -0,0 +1,4 @@
beautifulsoup4
lxml
requests
sf-hamilton[visualization]
273 changes: 273 additions & 0 deletions
contrib/hamilton/contrib/user/zilto/webscraper/run.ipynb
Large diffs are not rendered by default.
@@ -0,0 +1,7 @@
{
  "schema": "1.0",
  "use_case_tags": ["webscraper"],
  "secondary_tags": {
    "language": "English"
  }
}
Empty file.