added webscraper contrib
zilto authored and zilto committed Nov 9, 2023
1 parent 292b1ce commit f722539
Showing 7 changed files with 400 additions and 0 deletions.
20 changes: 20 additions & 0 deletions contrib/hamilton/contrib/user/zilto/webscraper/README.md
@@ -0,0 +1,20 @@
# Purpose of this module

This module implements a simple webscraper that collects text from the specified HTML tags and removes undesirable ones. Simply give it a list of URLs.

Retry logic for HTTP requests (random exponential backoff, up to 3 attempts) is implemented using the `tenacity` package.

# Configuration Options
## Config.when
This module doesn't receive any configuration.

## Inputs
- `urls` (Required): a list of valid URLs to scrape (see the usage sketch below)
- `tags_to_extract`: a list of HTML tags to extract (defaults to `["p", "li", "div"]`)
- `tags_to_remove`: a list of HTML tags to remove (defaults to `["script", "style"]`)
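
These inputs are passed at execution time through the Hamilton driver. A minimal run might look like the following sketch (the import path and URLs are illustrative; `Parallelizable`/`Collect` require dynamic execution to be enabled):

```python
# Hedged usage sketch -- assumes the contrib package is installed.
from hamilton import driver
from hamilton.contrib.user.zilto import webscraper

dr = (
    driver.Builder()
    .enable_dynamic_execution(allow_experimental_mode=True)  # required for Parallelizable/Collect
    .with_modules(webscraper)
    .build()
)

results = dr.execute(
    final_vars=["parsed_html_collection"],
    inputs={
        "urls": ["https://example.com", "https://example.org"],
        "tags_to_extract": ["p", "li", "div"],
        "tags_to_remove": ["script", "style"],
    },
)
```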

## Overrides
- `parsed_html`: if the default function doesn't provide enough flexibility, another parser can be provided, as long as it has the parameters `url` and `html_page` and returns a `ParsingResult` object. See the sketch below.
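
For instance, a minimal custom parser could look like this sketch (the `parsed` payload is illustrative):

```python
from hamilton.contrib.user.zilto.webscraper import ParsingResult


def parsed_html(url: str, html_page: str) -> ParsingResult:
    # Trivial replacement parser: store the page length instead of the extracted text.
    return ParsingResult(url=url, parsed=len(html_page))
```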

# Limitations
- The backoff and retry values are hardcoded; they need to be edited via the `tenacity` decorator of `html_page()`, as sketched below.
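
For example, to allow 5 attempts with a shorter backoff window, the decorator in `__init__.py` could be edited like so (values are illustrative):

```python
from tenacity import retry, stop_after_attempt, wait_random_exponential


@retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(5))
def html_page(url: str) -> str:
    ...
```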
96 changes: 96 additions & 0 deletions contrib/hamilton/contrib/user/zilto/webscraper/__init__.py
@@ -0,0 +1,96 @@
import logging
from typing import Any, List

logger = logging.getLogger(__name__)

from hamilton import contrib

with contrib.catch_import_errors(__name__, __file__, logger):
    from bs4 import BeautifulSoup
    import lxml  # noqa: F401
    import requests
    from tenacity import retry, stop_after_attempt, wait_random_exponential

import dataclasses

from hamilton.htypes import Collect, Parallelizable


@dataclasses.dataclass
class ParsingResult:
"""Result from the parsing function
:param url: url to the HTML page
:param parsed: the result of the parsing function
"""

url: str
parsed: Any


def url(urls: List[str]) -> Parallelizable[str]:
"""Iterate over the list of urls and create one branch per url
:param urls: list of url to scrape and parse
:return: a single url to scrape and parse
"""
for url_ in urls:
yield url_


@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(3))
def html_page(url: str) -> str:
"""Get the HTML page as string
The tenacity decorator sets the timeout and retry logic
:param url: a single url to request
:return: the HTML page as a string
"""
response = requests.get(url)
response.raise_for_status()
return response.text


def parsed_html(
    url: str,
    html_page: str,
    tags_to_extract: List[str] = ["p", "li", "div"],
    tags_to_remove: List[str] = ["script", "style"],
) -> ParsingResult:
    """Parse an HTML string using BeautifulSoup

    :param url: the URL of the requested page
    :param html_page: the HTML page associated with the URL
    :param tags_to_extract: HTML tags to extract and gather
    :param tags_to_remove: HTML tags to remove
    :return: a ParsingResult containing the URL and the parsed content
    """
    soup = BeautifulSoup(html_page, features="lxml")

    # drop undesirable elements before extracting content
    for tag in tags_to_remove:
        for element in soup.find_all(tag):
            element.decompose()

    content = []
    for tag in tags_to_extract:
        for element in soup.find_all(tag):
            # for anchor tags, keep the link target alongside the text
            if tag == "a":
                href = element.get("href")
                if href:
                    content.append(f"{element.get_text()} ({href})")
                else:
                    content.append(element.get_text(strip=True))
            else:
                content.append(element.get_text(strip=True))
    parsed_content = " ".join(content)

    return ParsingResult(url=url, parsed=parsed_content)


def parsed_html_collection(parsed_html: Collect[ParsingResult]) -> List[ParsingResult]:
"""Collect parallel branches of `parsed_html`
:param parsed_html: receive the ParsingResult associated with each url
:return: list of ParsingResult
"""
return list(parsed_html)
(One changed file could not be displayed.)
4 changes: 4 additions & 0 deletions contrib/hamilton/contrib/user/zilto/webscraper/requirements.txt
@@ -0,0 +1,4 @@
beautifulsoup4
lxml
requests
sf-hamilton[visualization]
273 changes: 273 additions & 0 deletions contrib/hamilton/contrib/user/zilto/webscraper/run.ipynb

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions contrib/hamilton/contrib/user/zilto/webscraper/tags.json
@@ -0,0 +1,7 @@
{
    "schema": "1.0",
    "use_case_tags": ["webscraper"],
    "secondary_tags": {
        "language": "English"
    }
}
Empty file.
