Commit f722539
zilto authored and zilto committed Nov 9, 2023
1 parent 292b1ce
Showing 7 changed files with 400 additions and 0 deletions.
@@ -0,0 +1,20 @@
# Purpose of this module

This module implements a simple web scraper that collects the specified HTML tags and removes undesirable ones. Simply give it a list of URLs.

Timeout and retry logic for HTTP requests is implemented using the `tenacity` package.

# Configuration Options
## Config.when
This module doesn't receive any configuration.

## Inputs
- `urls` (Required): a list of valid URLs to scrape
- `tags_to_extract`: a list of HTML tags to extract
- `tags_to_remove`: a list of HTML tags to remove

## Overrides
- `parsed_html`: if the function doesn't provide enough flexibility, another parser can be provided, as long as it accepts the parameters `url` and `html_page` and returns a `ParsingResult` object, as in the sketch below.
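For example, here is a minimal sketch of a compatible override that keeps only the page title. The function name, the `url` and `html_page` parameters, and the `ParsingResult` return type are the required contract; the body is purely illustrative:

```python
from bs4 import BeautifulSoup

# ParsingResult is the dataclass defined in this module's __init__.py
def parsed_html(url: str, html_page: str) -> ParsingResult:
    """Illustrative override: extract only the <title> text."""
    soup = BeautifulSoup(html_page, features="lxml")
    title = soup.title.get_text(strip=True) if soup.title else ""
    return ParsingResult(url=url, parsed=title)
```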
# Limitations
- The timeout and retry values need to be edited via the decorator of `html_page()`, as sketched below.
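For instance, a hedged sketch of such an edit, with illustrative values; note that the per-request `timeout` argument is an addition and not part of the original function:

```python
import requests
from tenacity import retry, stop_after_attempt, stop_after_delay, wait_random_exponential

@retry(
    wait=wait_random_exponential(min=1, max=60),        # illustrative backoff bounds
    stop=stop_after_attempt(5) | stop_after_delay(30),  # tenacity stop conditions combine with |
)
def html_page(url: str) -> str:
    response = requests.get(url, timeout=10)  # per-request timeout; not in the original
    response.raise_for_status()
    return response.text
```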
96 changes: 96 additions & 0 deletions
contrib/hamilton/contrib/user/zilto/webscraper/__init__.py
@@ -0,0 +1,96 @@
import logging
from typing import Any, List

logger = logging.getLogger(__name__)

from hamilton import contrib

with contrib.catch_import_errors(__name__, __file__, logger):
    from bs4 import BeautifulSoup
    import lxml  # noqa: F401
    import requests
    from tenacity import retry, stop_after_attempt, wait_random_exponential

import dataclasses

from hamilton.htypes import Collect, Parallelizable


@dataclasses.dataclass
class ParsingResult:
    """Result from the parsing function

    :param url: url of the HTML page
    :param parsed: the result of the parsing function
    """

    url: str
    parsed: Any


def url(urls: List[str]) -> Parallelizable[str]:
    """Iterate over the list of urls and create one branch per url

    :param urls: list of urls to scrape and parse
    :return: a single url to scrape and parse
    """
    for url_ in urls:
        yield url_


@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(3))
def html_page(url: str) -> str:
    """Get the HTML page as a string

    The tenacity decorator sets the timeout and retry logic.

    :param url: a single url to request
    :return: the HTML page as a string
    """
    response = requests.get(url)
    response.raise_for_status()
    return response.text


def parsed_html(
    url: str,
    html_page: str,
    tags_to_extract: List[str] = ["p", "li", "div"],
    tags_to_remove: List[str] = ["script", "style"],
) -> ParsingResult:
    """Parse an HTML string using BeautifulSoup

    :param url: the url of the requested page
    :param html_page: the HTML page associated with the url
    :param tags_to_extract: HTML tags to extract and gather
    :param tags_to_remove: HTML tags to remove
    :return: the ParsingResult, which contains the url and the parsing results
    """
    soup = BeautifulSoup(html_page, features="lxml")

    # remove undesirable tags and their children from the tree
    for tag in tags_to_remove:
        for element in soup.find_all(tag):
            element.decompose()

    # collect the text of the desired tags; keep the href for anchor tags
    content = []
    for tag in tags_to_extract:
        for element in soup.find_all(tag):
            if tag == "a":
                href = element.get("href")
                if href:
                    content.append(f"{element.get_text()} ({href})")
                else:
                    content.append(element.get_text(strip=True))
            else:
                content.append(element.get_text(strip=True))
    content = " ".join(content)

    return ParsingResult(url=url, parsed=content)


def parsed_html_collection(parsed_html: Collect[ParsingResult]) -> List[ParsingResult]:
    """Collect parallel branches of `parsed_html`

    :param parsed_html: receives the ParsingResult associated with each url
    :return: list of ParsingResult
    """
    return list(parsed_html)
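A hedged sketch of how this dataflow might be driven; the module import name, target list, and URL are illustrative, and `Parallelizable`/`Collect` require Hamilton's dynamic execution to be enabled:

```python
# illustrative driver script; assumes this file is importable as `webscraper`
from hamilton import driver

import webscraper

dr = (
    driver.Builder()
    .with_modules(webscraper)
    .enable_dynamic_execution(allow_experimental_mode=True)  # required for Parallelizable/Collect
    .build()
)

results = dr.execute(
    ["parsed_html_collection"],
    inputs={"urls": ["https://example.com"]},
)
print(results["parsed_html_collection"])
```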
4 changes: 4 additions & 0 deletions
contrib/hamilton/contrib/user/zilto/webscraper/requirements.txt
@@ -0,0 +1,4 @@
beautifulsoup4
lxml
requests
sf-hamilton[visualization]
273 changes: 273 additions & 0 deletions
contrib/hamilton/contrib/user/zilto/webscraper/run.ipynb
Large diffs are not rendered by default.
@@ -0,0 +1,7 @@
{
  "schema": "1.0",
  "use_case_tags": ["webscraper"],
  "secondary_tags": {
    "language": "English"
  }
}
Empty file.