Add Configurable HTML Parser Wrappers for BeautifulSoup and Resiliparse #47

Open · wants to merge 5 commits into `main`
22 changes: 22 additions & 0 deletions README.md
@@ -207,6 +207,28 @@

However, it's recommended that you carefully verify that your custom job implementation works in combination with FastWARC. There are subtle differences between the warcio and FastWARC APIs, including the underlying classes (WARC/HTTP headers and stream implementations). In addition, FastWARC does not support legacy ARC files, and it does not automatically decode HTTP content and transfer encodings (see [Resiliparse HTTP Tools](https://resiliparse.chatnoir.eu/en/latest/man/parse/http.html#read-chunked-http-payloads)). While content and transfer encodings are already decoded in Common Crawl WARC files, this may not be the case for WARC files from other sources. See also [WARC 1.1 specification, http/https response records](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#http-and-https-schemes).


### Configuring HTML Parsers

The project supports two HTML parsers, [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) and [Resiliparse](https://resiliparse.chatnoir.eu/en/latest/man/installation.html), which can be selected using the `--html_parser` argument.
By default, BeautifulSoup is used to parse the HTML files.

To configure the HTML parser:
- use the `--html_parser` argument to specify the HTML parser when submitting the job
- install the dependencies required by the chosen parser (see `requirements.txt`)
- when running the job in a Spark cluster, include the matching parser module (`bs4_parser.py` for BeautifulSoup or `resiliparse_parser.py` for Resiliparse) via `--py-files`, in addition to `sparkcc.py` and further job-specific Python files, as shown in the second example below. See also [running in a Spark cluster](#running-in-spark-cluster-over-large-amounts-of-data).

Below is an example call counting words in 10 WARC records hosted under the `.is` top-level domain, using Resiliparse:
```
$SPARK_HOME/bin/spark-submit \
./cc_index_word_count.py \
--input_base_url s3://commoncrawl/ \
--query "SELECT url, warc_filename, warc_record_offset, warc_record_length, content_charset FROM ccindex WHERE crawl = 'CC-MAIN-2020-24' AND subset = 'warc' AND url_host_tld = 'is' LIMIT 10" \
s3a://commoncrawl/cc-index/table/cc-main/warc/ \
myccindexwordcountoutput \
--html_parser resiliparse
```
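
For cluster deployments, the sketch below extends the same call with `--py-files` so that `sparkcc.py` and the selected parser module are shipped to the executors. This is illustrative only; the master URL, deploy mode and further Spark options depend on your cluster setup:
```
$SPARK_HOME/bin/spark-submit \
    --py-files sparkcc.py,resiliparse_parser.py \
    ./cc_index_word_count.py \
    --input_base_url s3://commoncrawl/ \
    --query "SELECT url, warc_filename, warc_record_offset, warc_record_length, content_charset FROM ccindex WHERE crawl = 'CC-MAIN-2020-24' AND subset = 'warc' AND url_host_tld = 'is' LIMIT 10" \
    --html_parser resiliparse \
    s3a://commoncrawl/cc-index/table/cc-main/warc/ \
    myccindexwordcountoutput
```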

## Credits

Examples are originally ported from Stephen Merity's [cc-mrjob](https://github.com/commoncrawl/cc-mrjob/) with the following changes and upgrades:
Expand Down
41 changes: 41 additions & 0 deletions bs4_parser.py
@@ -0,0 +1,41 @@
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector


class HTMLParser(object):
"""
HTML parser using BeautifulSoup4
"""

def html_to_text(self, html_tree: BeautifulSoup) -> str:
"""
Convert HTML content to plain text using BeautifulSoup4.

Returns:
str: Extracted plain text with scripts and styles removed
"""
for script in html_tree(['script', 'style']):
script.extract()
text = html_tree.get_text(' ', strip=True)
return text

def get_html_tree(self, page: bytes, encoding: str=None, features='lxml', **kwargs) -> BeautifulSoup:
"""
Return the HTML tree object

Args:
page (bytes): Raw HTML content as bytes
            encoding (str, optional): Specific character encoding to use. If None, auto-detection is attempted.
            features: Parser to be used (default='lxml'). See https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for supported parsers.
            **kwargs: Additional arguments passed to the BeautifulSoup constructor.
                See https://www.crummy.com/software/BeautifulSoup/bs4/doc/#bs4.BeautifulSoup for accepted arguments.

Returns:
BeautifulSoup: HTML tree object
"""
if not encoding:
for encoding in EncodingDetector(page, is_html=True).encodings:
# take the first detected encoding
break
soup = BeautifulSoup(page, features, from_encoding=encoding, **kwargs)
return soup
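
A minimal usage sketch for this wrapper (assuming `bs4_parser.py` is on the Python path and `lxml` is installed; the sample page is made up for illustration):

```
from bs4_parser import HTMLParser

parser = HTMLParser()
page = b"<html><head><script>var x = 1;</script></head>" \
       b"<body><p>Hello &amp; welcome</p></body></html>"

# encoding is auto-detected by EncodingDetector when not given explicitly
tree = parser.get_html_tree(page)
print(parser.html_to_text(tree))  # -> "Hello & welcome"
```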
31 changes: 12 additions & 19 deletions cc_index_word_count.py
@@ -36,32 +36,25 @@ def reduce_by_key_func(a, b):
# sum values of tuple <term_frequency, document_frequency>
return ((a[0] + b[0]), (a[1] + b[1]))

-    def html_to_text(self, page, record):
-        try:
-            encoding = self.get_warc_header(record, 'WARC-Identified-Content-Charset')
-            if not encoding:
-                for encoding in EncodingDetector(page, is_html=True).encodings:
-                    # take the first detected encoding
-                    break
-            soup = BeautifulSoup(page, 'lxml', from_encoding=encoding)
-            for script in soup(['script', 'style']):
-                script.extract()
-            return soup.get_text(' ', strip=True)
-        except Exception as e:
-            self.get_logger().error("Error converting HTML to text for {}: {}",
-                                    self.get_warc_header(record, 'WARC-Target-URI'), e)
-            self.records_parsing_failed.add(1)
-            return ''
-
    def process_record(self, record):
        if not self.is_response_record(record):
            # skip over WARC request or metadata records
            return
        if not self.is_html(record):
            self.records_non_html.add(1)
            return
-        page = self.get_payload_stream(record).read()
-        text = self.html_to_text(page, record)
+
+        text = ""
+        try:
+            page = self.get_payload_stream(record).read()
+            encoding = self.get_warc_header(record, 'WARC-Identified-Content-Charset')
+            parser = self.get_html_parser()
+            html_tree = parser.get_html_tree(page, encoding=encoding)
+            text = parser.html_to_text(html_tree)
+        except Exception as e:
+            self.get_logger().error("Error converting HTML to text for {}: {}",
+                                    self.get_warc_header(record, 'WARC-Target-URI'), e)
+            self.records_parsing_failed.add(1)
words = map(lambda w: w.lower(),
WordCountJob.word_pattern.findall(text))
for word, count in Counter(words).items():
5 changes: 5 additions & 0 deletions requirements.txt
@@ -18,3 +18,8 @@ lxml
#fastwarc
# (tested with)
#fastwarc==0.14.1

# to parse HTML using Resiliparse (https://github.com/chatnoir-eu/chatnoir-resiliparse)
#Resiliparse
# (tested with)
#Resiliparse==0.15.1
36 changes: 36 additions & 0 deletions resiliparse_parser.py
@@ -0,0 +1,36 @@
from resiliparse.extract.html2text import extract_plain_text
from resiliparse.parse import detect_encoding
from resiliparse.parse.html import HTMLTree


class HTMLParser(object):
"""
HTML parser using Resiliparse
"""

def html_to_text(self, tree, **kwargs) -> str:
"""
Convert HTML content to plain text using Resiliparse.

Returns:
str: Extracted plain text with scripts and styles removed
"""
text = extract_plain_text(tree, **kwargs)
return text

def get_html_tree(self, page: bytes, encoding: str=None, **kwargs) -> HTMLTree:
"""
Get the HTML tree object

Args:
page (bytes): Raw HTML content as bytes
encoding (str, optional): Specific character encoding to use. If None, auto-detection is attempted
            **kwargs: Additional arguments passed to HTMLTree.parse_from_bytes.

        Returns:
            HTMLTree: HTML tree object
"""
if not encoding:
encoding = detect_encoding(page)
tree = HTMLTree.parse_from_bytes(page, encoding, **kwargs)
return tree
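
A minimal usage sketch for this wrapper (assuming `resiliparse_parser.py` is on the Python path and Resiliparse is installed; the sample page is made up for illustration):

```
from resiliparse_parser import HTMLParser

parser = HTMLParser()
page = b"<html><body><p>Hello world</p></body></html>"

# encoding is auto-detected via detect_encoding() when not given explicitly
tree = parser.get_html_tree(page)

# keyword arguments would be forwarded to extract_plain_text here
print(parser.html_to_text(tree))  # -> "Hello world"
```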
19 changes: 18 additions & 1 deletion sparkcc.py
@@ -71,7 +71,8 @@ def parse_arguments(self):

arg_parser.add_argument("input", help=self.input_descr)
arg_parser.add_argument("output", help=self.output_descr)

arg_parser.add_argument("--html_parser", default="beautifulsoup",
help="HTML parser: beautifulsoup or resiliparse")
arg_parser.add_argument("--input_base_url",
help="Base URL (prefix) used if paths to WARC/WAT/WET "
"files are relative paths. Used to select the "
@@ -396,6 +397,22 @@ def get_warc_header(record: ArcWarcRecord, header: str, default: str=None):
def get_http_headers(record: ArcWarcRecord):
return record.http_headers.headers

def get_html_parser(self):
try:
if self.args.html_parser == 'beautifulsoup':
from bs4_parser import HTMLParser
return HTMLParser()
elif self.args.html_parser == 'resiliparse':
from resiliparse_parser import HTMLParser
return HTMLParser()
else:
raise ValueError(
"Unknown HTML parser: {}".format(self.args.html_parser)
)
        except ImportError as e:
            raise ImportError(
                f"Failed to import HTML parser module for '{self.args.html_parser}'."
                f" Please ensure the module is shipped to the Spark cluster,"
                f" e.g. via --py-files: {e}")

@staticmethod
def is_response_record(record: ArcWarcRecord):
"""Return true if WARC record is a WARC response record"""