From f3e3cd7c842e71e92f26144f806d72e027b17d60 Mon Sep 17 00:00:00 2001 From: silentninja Date: Fri, 27 Dec 2024 10:51:26 +0400 Subject: [PATCH 1/5] Add bs4 and resiliparse html parsers --- bs4_parser.py | 41 +++++++++++++++++++++++++++++++++++++++++ resiliparse_parser.py | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 bs4_parser.py create mode 100644 resiliparse_parser.py diff --git a/bs4_parser.py b/bs4_parser.py new file mode 100644 index 0000000..799a344 --- /dev/null +++ b/bs4_parser.py @@ -0,0 +1,41 @@ +from bs4 import BeautifulSoup +from bs4.dammit import EncodingDetector + + +class HTMLParser(object): + """ + HTML parser using BeautifulSoup4 + """ + + def html_to_text(self, html_tree: BeautifulSoup) -> str: + """ + Convert HTML content to plain text using BeautifulSoup4. + + Returns: + str: Extracted plain text with scripts and styles removed + """ + for script in html_tree(['script', 'style']): + script.extract() + text = html_tree.get_text(' ', strip=True) + return text + + def get_html_tree(self, page: bytes, encoding: str=None, features='lxml', **kwargs) -> BeautifulSoup: + """ + Return the HTML tree object + + Args: + page (bytes): Raw HTML content as bytes + encoding (str, optional): Specific character encoding to use. If None, auto-detection is attempted + features: Parser to be used (default='lxml'). Refer https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for supported parsers. + **kwargs: Additional arguments passed to BeautifulSoup constructor. + Refer here https://www.crummy.com/software/BeautifulSoup/bs4/doc/#bs4.BeautifulSoup for accepted arguments. 
+ + Returns: + BeautifulSoup: HTML tree object + """ + if not encoding: + for encoding in EncodingDetector(page, is_html=True).encodings: + # take the first detected encoding + break + soup = BeautifulSoup(page, features, from_encoding=encoding, **kwargs) + return soup \ No newline at end of file diff --git a/resiliparse_parser.py b/resiliparse_parser.py new file mode 100644 index 0000000..e70de1d --- /dev/null +++ b/resiliparse_parser.py @@ -0,0 +1,36 @@ +from resiliparse.extract.html2text import extract_plain_text +from resiliparse.parse import detect_encoding +from resiliparse.parse.html import HTMLTree + + +class HTMLParser(object): + """ + HTML parser using Resiliparse + """ + + def html_to_text(self, tree, **kwargs) -> str: + """ + Convert HTML content to plain text using Resiliparse. + + Returns: + str: Extracted plain text with scripts and styles removed + """ + text = extract_plain_text(tree, **kwargs) + return text + + def get_html_tree(self, page: bytes, encoding: str=None, **kwargs) -> HTMLTree: + """ + Get the HTML tree object + + Args: + page (bytes): Raw HTML content as bytes + encoding (str, optional): Specific character encoding to use. If None, auto-detection is attempted + **kwargs: Additional arguments passed to extract_plain_text: + Refer here https://resiliparse.chatnoir.eu/en/latest/api/extract/html2text.html#resiliparse.extract.html2text.extract_plain_text for accepted arguments. 
+ Returns: + str: Extracted plain text content + """ + if not encoding: + encoding = detect_encoding(page) + tree = HTMLTree.parse_from_bytes(page, encoding, **kwargs) + return tree \ No newline at end of file From c6b4ac9849e47bbd583d061f157fe0c5b5a0ede8 Mon Sep 17 00:00:00 2001 From: silentninja Date: Fri, 27 Dec 2024 10:53:42 +0400 Subject: [PATCH 2/5] Accept html parser as arguments --- sparkcc.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/sparkcc.py b/sparkcc.py index 6a99ec9..9e85f7e 100644 --- a/sparkcc.py +++ b/sparkcc.py @@ -71,7 +71,8 @@ def parse_arguments(self): arg_parser.add_argument("input", help=self.input_descr) arg_parser.add_argument("output", help=self.output_descr) - + arg_parser.add_argument("--html_parser", default="beautifulsoup", + help="HTML parser: beautifulsoup or resiliparse") arg_parser.add_argument("--input_base_url", help="Base URL (prefix) used if paths to WARC/WAT/WET " "files are relative paths. Used to select the " @@ -396,6 +397,22 @@ def get_warc_header(record: ArcWarcRecord, header: str, default: str=None): def get_http_headers(record: ArcWarcRecord): return record.http_headers.headers + def get_html_parser(self): + try: + if self.args.html_parser == 'beautifulsoup': + from bs4_parser import HTMLParser + return HTMLParser() + elif self.args.html_parser == 'resiliparse': + from resiliparse_parser import HTMLParser + return HTMLParser() + else: + raise ValueError( + "Unknown HTML parser: {}".format(self.args.html_parser) + ) + except ImportError as e: + raise ImportError(f"Failed to import HTML parser module '{self.args.html_parser}'." 
+ f" Please ensure the module is correctly added to PySpark cluster: {str(e)}") + @staticmethod def is_response_record(record: ArcWarcRecord): """Return true if WARC record is a WARC response record""" From 864241aa0e8e3469eb34fe209108366f90b1d9d3 Mon Sep 17 00:00:00 2001 From: silentninja Date: Fri, 27 Dec 2024 10:54:27 +0400 Subject: [PATCH 3/5] modify the index word count to use the html parsers --- cc_index_word_count.py | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/cc_index_word_count.py b/cc_index_word_count.py index 7ea88ca..d14d1f5 100644 --- a/cc_index_word_count.py +++ b/cc_index_word_count.py @@ -36,23 +36,6 @@ def reduce_by_key_func(a, b): # sum values of tuple return ((a[0] + b[0]), (a[1] + b[1])) - def html_to_text(self, page, record): - try: - encoding = self.get_warc_header(record, 'WARC-Identified-Content-Charset') - if not encoding: - for encoding in EncodingDetector(page, is_html=True).encodings: - # take the first detected encoding - break - soup = BeautifulSoup(page, 'lxml', from_encoding=encoding) - for script in soup(['script', 'style']): - script.extract() - return soup.get_text(' ', strip=True) - except Exception as e: - self.get_logger().error("Error converting HTML to text for {}: {}", - self.get_warc_header(record, 'WARC-Target-URI'), e) - self.records_parsing_failed.add(1) - return '' - def process_record(self, record): if not self.is_response_record(record): # skip over WARC request or metadata records @@ -60,8 +43,18 @@ def process_record(self, record): if not self.is_html(record): self.records_non_html.add(1) return - page = self.get_payload_stream(record).read() - text = self.html_to_text(page, record) + + text = "" + try: + page = self.get_payload_stream(record).read() + encoding = self.get_warc_header(record, 'WARC-Identified-Content-Charset') + parser = self.get_html_parser() + html_tree = parser.get_html_tree(page, encoding=encoding) + text = parser.html_to_text(html_tree) + 
except Exception as e: + self.get_logger().error("Error converting HTML to text for {}: {}", + self.get_warc_header(record, 'WARC-Target-URI'), e) + self.records_parsing_failed.add(1) words = map(lambda w: w.lower(), WordCountJob.word_pattern.findall(text)) for word, count in Counter(words).items(): From 2a0fd51b29bc2307eb70d565bca0847098456cff Mon Sep 17 00:00:00 2001 From: silentninja Date: Fri, 27 Dec 2024 10:55:02 +0400 Subject: [PATCH 4/5] Edit README.md to include instructions for the html parsers args --- README.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/README.md b/README.md index 9a4c110..707d0f7 100644 --- a/README.md +++ b/README.md @@ -207,6 +207,28 @@ Some differences between the warcio and FastWARC APIs are hidden from the user i However, it's recommended that you carefully verify that your custom job implementation works in combination with FastWARC. There are subtle differences between the warcio and FastWARC APIs, including the underlying classes (WARC/HTTP headers and stream implementations). In addition, FastWARC does not support for legacy ARC files and does not automatically decode HTTP content and transfer encodings (see [Resiliparse HTTP Tools](https://resiliparse.chatnoir.eu/en/latest/man/parse/http.html#read-chunked-http-payloads)). While content and transfer encodings are already decoded in Common Crawl WARC files, this may not be the case for WARC files from other sources. See also [WARC 1.1 specification, http/https response records](https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#http-and-https-schemes). + +### Configuring HTML Parsers + +The project supports two HTML parsers, [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) and [Resiliparse](https://resiliparse.chatnoir.eu/en/latest/man/installation.html), that can be specified using the `--html_parser` argument. +By default, BeautifulSoup is used for parsing the HTML files. 
+ +To configure the HTML parser: +- Use the `--html_parser` argument to specify the HTML parser to use when submitting the job +- Install the correct dependencies for the parser +- When running the job in a Spark cluster, include the correct parser module (bs4_parser.py for beautifulsoup or resiliparse_parser.py for resiliparse) via `--py-files` in addition to `sparkcc.py` and further job-specific Python files. See also [running in a Spark cluster](#running-in-spark-cluster-over-large-amounts-of-data). + +Below is an example call to count words in 10 WARC records hosted under the `.is` top-level domain using `Resiliparse`: +``` +$SPARK_HOME/bin/spark-submit \ + ./cc_index_word_count.py \ + --input_base_url s3://commoncrawl/ \ + --query "SELECT url, warc_filename, warc_record_offset, warc_record_length, content_charset FROM ccindex WHERE crawl = 'CC-MAIN-2020-24' AND subset = 'warc' AND url_host_tld = 'is' LIMIT 10" \ + s3a://commoncrawl/cc-index/table/cc-main/warc/ \ + myccindexwordcountoutput \ + --html_parser resiliparse +``` + ## Credits Examples are originally ported from Stephen Merity's [cc-mrjob](https://github.com/commoncrawl/cc-mrjob/) with the following changes and upgrades: From 5ae6b8688dab73656dfa29f218832c1ebd3ad722 Mon Sep 17 00:00:00 2001 From: silentninja Date: Mon, 30 Dec 2024 10:58:46 +0400 Subject: [PATCH 5/5] Add Resiliparse to requirements.txt --- requirements.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/requirements.txt b/requirements.txt index d693cd2..3d6d481 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,8 @@ lxml #fastwarc # (tested with) #fastwarc==0.14.1 + +# to parse HTML using Resiliparse (https://github.com/chatnoir-eu/chatnoir-resiliparse) +#Resiliparse +# (tested with) +#Resiliparse==0.15.1 \ No newline at end of file