Skip to content

Commit

Permalink
Fixed check crashing when RF merged results contained HTML, refactore…
Browse files Browse the repository at this point in the history
…d HTML stripping (closes #176)
  • Loading branch information
simonmeggle committed Nov 4, 2021
1 parent 2851fa0 commit 862b77e
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 119 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased

### Fixed

* Check: Crashed when Robot Framework merged results contained HTML; refactored HTML stripping (#176)

## 1.2.1.2 - 2021-09-30

### Changed
Expand Down
78 changes: 18 additions & 60 deletions checks/v1/robotmk
Original file line number Diff line number Diff line change
Expand Up @@ -480,13 +480,19 @@ class RobotItem(object):

@property
def text(self):
text = self._text

# Remove nasty Playwright log header lines and log messages
text = re.sub('Note: use DEBUG=pw:api environment variable to capture Playwright logs.', '', text)
text = re.sub('={60}|={27} logs ={27}', '', text)

# Return plain text if the text has an HTML prefix.
# This is necessary for test messages created by rebot after merging test results.
# Change when this https://github.com/robotframework/robotframework/issues/4068 has been solved.
if self._text.startswith('*HTML* '):
return html_to_text(self._text).replace('*HTML* ', '')
# Change when this https://github.com/robotframework/robotframework/issues/4068 has been solved.
if text.startswith('*HTML* '):
return html_to_text(text)
else:
return self._text
return text

@text.setter
def text(self, text):
Expand Down Expand Up @@ -1228,66 +1234,18 @@ def roundup(number, ndigits=0, return_type=None):
return return_type(result)


"""
HTML <-> text conversions.
http://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
"""

def html_to_text(html):

class _HTMLToText(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self._buf = []
self.hide_output = False

def handle_starttag(self, tag, attrs):
if tag in ('p', 'br') and not self.hide_output:
self._buf.append('\n')
elif tag in ('script', 'style'):
self.hide_output = True

def handle_startendtag(self, tag, attrs):
if tag == 'br':
self._buf.append('\n')

def handle_endtag(self, tag):
if tag == 'p':
self._buf.append('\n')
elif tag in ('script', 'style'):
self.hide_output = False

def handle_data(self, text):
if text and not self.hide_output:
self._buf.append(re.sub(r'\s+', ' ', text))

def handle_entityref(self, name):
if name in name2codepoint and not self.hide_output:
c = chr(name2codepoint[name])
self._buf.append(c)

def handle_charref(self, name):
if not self.hide_output:
n = int(name[1:], 16) if name.startswith('x') else int(name)
self._buf.append(chr(n))

def get_text(self):
return re.sub(r' +', ' ', ''.join(self._buf))

"""
Given a piece of HTML, return the plain text it contains.
This handles entities and char refs, but not javascript and stylesheets.
"""
try:
from html.parser import HTMLParser
from html.entities import name2codepoint
parser = _HTMLToText()
parser.feed(html)
parser.close()
return parser.get_text()
except:
# on Checkmk1 there is no HTML.Parser; :-/
return html
# Remove Prefix
html = re.sub('\*HTML\* ', '', html)
html = re.sub('<span class="merge">Test has been re-executed and results merged.</span>', 'Test has been re-executed and the results were merged: ', html)

html = re.sub('<script.*?>|</script>|<style.*?>|<style>|<span.*?>|</span>|<a.*?>|</a>|<hr>', '', html)
html = re.sub('<br>|<p>', '\\n', html)
return html


# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# V1 Specific functions
Expand Down
75 changes: 16 additions & 59 deletions checks/v2/robotmk.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,13 +489,19 @@ def __init__(self, xmlnode, lv_abs, lv_rel, parent, index=None):

@property
def text(self):
text = self._text

# Remove nasty Playwright log header lines and log messages
text = re.sub('Note: use DEBUG=pw:api environment variable to capture Playwright logs.', '', text)
text = re.sub('={60}|={27} logs ={27}', '', text)

# Return plain text if the text has an HTML prefix.
# This is necessary for test messages created by rebot after merging test results.
# Change when this https://github.com/robotframework/robotframework/issues/4068 has been solved.
if self._text.startswith('*HTML* '):
return html_to_text(self._text).replace('*HTML* ', '')
if text.startswith('*HTML* '):
return html_to_text(text)
else:
return self._text
return text

@text.setter
def text(self, text):
Expand Down Expand Up @@ -1243,66 +1249,17 @@ def roundup(number, ndigits=0, return_type=None):
return return_type(result)


"""
HTML <-> text conversions.
http://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
"""

def html_to_text(html):

class _HTMLToText(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self._buf = []
self.hide_output = False

def handle_starttag(self, tag, attrs):
if tag in ('p', 'br') and not self.hide_output:
self._buf.append('\n')
elif tag in ('script', 'style'):
self.hide_output = True

def handle_startendtag(self, tag, attrs):
if tag == 'br':
self._buf.append('\n')

def handle_endtag(self, tag):
if tag == 'p':
self._buf.append('\n')
elif tag in ('script', 'style'):
self.hide_output = False

def handle_data(self, text):
if text and not self.hide_output:
self._buf.append(re.sub(r'\s+', ' ', text))

def handle_entityref(self, name):
if name in name2codepoint and not self.hide_output:
c = chr(name2codepoint[name])
self._buf.append(c)

def handle_charref(self, name):
if not self.hide_output:
n = int(name[1:], 16) if name.startswith('x') else int(name)
self._buf.append(chr(n))

def get_text(self):
return re.sub(r' +', ' ', ''.join(self._buf))

"""
Given a piece of HTML, return the plain text it contains.
This handles entities and char refs, but not javascript and stylesheets.
"""
try:
from html.parser import HTMLParser
from html.entities import name2codepoint
parser = _HTMLToText()
parser.feed(html)
parser.close()
return parser.get_text()
except:
# on Checkmk1 there is no HTML.Parser; :-/
return html
# Remove Prefix
html = re.sub('\*HTML\* ', '', html)
html = re.sub('<span class="merge">Test has been re-executed and results merged.</span>', 'Test has been re-executed and the results were merged: ', html)

html = re.sub('<script.*?>|</script>|<style.*?>|<style>|<span.*?>|</span>|<a.*?>|</a>|<hr>', '', html)
html = re.sub('<br>|<p>', '\\n', html)
return html

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# V2 specific functions
Expand Down

0 comments on commit 862b77e

Please sign in to comment.