From 862b77e40900792ae93a22dcedab2f7d1289d643 Mon Sep 17 00:00:00 2001
From: Simon <simon.meggle@elabit.de>
Date: Thu, 4 Nov 2021 09:36:56 +0100
Subject: [PATCH 1/3] Fixed check crashing when RF merged results contained
 HTML, refactored HTML stripping (closes #176)

---
 CHANGELOG.md         |  6 ++++
 checks/v1/robotmk    | 78 ++++++++++----------------------------------
 checks/v2/robotmk.py | 75 +++++++++---------------------------------
 3 files changed, 40 insertions(+), 119 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 76a5efc9..c3b4c74c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## Unreleased
+
+### Fixed
+
+* Check: Crashed when Robot Framework merged results contained HTML; refactored HTML stripping (#176)
+
 ## 1.2.1.2 - 2021-09-30
 
 ### Changed 
diff --git a/checks/v1/robotmk b/checks/v1/robotmk
index 197d239b..8608c962 100644
--- a/checks/v1/robotmk
+++ b/checks/v1/robotmk
@@ -480,13 +480,19 @@ class RobotItem(object):
 
     @property
     def text(self): 
+        text = self._text
+
+        # Remove nasty Playwright log header lines and log messages
+        text = re.sub('Note: use DEBUG=pw:api environment variable to capture Playwright logs.', '', text)
+        text = re.sub('={60}|={27} logs ={27}', '', text)
+        
         # Return back plain text if the text has a HTML prefix.
         # This is the necessary for test messages created by rebot after merging test results. 
-        # Change when this https://github.com/robotframework/robotframework/issues/4068 has been solved. 
-        if self._text.startswith('*HTML* '): 
-            return html_to_text(self._text).replace('*HTML* ', '')
+        # Change when this https://github.com/robotframework/robotframework/issues/4068 has been solved.         
+        if text.startswith('*HTML* '): 
+            return html_to_text(text)
         else: 
-            return self._text
+            return text
 
     @text.setter
     def text(self, text):
@@ -1228,66 +1234,18 @@ def roundup(number, ndigits=0, return_type=None):
     return return_type(result)
 
 
-"""
-HTML <-> text conversions.
-http://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
-"""
-
 def html_to_text(html):
-
-    class _HTMLToText(HTMLParser):
-        def __init__(self):
-            HTMLParser.__init__(self)
-            self._buf = []
-            self.hide_output = False
-
-        def handle_starttag(self, tag, attrs):
-            if tag in ('p', 'br') and not self.hide_output:
-                self._buf.append('\n')
-            elif tag in ('script', 'style'):
-                self.hide_output = True
-
-        def handle_startendtag(self, tag, attrs):
-            if tag == 'br':
-                self._buf.append('\n')
-
-        def handle_endtag(self, tag):
-            if tag == 'p':
-                self._buf.append('\n')
-            elif tag in ('script', 'style'):
-                self.hide_output = False
-
-        def handle_data(self, text):
-            if text and not self.hide_output:
-                self._buf.append(re.sub(r'\s+', ' ', text))
-
-        def handle_entityref(self, name):
-            if name in name2codepoint and not self.hide_output:
-                c = chr(name2codepoint[name])
-                self._buf.append(c)
-
-        def handle_charref(self, name):
-            if not self.hide_output:
-                n = int(name[1:], 16) if name.startswith('x') else int(name)
-                self._buf.append(chr(n))
-
-        def get_text(self):
-            return re.sub(r' +', ' ', ''.join(self._buf))
-                
     """
     Given a piece of HTML, return the plain text it contains.
-    This handles entities and char refs, but not javascript and stylesheets.
     """
-    try:
-        from html.parser import HTMLParser
-        from html.entities import name2codepoint
-        parser = _HTMLToText()
-        parser.feed(html)
-        parser.close()
-        return parser.get_text()
-    except:  
-        # on Checkmk1 there is no HTML.Parser; :-/ 
-        return html
+    # Strip Robot Framework's "*HTML* " marker (raw string: "\*" is not a valid str escape)
+    html = re.sub(r'\*HTML\* ', '', html)
+    html = re.sub('<span class="merge">Test has been re-executed and results merged.</span>', 'Test has been re-executed and the results were merged: ', html)
+    # Drop opening AND closing script/style/span/a tags plus <hr>; only the tags go, enclosed text stays
+    html = re.sub(r'<script.*?>|</script>|<style.*?>|</style>|<span.*?>|</span>|<a.*?>|</a>|<hr>', '', html)
+    html = re.sub('<br>|<p>', '\\n', html)
+    return html
+
         
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # V1 Specific functions
diff --git a/checks/v2/robotmk.py b/checks/v2/robotmk.py
index 17209b5e..b26531c8 100644
--- a/checks/v2/robotmk.py
+++ b/checks/v2/robotmk.py
@@ -489,13 +489,19 @@ def __init__(self, xmlnode, lv_abs, lv_rel, parent, index=None):
 
     @property
     def text(self): 
+        text = self._text
+
+        # Remove nasty Playwright log header lines and log messages
+        text = re.sub('Note: use DEBUG=pw:api environment variable to capture Playwright logs.', '', text)
+        text = re.sub('={60}|={27} logs ={27}', '', text)
+        
         # Return back plain text if the text has a HTML prefix.
         # This is the necessary for test messages created by rebot after merging test results. 
         # Change when this https://github.com/robotframework/robotframework/issues/4068 has been solved.         
-        if self._text.startswith('*HTML* '): 
-            return html_to_text(self._text).replace('*HTML* ', '')
+        if text.startswith('*HTML* '): 
+            return html_to_text(text)
         else: 
-            return self._text
+            return text
 
     @text.setter
     def text(self, text):
@@ -1243,66 +1249,17 @@ def roundup(number, ndigits=0, return_type=None):
     return return_type(result)
 
 
-"""
-HTML <-> text conversions.
-http://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
-"""
-
 def html_to_text(html):
-
-    class _HTMLToText(HTMLParser):
-        def __init__(self):
-            HTMLParser.__init__(self)
-            self._buf = []
-            self.hide_output = False
-
-        def handle_starttag(self, tag, attrs):
-            if tag in ('p', 'br') and not self.hide_output:
-                self._buf.append('\n')
-            elif tag in ('script', 'style'):
-                self.hide_output = True
-
-        def handle_startendtag(self, tag, attrs):
-            if tag == 'br':
-                self._buf.append('\n')
-
-        def handle_endtag(self, tag):
-            if tag == 'p':
-                self._buf.append('\n')
-            elif tag in ('script', 'style'):
-                self.hide_output = False
-
-        def handle_data(self, text):
-            if text and not self.hide_output:
-                self._buf.append(re.sub(r'\s+', ' ', text))
-
-        def handle_entityref(self, name):
-            if name in name2codepoint and not self.hide_output:
-                c = chr(name2codepoint[name])
-                self._buf.append(c)
-
-        def handle_charref(self, name):
-            if not self.hide_output:
-                n = int(name[1:], 16) if name.startswith('x') else int(name)
-                self._buf.append(chr(n))
-
-        def get_text(self):
-            return re.sub(r' +', ' ', ''.join(self._buf))
-                
     """
     Given a piece of HTML, return the plain text it contains.
-    This handles entities and char refs, but not javascript and stylesheets.
     """
-    try:
-        from html.parser import HTMLParser
-        from html.entities import name2codepoint
-        parser = _HTMLToText()
-        parser.feed(html)
-        parser.close()
-        return parser.get_text()
-    except:  
-        # on Checkmk1 there is no HTML.Parser; :-/ 
-        return html
+    # Strip Robot Framework's "*HTML* " marker (raw string: "\*" is not a valid str escape)
+    html = re.sub(r'\*HTML\* ', '', html)
+    html = re.sub('<span class="merge">Test has been re-executed and results merged.</span>', 'Test has been re-executed and the results were merged: ', html)
+    # Drop opening AND closing script/style/span/a tags plus <hr>; only the tags go, enclosed text stays
+    html = re.sub(r'<script.*?>|</script>|<style.*?>|</style>|<span.*?>|</span>|<a.*?>|</a>|<hr>', '', html)
+    html = re.sub('<br>|<p>', '\\n', html)
+    return html
 
 # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # V2 specific functions

From 409a49f1d0f52748c35847e058b69cad2b794e46 Mon Sep 17 00:00:00 2001
From: Simon <simon.meggle@elabit.de>
Date: Thu, 4 Nov 2021 09:41:25 +0100
Subject: [PATCH 2/3] CHANGELOG: v1.2.2

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c3b4c74c..2037048a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## Unreleased
+## 1.2.2 - 2021-11-04
 
 ### Fixed
 

From 4e29ebab2da40ebfb79e86f73377837f3125edd7 Mon Sep 17 00:00:00 2001
From: Simon <simon.meggle@elabit.de>
Date: Thu, 4 Nov 2021 09:41:27 +0100
Subject: [PATCH 3/3] Version bump: v1.2.2

---
 agents_plugins/robotmk.py | 2 +-
 bakery/v1/robotmk.py      | 2 +-
 bakery/v2/robotmk.py      | 2 +-
 checks/v1/robotmk         | 2 +-
 checks/v2/robotmk.py      | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/agents_plugins/robotmk.py b/agents_plugins/robotmk.py
index 0c406e29..ffab3e2c 100755
--- a/agents_plugins/robotmk.py
+++ b/agents_plugins/robotmk.py
@@ -49,7 +49,7 @@
 
 local_tz = datetime.utcnow().astimezone().tzinfo
 
-ROBOTMK_VERSION = 'v1.2.1.2'
+ROBOTMK_VERSION = 'v1.2.2'
 
 class RMKConfig():
     _PRESERVED_WORDS = [
diff --git a/bakery/v1/robotmk.py b/bakery/v1/robotmk.py
index ed9473c0..1c5ddb9c 100644
--- a/bakery/v1/robotmk.py
+++ b/bakery/v1/robotmk.py
@@ -18,7 +18,7 @@
 # to the Free Software Foundation, Inc., 51 Franklin St,  Fifth Floor,
 # Boston, MA 02110-1301 USA.
 
-ROBOTMK_VERSION = 'v1.2.1.2'
+ROBOTMK_VERSION = 'v1.2.2'
 
 import cmk.utils.paths
 import os
diff --git a/bakery/v2/robotmk.py b/bakery/v2/robotmk.py
index 62b35e31..07ff4d3f 100644
--- a/bakery/v2/robotmk.py
+++ b/bakery/v2/robotmk.py
@@ -18,7 +18,7 @@
 # to the Free Software Foundation, Inc., 51 Franklin St,  Fifth Floor,
 # Boston, MA 02110-1301 USA.
 
-ROBOTMK_VERSION = 'v1.2.1.2'
+ROBOTMK_VERSION = 'v1.2.2'
 
 from typing import Iterable, TypedDict, List
 from pathlib import Path
diff --git a/checks/v1/robotmk b/checks/v1/robotmk
index 8608c962..9d8a11f5 100644
--- a/checks/v1/robotmk
+++ b/checks/v1/robotmk
@@ -34,7 +34,7 @@ from collections import namedtuple
 iam = "robotmk"
 # DO NOT DELETE
 inventory_robotmk_rules = []
-ROBOTMK_VERSION = 'v1.2.1.2'
+ROBOTMK_VERSION = 'v1.2.2'
 DEFAULT_SVC_PREFIX = 'Robot Framework E2E $SUITEID$SPACE-$SPACE'
 HTML_LOG_DIR = "%s/%s" % (os.environ['OMD_ROOT'], 'local/share/addons/robotmk')
 
diff --git a/checks/v2/robotmk.py b/checks/v2/robotmk.py
index b26531c8..08f4ea6a 100644
--- a/checks/v2/robotmk.py
+++ b/checks/v2/robotmk.py
@@ -35,7 +35,7 @@
 from cmk.base.plugins.agent_based.agent_based_api.v1 import *
 from cmk.utils.exceptions import MKGeneralException
 
-ROBOTMK_VERSION = 'v1.2.1.2'
+ROBOTMK_VERSION = 'v1.2.2'
 DEFAULT_SVC_PREFIX = 'Robot Framework E2E $SUITEID$SPACE-$SPACE'
 HTML_LOG_DIR = "%s/%s" % (os.environ['OMD_ROOT'], 'local/share/addons/robotmk')