From 3f26da3ca604d9a8b5b0bbe66987d1e73967b516 Mon Sep 17 00:00:00 2001
From: Andrew Marwood <amarwood@aap.com.au>
Date: Thu, 10 Aug 2023 15:41:57 +1000
Subject: [PATCH] SDAN-722 Remove embeds from monitoring email (#1176)

---
 newsroom/monitoring/utils.py | 28 ++++++++++++++++++++++++++++
 tests/test_monitoring.py     | 11 ++++++++++-
 2 files changed, 38 insertions(+), 1 deletion(-)
diff --git a/newsroom/monitoring/utils.py b/newsroom/monitoring/utils.py
index 55f2594e..9d7799b9 100644
--- a/newsroom/monitoring/utils.py
+++ b/newsroom/monitoring/utils.py
@@ -1,4 +1,6 @@
 from flask import current_app as app
+from lxml import html as lxml_html
+import re
 import collections
 from superdesk.text_utils import get_text
 from newsroom.utils import get_items_by_id
@@ -45,6 +47,7 @@ def truncate_article_body(items, monitoring_profile, full_text=False):
     # To make sure PDF creator and RTF creator does truncate for linked_text settings
     # Manually truncate it
     for i in items:
+        remove_all_embeds(i)
         i['body_str'] = get_text(i.get('body_html', ''), content='html', lf_on_block=True)
         if monitoring_profile['alert_type'] == 'linked_text':
             if not full_text and len(i['body_str']) > 160:
@@ -66,3 +69,28 @@ def get_items_for_monitoring_report(_ids, monitoring_profile, full_text=False):
     items = get_items_by_id(_ids, 'items')
     truncate_article_body(items, monitoring_profile, full_text)
     return items
+
+
+def remove_all_embeds(item):
+    """
+    Remove every editor embed (image, video or audio) from the body of the article, in place.
+    An embed runs from an ``EMBED START`` comment to the next sibling ``EMBED END`` comment;
+    text that follows the end marker is kept.
+    :param item: dict holding ``body_html``; it is rewritten only when an embed was removed
+    """
+    root_elem = lxml_html.fromstring(item.get('body_html') or '<p></p>')
+    regex = r" EMBED START (?:Image|Video|Audio) {id: \"editor_([0-9]+)"
+    html_updated = False
+    for comment in root_elem.xpath('//comment()'):
+        parent = comment.getparent()  # None when an earlier removal already detached it
+        if parent is not None and re.search(regex, comment.text):
+            comment.tail = None  # text directly after the start marker is dropped with the embed
+            for elem in comment.itersiblings():
+                if isinstance(elem, lxml_html.HtmlComment) and ' EMBED END ' in elem.text:
+                    elem.drop_tree()  # unlike remove(), keeps text trailing the end marker
+                    break
+                parent.remove(elem)
+            comment.drop_tree()
+            html_updated = True
+    if html_updated:
+        item["body_html"] = sd_etree.to_string(root_elem, method="html")
diff --git a/tests/test_monitoring.py b/tests/test_monitoring.py
index 7bcc14d4..8253e121 100644
--- a/tests/test_monitoring.py
+++ b/tests/test_monitoring.py
@@ -835,7 +835,14 @@ def test_send_immediate_email_alerts(client, app):
         'products': [{'code': '12345'}],
         "versioncreated": utcnow(),
         'byline': 'Testy McTestface',
-        'body_html': '<p>line 1 of the article text\nline 2 of the story\nand a bit more.</p>',
+        'body_html': '<p>line 1 of the article text\nline 2 of the story\nand a bit more.</p>'
+                     '<!-- EMBED START Audio {id: "editor_2\"} -->'
+                     '<figure>'
+                     '    <audio controls src="/assets.mp3"></audio>'
+                     '    <figcaption>Assistant Treasurer</figcaption>'
+                     '</figure>'
+                     '<!-- EMBED END Audio {id: \"editor_2\"} -->'
+                     '<p>Something after the embed',
         'source': 'AAAA'
     }])
     w = app.data.find_one('monitoring', None, _id='5db11ec55f627d8aa0b545fb')
@@ -849,6 +856,8 @@ def test_send_immediate_email_alerts(client, app):
         assert outbox[0].recipients == ['foo_user@bar.com', 'foo_user2@bar.com']
         assert outbox[0].sender == 'newsroom@localhost'
         assert outbox[0].subject == 'Monitoring Subject'
+        assert 'Something after the embed' in outbox[0].body
+        assert 'Assistant Treasurer' not in outbox[0].body
         assert 'Newsroom Monitoring: W1' in outbox[0].body