From 3f26da3ca604d9a8b5b0bbe66987d1e73967b516 Mon Sep 17 00:00:00 2001 From: Andrew Marwood Date: Thu, 10 Aug 2023 15:41:57 +1000 Subject: [PATCH] SDAN-722 Remove embeds from monitoring email (#1176) --- newsroom/monitoring/utils.py | 28 ++++++++++++++++++++++++++++ tests/test_monitoring.py | 11 ++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/newsroom/monitoring/utils.py b/newsroom/monitoring/utils.py index 55f2594e..9d7799b9 100644 --- a/newsroom/monitoring/utils.py +++ b/newsroom/monitoring/utils.py @@ -1,4 +1,6 @@ from flask import current_app as app +from lxml import html as lxml_html +import re import collections from superdesk.text_utils import get_text from newsroom.utils import get_items_by_id @@ -45,6 +47,7 @@ def truncate_article_body(items, monitoring_profile, full_text=False): # To make sure PDF creator and RTF creator does truncate for linked_text settings # Manually truncate it for i in items: + remove_all_embeds(i) i['body_str'] = get_text(i.get('body_html', ''), content='html', lf_on_block=True) if monitoring_profile['alert_type'] == 'linked_text': if not full_text and len(i['body_str']) > 160: @@ -66,3 +69,28 @@ def get_items_for_monitoring_report(_ids, monitoring_profile, full_text=False): items = get_items_by_id(_ids, 'items') truncate_article_body(items, monitoring_profile, full_text) return items + + +def remove_all_embeds(item): + """ + Remove all the embeds from the body of the article + :param item: + :return: + """ + root_elem = lxml_html.fromstring(item.get('body_html') or '

') + regex = r" EMBED START (?:Image|Video|Audio) {id: \"editor_([0-9]+)" + html_updated = False + comments = root_elem.xpath('//comment()') + for comment in comments: + m = re.search(regex, comment.text) + # if we've found an Embed Start comment + if m and m.group(1): + parent = comment.getparent() + for elem in comment.itersiblings(): + parent.remove(elem) + if elem.text and ' EMBED END ' in elem.text: + break + parent.remove(comment) + html_updated = True + if html_updated: + item["body_html"] = sd_etree.to_string(root_elem, method="html") diff --git a/tests/test_monitoring.py b/tests/test_monitoring.py index 7bcc14d4..8253e121 100644 --- a/tests/test_monitoring.py +++ b/tests/test_monitoring.py @@ -835,7 +835,14 @@ def test_send_immediate_email_alerts(client, app): 'products': [{'code': '12345'}], "versioncreated": utcnow(), 'byline': 'Testy McTestface', - 'body_html': '

line 1 of the article text\nline 2 of the story\nand a bit more.

', + 'body_html': '

line 1 of the article text\nline 2 of the story\nand a bit more.

' + '' + '
' + ' ' + '
Assistant Treasurer
' + '
' + '' + '

Something after the embed', 'source': 'AAAA' }]) w = app.data.find_one('monitoring', None, _id='5db11ec55f627d8aa0b545fb') @@ -849,6 +856,8 @@ def test_send_immediate_email_alerts(client, app): assert outbox[0].recipients == ['foo_user@bar.com', 'foo_user2@bar.com'] assert outbox[0].sender == 'newsroom@localhost' assert outbox[0].subject == 'Monitoring Subject' + assert 'Something after the embed' in outbox[0].body + assert 'Assistant Treasurer' not in outbox[0].body assert 'Newsroom Monitoring: W1' in outbox[0].body