Skip to content

Commit

Permalink
fix: update html cleaning rules (#444)
Browse files Browse the repository at this point in the history
* fix: update html cleaning rules

* chore: add docstrings and code cleaning

* chore: fixes test cases and adds few new tests

* chore: simplify test cases and refactor docstrings
  • Loading branch information
ahmed-arb authored Jul 10, 2024
1 parent 43c165a commit 0633e78
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 3 deletions.
36 changes: 35 additions & 1 deletion openedx/core/djangolib/markup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import markupsafe
import bleach
import re
from lxml.html.clean import Cleaner
from mako.filters import decode

Expand All @@ -14,6 +15,39 @@
Text = markupsafe.escape # pylint: disable=invalid-name


class HTMLCleaner(Cleaner):
"""
HTMLCleaner extends lxml.html.clean.Cleaner to sanitize HTML content while preserving
valid URLs and removing unsafe JavaScript links.
"""
def _remove_javascript_link(self, link: str):
"""
Overrides the parent class's method to preserve valid URLs.
This method uses a regular expression to identify valid URLs that start with 'http', 'https',
'ftp', or 'file' schemes. If the link is a valid URL, it is returned unchanged. Otherwise,
the parent class's method is used to remove the JavaScript link.
Args:
link (str): The hyperlink (href attribute value) to be checked and potentially sanitized.
Returns:
str: The original link or empty string.
Examples:
Valid URLs:
'https://www.example.com/javascript:something'
'file://localhost/path/to/file'
Invalid URLs:
'javascript:alert("hello")'
'javascript:alert("hello") https://www.example.com/page'
"""
is_url = re.compile(r"^(?:https?|ftp|file)://", re.I).search(link.strip())
if is_url:
return link
return super()._remove_javascript_link(link)


def HTML(html): # pylint: disable=invalid-name
"""
Mark a string as already HTML, so that it won't be escaped before output.
Expand Down Expand Up @@ -70,6 +104,6 @@ def clean_dangerous_html(html):
"""
if not html:
return html
cleaner = Cleaner(style=True, inline_style=False, safe_attrs_only=False)
cleaner = HTMLCleaner(style=True, inline_style=False, safe_attrs_only=False)
html = cleaner.clean_html(html)
return HTML(html)
35 changes: 34 additions & 1 deletion openedx/core/djangolib/tests/test_markup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from django.utils.translation import ngettext
from mako.template import Template

from openedx.core.djangolib.markup import HTML, Text, strip_all_tags_but_br
from openedx.core.djangolib.markup import HTML, HTMLCleaner, Text, strip_all_tags_but_br


@ddt.ddt
Expand Down Expand Up @@ -157,3 +157,36 @@ def test_clean_dengers_html_filter(self):
assert not html_soup.find('form')
assert not html_soup.find('blink')
assert not html_soup.find('object')

@ddt.ddt
class TestHTMLCleaner(unittest.TestCase):
"""
Tests that Url links are being cleaned properly and no useful link is removed.
"""

def setUp(self):
self.cleaner = HTMLCleaner()

@ddt.data(
"https://example.com/data:something", # Javascript cannot be executed this way so these urls are safe.
"http://example.com/path/to/page",
"ftp://ftp.example.com/resource",
"file://localhost/path/to/file",
)
def test_valid_urls(self, url):
"""
Test that valid URLs are preserved.
"""
cleaned_url = self.cleaner._remove_javascript_link(url)
self.assertEqual(cleaned_url, url)

@ddt.data(
"javascript:alert('Hello')",
"mocha:some_code https://example.com", # Javascript can be executed this way so this code should be removed.
)
def test_javascript_is_removed(self, url):
"""
Test that malicious Javascript is removed.
"""
cleaned_url = self.cleaner._remove_javascript_link(url)
self.assertEqual(cleaned_url, '')
1 change: 0 additions & 1 deletion openedx/features/wikimedia_features/messenger/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ class MessengerConfig(AppConfig):
PluginSettings.CONFIG: {
ProjectType.LMS: {
SettingsType.COMMON: {PluginSettings.RELATIVE_PATH: 'settings.common'},
SettingsType.TEST: {PluginSettings.RELATIVE_PATH: 'settings.test'},
}
}
}
Expand Down

0 comments on commit 0633e78

Please sign in to comment.