Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: update html cleaning rules #444

Merged
merged 5 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 35 additions & 1 deletion openedx/core/djangolib/markup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import markupsafe
import bleach
import re
from lxml.html.clean import Cleaner
from mako.filters import decode

Expand All @@ -14,6 +15,39 @@
Text = markupsafe.escape # pylint: disable=invalid-name


class HTMLCleaner(Cleaner):
"""
HTMLCleaner extends lxml.html.clean.Cleaner to sanitize HTML content while preserving
valid URLs and removing unsafe JavaScript links.
"""
def _remove_javascript_link(self, link: str):
"""
Overrides the parent class's method to preserve valid URLs.

This method uses a regular expression to identify valid URLs that start with 'http', 'https',
'ftp', or 'file' schemes. If the link is a valid URL, it is returned unchanged. Otherwise,
the parent class's method is used to remove the JavaScript link.

Args:
link (str): The hyperlink (href attribute value) to be checked and potentially sanitized.

Returns:
str: The original link or empty string.

Examples:
Valid URLs:
'https://www.example.com/javascript:something'
'file://localhost/path/to/file'
Invalid URLs:
'javascript:alert("hello")'
'javascript:alert("hello") https://www.example.com/page'
"""
is_url = re.compile(r"^(?:https?|ftp|file)://", re.I).search(link.strip())
if is_url:
return link
return super()._remove_javascript_link(link)


def HTML(html): # pylint: disable=invalid-name
"""
Mark a string as already HTML, so that it won't be escaped before output.
Expand Down Expand Up @@ -70,6 +104,6 @@ def clean_dangerous_html(html):
"""
if not html:
return html
cleaner = Cleaner(style=True, inline_style=False, safe_attrs_only=False)
cleaner = HTMLCleaner(style=True, inline_style=False, safe_attrs_only=False)
html = cleaner.clean_html(html)
return HTML(html)
35 changes: 34 additions & 1 deletion openedx/core/djangolib/tests/test_markup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from django.utils.translation import ngettext
from mako.template import Template

from openedx.core.djangolib.markup import HTML, Text, strip_all_tags_but_br
from openedx.core.djangolib.markup import HTML, HTMLCleaner, Text, strip_all_tags_but_br


@ddt.ddt
Expand Down Expand Up @@ -157,3 +157,36 @@ def test_clean_dengers_html_filter(self):
assert not html_soup.find('form')
assert not html_soup.find('blink')
assert not html_soup.find('object')

@ddt.ddt
class TestHTMLCleaner(unittest.TestCase):
"""
Tests that Url links are being cleaned properly and no useful link is removed.
"""

def setUp(self):
self.cleaner = HTMLCleaner()

@ddt.data(
"https://example.com/data:something", # Javascript cannot be executed this way so these urls are safe.
"http://example.com/path/to/page",
"ftp://ftp.example.com/resource",
"file://localhost/path/to/file",
)
def test_valid_urls(self, url):
"""
Test that valid URLs are preserved.
"""
cleaned_url = self.cleaner._remove_javascript_link(url)
self.assertEqual(cleaned_url, url)

@ddt.data(
"javascript:alert('Hello')",
"mocha:some_code https://example.com", # Javascript can be executed this way so this code should be removed.
)
def test_javascript_is_removed(self, url):
"""
Test that malicious Javascript is removed.
"""
cleaned_url = self.cleaner._remove_javascript_link(url)
self.assertEqual(cleaned_url, '')
1 change: 0 additions & 1 deletion openedx/features/wikimedia_features/messenger/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ class MessengerConfig(AppConfig):
PluginSettings.CONFIG: {
ProjectType.LMS: {
SettingsType.COMMON: {PluginSettings.RELATIVE_PATH: 'settings.common'},
SettingsType.TEST: {PluginSettings.RELATIVE_PATH: 'settings.test'},
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why we need to remove this?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This module does not exist.

}
}
}
Expand Down