Skip to content

Commit

Permalink
chore: simplify test cases and refactor docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
ahmed-arb committed Jul 5, 2024
1 parent 592c7e3 commit ec0572d
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 46 deletions.
46 changes: 19 additions & 27 deletions openedx/core/djangolib/markup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,43 +17,35 @@

class HTMLCleaner(Cleaner):
"""
HTMLCleaner extends lxml.html.clean.Cleaner to sanitize HTML content while preserving valid URLs
and removing unsafe JavaScript links.
Attributes:
-----------
_is_url : Callable[[str], Optional[re.Match]]
A regular expression pattern used to identify valid URLs. This pattern matches strings that
start with 'http', 'https', 'ftp', or 'file' schemes, case-insensitively.
HTMLCleaner extends lxml.html.clean.Cleaner to sanitize HTML content while preserving
valid URLs and removing unsafe JavaScript links.
"""
def _remove_javascript_link(self, link: str):
"""
Checks if the given link is a valid URL. If it is, the link is returned unchanged.
Otherwise, the method delegates to the parent class's method to remove the JavaScript link.
Overrides the parent class's method to preserve valid URLs.
Parameters:
-----------
link : str
The hyperlink (href attribute value) to be checked and potentially sanitized.
This method uses a regular expression to identify valid URLs that start with 'http', 'https',
'ftp', or 'file' schemes. If the link is a valid URL, it is returned unchanged. Otherwise,
the parent class's method is used to remove the JavaScript link.
Args:
link (str): The hyperlink (href attribute value) to be checked and potentially sanitized.
Returns:
--------
Optional[str]
The original link if it is a valid URL; otherwise, the result of the parent class's method
to handle the link.
Example:
--------
'https://www.example.com/javascript:something' Valid
'javascript:alert("hello")' Invalid
'http://example.com/path/to/page' Valid
'ftp://ftp.example.com/resource' Valid
'file://localhost/path/to/file' Valid
str: The original link or empty string.
Examples:
Valid URLs:
'https://www.example.com/javascript:something'
'file://localhost/path/to/file'
Invalid URLs:
'javascript:alert("hello")'
'javascript:alert("hello") https://www.example.com/page'
"""
is_url = re.compile(r"^(?:https?|ftp|file)://", re.I).search(link.strip())
if is_url:
return link
super()._remove_javascript_link(link)
return super()._remove_javascript_link(link)


def HTML(html): # pylint: disable=invalid-name
Expand Down
36 changes: 17 additions & 19 deletions openedx/core/djangolib/tests/test_markup.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,34 +158,32 @@ def test_clean_dengers_html_filter(self):
assert not html_soup.find('blink')
assert not html_soup.find('object')


@ddt.ddt
class TestHTMLCleaner(unittest.TestCase):
"""
Tests that Url links are being cleaned properly and no useful link is removed.
"""

def setUp(self):
self.cleaner = HTMLCleaner(style=True, inline_style=False, safe_attrs_only=False)

def test_valid_urls(self):
https_url = "https://example.com"
http_url = "http://example.com/path/to/page"
ftp_url = "ftp://ftp.example.com/resource"
file_url = "file://localhost/path/to/file"

cleaned_url = self.cleaner._remove_javascript_link(https_url)
self.assertEqual(cleaned_url, https_url)
self.cleaner = HTMLCleaner()

cleaned_url = self.cleaner._remove_javascript_link(http_url)
self.assertEqual(cleaned_url, http_url)

cleaned_url = self.cleaner._remove_javascript_link(ftp_url)
self.assertEqual(cleaned_url, ftp_url)

cleaned_url = self.cleaner._remove_javascript_link(file_url)
self.assertEqual(cleaned_url, file_url)
@ddt.data(
"https://example.com/",
"http://example.com/path/to/page",
"ftp://ftp.example.com/resource",
"file://localhost/path/to/file",
)
def test_valid_urls(self, url):
"""
Test that valid URLs are preserved.
"""
cleaned_url = self.cleaner._remove_javascript_link(url)
self.assertEqual(cleaned_url, url)

def test_javascript_link(self):
"""
Test that malicious Javascript is removed.
"""
cleaned_url = self.cleaner._remove_javascript_link("javascript:alert('Hello')")
self.assertIsNone(cleaned_url)

Expand Down

0 comments on commit ec0572d

Please sign in to comment.