From 211f13ae1b576e89f5cb24e4ffc9d94a1a9cda3d Mon Sep 17 00:00:00 2001 From: Irtaza Akram Date: Wed, 10 Jan 2024 17:16:46 +0500 Subject: [PATCH] feat: nh3 replace bleach --- .github/workflows/unit-tests.yml | 1 + lms/djangoapps/certificates/views/support.py | 4 ++-- lms/djangoapps/courseware/views/views.py | 4 ++-- .../discussion/django_comment_client/utils.py | 2 +- lms/djangoapps/discussion/rest_api/render.py | 16 ++++++------- lms/templates/courseware/progress_graph.js | 4 ++-- lms/templates/lti.html | 2 +- openedx/core/djangoapps/debug/views.py | 4 ++-- .../djangoapps/user_authn/views/logout.py | 4 ++-- .../user_authn/views/tests/test_logout.py | 4 ++-- openedx/core/djangolib/markup.py | 4 ++-- requirements/edx/base.txt | 2 ++ requirements/edx/development.txt | 4 ++++ requirements/edx/doc.txt | 2 ++ requirements/edx/kernel.in | 1 + requirements/edx/testing.txt | 2 ++ xmodule/capa/inputtypes.py | 4 ++-- xmodule/capa/tests/test_util.py | 4 ++-- xmodule/capa/util.py | 24 +++++++++---------- xmodule/capa_block.py | 4 ++-- xmodule/library_content_block.py | 4 ++-- xmodule/lti_block.py | 4 ++-- 22 files changed, 57 insertions(+), 47 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index c3b1086c20eb..aa59f1a69d74 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -12,6 +12,7 @@ jobs: if: (github.repository == 'edx/edx-platform-private') || (github.repository == 'openedx/edx-platform' && (startsWith(github.base_ref, 'open-release') == false)) runs-on: [ edx-platform-runner ] strategy: + fail-fast: false matrix: python-version: - "3.8" diff --git a/lms/djangoapps/certificates/views/support.py b/lms/djangoapps/certificates/views/support.py index 7aadf3091952..12c3c8157eda 100644 --- a/lms/djangoapps/certificates/views/support.py +++ b/lms/djangoapps/certificates/views/support.py @@ -10,7 +10,7 @@ import urllib from functools import wraps -import bleach +import nh3 from django.db import transaction from django.db.models import Q from django.http import HttpResponse, HttpResponseBadRequest, HttpResponseForbidden, HttpResponseServerError @@ -89,7 +89,7 @@ def search_certificates(request): """ unbleached_filter = urllib.parse.unquote(urllib.parse.quote_plus(request.GET.get("user", ""))) - user_filter = bleach.clean(unbleached_filter) + user_filter = nh3.clean(unbleached_filter) if not user_filter: msg = _("user is not given.") return HttpResponseBadRequest(msg) diff --git a/lms/djangoapps/courseware/views/views.py b/lms/djangoapps/courseware/views/views.py index d0e657775bce..cff0136142d4 100644 --- a/lms/djangoapps/courseware/views/views.py +++ b/lms/djangoapps/courseware/views/views.py @@ -10,7 +10,7 @@ from datetime import datetime from urllib.parse import quote_plus, urlencode, urljoin, urlparse, urlunparse -import bleach +import nh3 import requests from django.conf import settings from django.contrib.auth.decorators import login_required @@ -1538,7 +1538,7 @@ def render_xblock(request, usage_key_string, check_if_enrolled=True, disable_sta requested_view = request.GET.get('view', 'student_view') if requested_view != 'student_view' and requested_view != 'public_view': # lint-amnesty, pylint: disable=consider-using-in return HttpResponseBadRequest( - f"Rendering of the xblock view '{bleach.clean(requested_view, strip=True)}' is not supported." + f"Rendering of the xblock view '{nh3.clean(requested_view, strip_comments=True)}' is not supported." ) staff_access = has_access(request.user, 'staff', course_key) diff --git a/lms/djangoapps/discussion/django_comment_client/utils.py b/lms/djangoapps/discussion/django_comment_client/utils.py index 3b5e1af4a039..e26b748270e3 100644 --- a/lms/djangoapps/discussion/django_comment_client/utils.py +++ b/lms/djangoapps/discussion/django_comment_client/utils.py @@ -1038,7 +1038,7 @@ def sanitize_body(body): This is possibly overly broad, and might tamper with legitimate posts that contain this code in fenced code blocks. As far as we can tell, this is an extra layer of protection, and current handling in the front end and using - bleach for HTML rendering on the server side should cover these cases. + nh3 for HTML rendering on the server side should cover these cases. """ if not body: return body diff --git a/lms/djangoapps/discussion/rest_api/render.py b/lms/djangoapps/discussion/rest_api/render.py index 44e2d8b692ce..e998a6bded87 100644 --- a/lms/djangoapps/discussion/rest_api/render.py +++ b/lms/djangoapps/discussion/rest_api/render.py @@ -4,17 +4,17 @@ Note that this module is designed to imitate the front end behavior as implemented in Markdown.Sanitizer.js. """ -import bleach +import nh3 import markdown -ALLOWED_TAGS = bleach.ALLOWED_TAGS | { +ALLOWED_TAGS = nh3.ALLOWED_TAGS | { 'br', 'dd', 'del', 'dl', 'dt', 'h1', 'h2', 'h3', 'h4', 'hr', 'img', 'kbd', 'p', 'pre', 's', 'strike', 'sub', 'sup' } ALLOWED_PROTOCOLS = {"http", "https", "ftp", "mailto"} ALLOWED_ATTRIBUTES = { - "a": ["href", "title", "target", "rel"], - "img": ["src", "alt", "title", "width", "height"], + "a": {"href", "title", "target", "rel"}, + "img": {"src", "alt", "title", "width", "height"}, } @@ -25,17 +25,17 @@ def render_body(raw_body): This includes the following steps: * Convert Markdown to HTML - * Sanitise HTML using bleach + * Sanitise HTML using nh3 Note that this does not prevent Markdown syntax inside a MathJax block from being processed, which the forums JavaScript code does. """ rendered_html = markdown.markdown(raw_body) - sanitised_html = bleach.clean( + sanitised_html = nh3.clean( rendered_html, tags=ALLOWED_TAGS, - protocols=ALLOWED_PROTOCOLS, - strip=True, + # protocols=ALLOWED_PROTOCOLS, + strip_comments=True, attributes=ALLOWED_ATTRIBUTES ) return sanitised_html diff --git a/lms/templates/courseware/progress_graph.js b/lms/templates/courseware/progress_graph.js index 5a1d64c36ee6..c3117b6e92ba 100644 --- a/lms/templates/courseware/progress_graph.js +++ b/lms/templates/courseware/progress_graph.js @@ -1,6 +1,6 @@ <%page args="grade_summary, grade_cutoffs, graph_div_id, show_grade_breakdown = True, show_grade_cutoffs = True, **kwargs"/> <%! - import bleach + import nh3 import json import math import six @@ -74,7 +74,7 @@ $(function () { ## allowing the display of such images, and remove any previously stored HTML ## to prevent ugly HTML from being shown to learners. ## xss-lint: disable=javascript-jquery-append - ticks.append( [tickIndex, bleach.clean(section['label'], tags=set(), strip=True)] ) + ticks.append( [tickIndex, nh3.clean(section['label'], tags=set(), strip_comments=True)] ) if section['category'] in detail_tooltips: ## xss-lint: disable=javascript-jquery-append diff --git a/lms/templates/lti.html b/lms/templates/lti.html index 2f89182597b1..05346ec3dc40 100644 --- a/lms/templates/lti.html +++ b/lms/templates/lti.html @@ -61,7 +61,7 @@

% if has_score and comment:

${_("Feedback on your work from the grader:")}

- ## sanitized with bleach in view + ## sanitized with nh3 in view ${comment | n, decode.utf8}
% endif diff --git a/openedx/core/djangoapps/debug/views.py b/openedx/core/djangoapps/debug/views.py index 6a3bcc40f280..9b64763015e3 100644 --- a/openedx/core/djangoapps/debug/views.py +++ b/openedx/core/djangoapps/debug/views.py @@ -5,7 +5,7 @@ """ -import bleach +import nh3 from django.http import HttpResponseNotFound from django.template import TemplateDoesNotExist from django.utils.translation import gettext as _ @@ -54,4 +54,4 @@ def show_reference_template(request, template): return render_to_response(template, context) except TemplateDoesNotExist: - return HttpResponseNotFound(f'Missing template {bleach.clean(template, strip=True)}') + return HttpResponseNotFound(f'Missing template {nh3.clean(template, strip_comments=True)}') diff --git a/openedx/core/djangoapps/user_authn/views/logout.py b/openedx/core/djangoapps/user_authn/views/logout.py index 13301f5e3bb1..616b792b9f22 100644 --- a/openedx/core/djangoapps/user_authn/views/logout.py +++ b/openedx/core/djangoapps/user_authn/views/logout.py @@ -5,7 +5,7 @@ import urllib.parse as parse # pylint: disable=import-error from urllib.parse import parse_qs, urlsplit, urlunsplit # pylint: disable=import-error -import bleach +import nh3 from django.conf import settings from django.contrib.auth import logout from django.shortcuts import redirect @@ -60,7 +60,7 @@ def target(self): # >> /courses/course-v1:ARTS+D1+2018_T/course/ # to handle this scenario we need to encode our URL using quote_plus and then unquote it again. if target_url: - target_url = bleach.clean(parse.unquote(parse.quote_plus(target_url))) + target_url = nh3.clean(parse.unquote(parse.quote_plus(target_url))) use_target_url = target_url and is_safe_login_or_logout_redirect( redirect_to=target_url, diff --git a/openedx/core/djangoapps/user_authn/views/tests/test_logout.py b/openedx/core/djangoapps/user_authn/views/tests/test_logout.py index 7d10fe1021ef..c59969c2d00d 100644 --- a/openedx/core/djangoapps/user_authn/views/tests/test_logout.py +++ b/openedx/core/djangoapps/user_authn/views/tests/test_logout.py @@ -5,7 +5,7 @@ import urllib from unittest import mock import ddt -import bleach +import nh3 from django.conf import settings from django.test import TestCase from django.test.utils import override_settings @@ -237,6 +237,6 @@ def test_logout_redirect_failure_with_xss_vulnerability(self, redirect_url, host ) response = self.client.get(url, HTTP_HOST=host) expected = { - 'target': bleach.clean(urllib.parse.unquote(redirect_url)), + 'target': nh3.clean(urllib.parse.unquote(redirect_url)), } self.assertDictContainsSubset(expected, response.context_data) diff --git a/openedx/core/djangolib/markup.py b/openedx/core/djangolib/markup.py index 3009f1d53f34..2b22a3035044 100644 --- a/openedx/core/djangolib/markup.py +++ b/openedx/core/djangolib/markup.py @@ -4,7 +4,7 @@ import markupsafe -import bleach +import nh3 from lxml.html.clean import Cleaner from mako.filters import decode @@ -53,7 +53,7 @@ def strip_all_tags_but_br(string_to_strip): string_to_strip = "" string_to_strip = decode.utf8(string_to_strip) - string_to_strip = bleach.clean(string_to_strip, tags={'br'}, strip=True) + string_to_strip = nh3.clean(string_to_strip, tags={'br'}, strip_comments=True) return HTML(string_to_strip) diff --git a/requirements/edx/base.txt b/requirements/edx/base.txt index 4b2900bc6aba..479abbef2853 100644 --- a/requirements/edx/base.txt +++ b/requirements/edx/base.txt @@ -733,6 +733,8 @@ newrelic==9.3.0 # via # -r requirements/edx/bundled.in # edx-django-utils +nh3==0.2.15 + # via -r requirements/edx/kernel.in nltk==3.8.1 # via chem nodeenv==1.8.0 diff --git a/requirements/edx/development.txt b/requirements/edx/development.txt index 19d3f96f8770..99f64f0afbb4 100644 --- a/requirements/edx/development.txt +++ b/requirements/edx/development.txt @@ -1238,6 +1238,10 @@ newrelic==9.3.0 # -r requirements/edx/doc.txt # -r requirements/edx/testing.txt # edx-django-utils +nh3==0.2.15 + # via + # -r requirements/edx/doc.txt + # -r requirements/edx/testing.txt nltk==3.8.1 # via # -r requirements/edx/doc.txt diff --git a/requirements/edx/doc.txt b/requirements/edx/doc.txt index dd9fbd2bb637..fe57c211c0d8 100644 --- a/requirements/edx/doc.txt +++ b/requirements/edx/doc.txt @@ -870,6 +870,8 @@ newrelic==9.3.0 # via # -r requirements/edx/base.txt # edx-django-utils +nh3==0.2.15 + # via -r requirements/edx/base.txt nltk==3.8.1 # via # -r requirements/edx/base.txt diff --git a/requirements/edx/kernel.in b/requirements/edx/kernel.in index 744f1bd632ce..e360f607e89e 100644 --- a/requirements/edx/kernel.in +++ b/requirements/edx/kernel.in @@ -109,6 +109,7 @@ mako # Primary template language used for server- Markdown # Convert text markup to HTML; used in capa problems, forums, and course wikis mongoengine # Object-document mapper for MongoDB, used in the LMS dashboard mysqlclient # Driver for the default production relational database +nh3 nodeenv # Utility for managing Node.js environments; we use this for deployments and testing oauthlib # OAuth specification support for authenticating via LTI or other Open edX services olxcleaner diff --git a/requirements/edx/testing.txt b/requirements/edx/testing.txt index e5b9c70a178a..bdfd8f58e378 100644 --- a/requirements/edx/testing.txt +++ b/requirements/edx/testing.txt @@ -930,6 +930,8 @@ newrelic==9.3.0 # via # -r requirements/edx/base.txt # edx-django-utils +nh3==0.2.15 + # via -r requirements/edx/base.txt nltk==3.8.1 # via # -r requirements/edx/base.txt diff --git a/xmodule/capa/inputtypes.py b/xmodule/capa/inputtypes.py index 5decad4a2ca8..8dff57768688 100644 --- a/xmodule/capa/inputtypes.py +++ b/xmodule/capa/inputtypes.py @@ -47,7 +47,7 @@ import time from datetime import datetime -import bleach +import nh3 import html5lib import pyparsing import six @@ -800,7 +800,7 @@ def setup_code_response_rendering(self): if self.status == 'incomplete': self.status = 'queued' self.queue_len = self.msg # lint-amnesty, pylint: disable=attribute-defined-outside-init - self.msg = bleach.clean(self.submitted_msg) + self.msg = nh3.clean(self.submitted_msg) def setup(self): """ setup this input type """ diff --git a/xmodule/capa/tests/test_util.py b/xmodule/capa/tests/test_util.py index 38f488a2befd..2fb24ccd3682 100644 --- a/xmodule/capa/tests/test_util.py +++ b/xmodule/capa/tests/test_util.py @@ -121,7 +121,7 @@ def test_compare_with_tolerance(self): # lint-amnesty, pylint: disable=too-many def test_sanitize_html(self): """ - Test for html sanitization with bleach. + Test for html sanitization with nh3. """ allowed_tags = ['div', 'p', 'audio', 'pre', 'span'] for tag in allowed_tags: @@ -142,7 +142,7 @@ def test_get_inner_html_from_xpath(self): def test_remove_markup(self): """ - Test for markup removal with bleach. + Test for markup removal with nh3. """ assert remove_markup('The Truth is Out There & you need to find it') ==\ 'The Truth is Out There & you need to find it' diff --git a/xmodule/capa/util.py b/xmodule/capa/util.py index 5af2a0dd6e99..418abcf5ce81 100644 --- a/xmodule/capa/util.py +++ b/xmodule/capa/util.py @@ -8,11 +8,10 @@ from cmath import isinf, isnan from decimal import Decimal -import bleach +import nh3 from calc import evaluator from lxml import etree -from bleach.css_sanitizer import CSSSanitizer from openedx.core.djangolib.markup import HTML #----------------------------------------------------------------------------- @@ -182,17 +181,16 @@ def sanitize_html(html_code): Used to sanitize XQueue responses from Matlab. """ - attributes = bleach.ALLOWED_ATTRIBUTES.copy() + attributes = nh3.ALLOWED_ATTRIBUTES.copy() attributes.update({ - '*': ['class', 'style', 'id'], - 'audio': ['controls', 'autobuffer', 'autoplay', 'src'], - 'img': ['src', 'width', 'height', 'class'] + '*': {'class', 'style', 'id'}, + 'audio': {'controls', 'autobuffer', 'autoplay', 'src'}, + 'img': {'src', 'width', 'height', 'class'} }) - output = bleach.clean( + output = nh3.clean( html_code, - protocols=bleach.ALLOWED_PROTOCOLS | {'data'}, - tags=bleach.ALLOWED_TAGS | {'div', 'p', 'audio', 'pre', 'img', 'span'}, - css_sanitizer=CSSSanitizer(allowed_css_properties=["white-space"]), + # protocols=nh3.ALLOWED_PROTOCOLS | {'data'}, + tags=nh3.ALLOWED_TAGS | {'div', 'p', 'audio', 'pre', 'img', 'span'}, attributes=attributes ) return output @@ -215,12 +213,12 @@ def remove_markup(html): """ Return html with markup stripped and text HTML-escaped. - >>> bleach.clean("Rock & Roll", tags=set(), strip=True) + >>> nh3.clean("Rock & Roll", tags=set(), strip_comments=True) 'Rock & Roll' - >>> bleach.clean("Rock & Roll", tags=set(), strip=True) + >>> nh3.clean("Rock & Roll", tags=set(), strip_comments=True) 'Rock & Roll' """ - return HTML(bleach.clean(html, tags=set(), strip=True)) + return HTML(nh3.clean(html, tags=set(), strip_comments=True)) def get_course_id_from_capa_block(capa_block): diff --git a/xmodule/capa_block.py b/xmodule/capa_block.py index 7b58b5aa9ab8..0108a86a8e91 100644 --- a/xmodule/capa_block.py +++ b/xmodule/capa_block.py @@ -13,7 +13,7 @@ import sys import traceback -from bleach.sanitizer import Cleaner +import nh3 from django.conf import settings from django.core.exceptions import ImproperlyConfigured from django.utils.encoding import smart_str @@ -557,7 +557,7 @@ def index_dictionary(self): capa_content = re.sub( r"(\s| |//)+", " ", - Cleaner(tags=[], strip=True).clean(capa_content) + nh3.clean(capa_content, tags=set()) ) capa_body = { diff --git a/xmodule/library_content_block.py b/xmodule/library_content_block.py index e4c4af267a1f..bd5dd23dff14 100644 --- a/xmodule/library_content_block.py +++ b/xmodule/library_content_block.py @@ -9,7 +9,7 @@ from copy import copy from gettext import ngettext, gettext -import bleach +import nh3 from django.conf import settings from django.core.exceptions import ObjectDoesNotExist, PermissionDenied from django.utils.functional import classproperty @@ -714,7 +714,7 @@ def source_library_values(self): lib_tools = self.get_tools() user_perms = self.runtime.service(self, 'studio_user_permissions') all_libraries = [ - (key, bleach.clean(name)) for key, name in lib_tools.list_available_libraries() + (key, nh3.clean(name)) for key, name in lib_tools.list_available_libraries() if user_perms.can_read(key) or self.source_library_id == str(key) ] all_libraries.sort(key=lambda entry: entry[1]) # Sort by name diff --git a/xmodule/lti_block.py b/xmodule/lti_block.py index a463da79ef0b..9a5531601ba2 100644 --- a/xmodule/lti_block.py +++ b/xmodule/lti_block.py @@ -63,7 +63,7 @@ from unittest import mock from urllib import parse -import bleach +import nh3 import oauthlib.oauth1 from django.conf import settings from lxml import etree @@ -468,7 +468,7 @@ def get_context(self): # 'acronym': ['title'], # # This lets all plaintext through. - sanitized_comment = bleach.clean(self.score_comment) + sanitized_comment = nh3.clean(self.score_comment) return { 'input_fields': self.get_input_fields(),