diff --git a/src/main/java/org/jsoup/safety/Safelist.java b/src/main/java/org/jsoup/safety/Safelist.java index cb5038d06d..c1a48edf26 100644 --- a/src/main/java/org/jsoup/safety/Safelist.java +++ b/src/main/java/org/jsoup/safety/Safelist.java @@ -12,12 +12,8 @@ Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/ import org.jsoup.nodes.Attributes; import org.jsoup.nodes.Element; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Map; -import java.util.Objects; -import java.util.Set; +import java.util.*; +import java.util.regex.Pattern; import static org.jsoup.internal.Normalizer.lowerCase; @@ -67,11 +63,14 @@ XSS attack examples (that jsoup will safegaurd against the default Cleaner and S */ public class Safelist { private static final String All = ":all"; + private static final TagName AllTagName = new TagName(All); + private final Set tagNames; // tags allowed, lower case. e.g. [p, br, span] private final Map> attributes; // tag -> attribute[]. allowed attributes [href] for a tag. private final Map> enforcedAttributes; // always set these attribute values private final Map>> protocols; // allowed URL protocols for attributes private boolean preserveRelativeLinks; // option to preserve relative links + private Map> wildcardAttributes = new LinkedHashMap<>(); /** This safelist allows only text nodes: any HTML Element or any Node other than a TextNode will be removed. @@ -237,6 +236,12 @@ public Safelist(Safelist copy) { protocols.put(protocolsEntry.getKey(), attributeProtocolsCopy); } preserveRelativeLinks = copy.preserveRelativeLinks; + + // create deep-ish copy. (The 'Pattern' is not deep-copied.) + wildcardAttributes = new LinkedHashMap<>(copy.wildcardAttributes.size()); + for (Map.Entry> entry : copy.wildcardAttributes.entrySet()) { + wildcardAttributes.put(entry.getKey(), new LinkedHashMap<>(entry.getValue())); + } } /** @@ -274,6 +279,7 @@ public Safelist removeTags(String... 
tags) { attributes.remove(tagName); enforcedAttributes.remove(tagName); protocols.remove(tagName); + wildcardAttributes.remove(tagName); } } return this; @@ -408,6 +414,93 @@ public Safelist removeEnforcedAttribute(String tag, String attribute) { return this; } + /** + * Add wildcard attributes + *

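+ * Each wildcard is a regular expression as understood by java.util.regex.Pattern and is matched
+ * case-insensitively against the whole attribute name. Wildcards accumulate across calls; adding
+ * the same wildcard string again only replaces its previously compiled pattern.
+ *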

+ *

+ * Examples: + *

    + *
  • data-.+ - HTML 5
  • + *
  • aria-.+ - WAI-ARIA accessibility attributes
  • + *
+ *

+     *
+     * @param tag The tag the attributes are for.
+     * @param wildcards regular expressions (java.util.regex.Pattern syntax) that allowed attribute names must match
+     * @return this Safelist, for chaining.
+     * @throws IllegalArgumentException if a wildcard is null
+     * @throws java.util.regex.PatternSyntaxException if a wildcard is not a valid regular expression
+     */
+    public Safelist addWildcardAttributes(String tag, String... wildcards) {
+        TagName tagName = TagName.valueOf(tag);
+        for (String wildcard : wildcards) {
+            if (wildcard == null) {
+                // the string concatenation below would otherwise silently compile the pattern "^null$"
+                throw new IllegalArgumentException("wildcard must not be null");
+            }
+            // the per-tag map is created lazily, so a call without wildcards leaves no empty entry behind
+            Map<String, Pattern> patterns = wildcardAttributes.get(tagName);
+            if (patterns == null) {
+                patterns = new LinkedHashMap<>();
+                wildcardAttributes.put(tagName, patterns);
+            }
+            // keyed by the wildcard text: re-adding the same wildcard replaces its compiled pattern
+            patterns.put(wildcard, Pattern.compile("^" + wildcard + "$",
+                Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE));
+        }
+
+        return this;
+    }
+
+    /**
+     * Remove wildcard attributes that were previously added for a tag. A wildcard is identified by its
+     * pattern text, so it must be passed exactly as it was given when it was added. Wildcards that are
+     * not registered for the tag are ignored.
+     *
+     * @param tag The tag the attributes are for.
+     * @param wildcards the wildcard patterns to remove, as originally added
+     * @return this Safelist, for chaining.
+     */
+    public Safelist removeWildcardAttributes(String tag, String... wildcards) {
+        TagName tagName = TagName.valueOf(tag);
+        for (String wildcard : wildcards) {
+            Map<String, Pattern> patterns = wildcardAttributes.get(tagName);
+            if (patterns == null) {
+                // nothing is (or is any longer) registered for this tag
+                break;
+            }
+            patterns.remove(wildcard);
+
+            // remove any empty entries
+            if (patterns.isEmpty()) {
+                wildcardAttributes.remove(tagName);
+            }
+        }
+
+        return this;
+    }
+
+    /**
+     * Add wildcard global attributes
+     *

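+ * Each wildcard is a regular expression as understood by java.util.regex.Pattern and is matched
+ * case-insensitively against the whole attribute name, on any tag. Wildcards accumulate across
+ * calls; adding the same wildcard string again only replaces its previously compiled pattern.
+ *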

+ *

+ * Examples: + *

    + *
  • data-.+ - HTML 5
  • + *
  • aria-.+ - WAI-ARIA accessibility attributes
  • + *
+ *

+     *
+     * @param wildcards regular expressions (java.util.regex.Pattern syntax) that allowed attribute names must match
+     * @return this Safelist, for chaining.
+     */
+    public Safelist addWildcardGlobalAttributes(String... wildcards) {
+        // global wildcards are stored under the pseudo tag ":all"
+        return addWildcardAttributes(All, wildcards);
+    }
+
+    /**
+     * Remove wildcard global attributes. A wildcard is identified by its pattern text, so it must be
+     * passed exactly as it was given when it was added.
+     *
+     * @param wildcards the wildcard patterns to remove, as originally added
+     * @return this Safelist, for chaining.
+     */
+    public Safelist removeWildcardGlobalAttributes(String... wildcards) {
+        // varargs, mirroring addWildcardGlobalAttributes; a single String argument still compiles
+        return removeWildcardAttributes(All, wildcards);
+    }
+
     /**
      * Configure this Safelist to preserve relative links in an element's URL attribute, or convert them to absolute
      * links. By default, this is false: URLs will be made absolute (e.g. start with an allowed protocol, like
@@ -541,6 +634,22 @@ public boolean isSafeAttribute(String tagName, Element el, Attribute attr) {
                 return expect.getIgnoreCase(attrKey).equals(attr.getValue());
             }
         }
+        // might be a wildcard, e.g., "data-.+"?
+        if (wildcardAttributes.containsKey(tag)) {
+            for (Pattern pattern : wildcardAttributes.get(tag).values()) {
+                if (pattern.matcher(attr.getKey()).matches()) {
+                    return true;
+                }
+            }
+        }
+        // might be a global wildcard, e.g., "data-.+"?
+        if (wildcardAttributes.containsKey(AllTagName)) {
+            for (Pattern pattern : wildcardAttributes.get(AllTagName).values()) {
+                if (pattern.matcher(attr.getKey()).matches()) {
+                    return true;
+                }
+            }
+        }
         // no attributes defined for tag, try :all tag
         return !tagName.equals(All) && isSafeAttribute(All, el, attr);
     }
 
diff --git a/src/test/java/org/jsoup/safety/SafelistTest.java b/src/test/java/org/jsoup/safety/SafelistTest.java
index 796ddc7225..03c58fd255 100644
--- a/src/test/java/org/jsoup/safety/SafelistTest.java
+++ b/src/test/java/org/jsoup/safety/SafelistTest.java
@@ -12,6 +12,7 @@ public class SafelistTest {
     private static final String TEST_TAG = "testTag";
     private static final String TEST_ATTRIBUTE = "testAttribute";
+    private static final String TEST_DATA_ATTRIBUTE = "data-" + TEST_ATTRIBUTE;
     private static final String TEST_SCHEME = "valid-scheme";
     private static final String TEST_VALUE = TEST_SCHEME + "://testValue";
 
@@ -75,5 +76,32 @@ void noscriptIsBlocked() {
         assertNull(safelist);
     }
 
+    @Test
+    public void testAttributeWildcard() {
+        Safelist safelist1 = Safelist.none();
+        Safelist safelist2 = new Safelist(safelist1).addWildcardAttributes(TEST_TAG, "data-.+");
+        Attribute attr = new Attribute(TEST_DATA_ATTRIBUTE, TEST_VALUE);
+
+        // without a wildcard the data-* attribute is rejected
+        assertFalse(safelist1.isSafeAttribute(TEST_TAG, null, attr));
+        // the wildcard allows it on the tag it was registered for
+        assertTrue(safelist2.isSafeAttribute(TEST_TAG, null, attr));
+        // a tag-scoped wildcard must not apply to any other tag
+        assertFalse(safelist2.isSafeAttribute(TEST_TAG + "1", null, attr));
+    }
+
+    @Test
+    public void test8GlobalAttributeWildcard() {
+        Safelist safelist1 = Safelist.none();
+        Safelist safelist2 = new Safelist(safelist1).addWildcardGlobalAttributes("data-.+");
+        Attribute attr = new Attribute(TEST_DATA_ATTRIBUTE, TEST_VALUE);
+
+        // without a wildcard the data-* attribute is rejected
+        assertFalse(safelist1.isSafeAttribute(TEST_TAG, null, attr));
+        // a global wildcard allows it on the first tag
+        assertTrue(safelist2.isSafeAttribute(TEST_TAG, null, attr));
+        // and on any other tag as well
+        assertTrue(safelist2.isSafeAttribute(TEST_TAG + "1", null, attr));
+    }
 }