From 86538203c49c8f1c884f472cab440d9c7ffb6f60 Mon Sep 17 00:00:00 2001 From: Bear Giles Date: Fri, 15 Nov 2024 17:18:11 -0700 Subject: [PATCH 1/3] [JSOUP-2224] Added wildcards It is common for applications to use custom attributes, e.g., 'aria-*', and in fact with HTML 5(?) this is offically recognized with 'data-*' wildcards. It is often impossible to list all possible custom attributes, yet we want to be more selective than using ':all'. This patch adds support for both per-tag and global attribute wildcards. The wildcard must be recognized by `java.text.Pattern`, e.g., `data-.+`. The patch will quietly add the '^' and '$' delimiters to the provided pattern. --- src/main/java/org/jsoup/safety/Safelist.java | 120 +++++++++++++++++- .../java/org/jsoup/safety/SafelistTest.java | 21 +++ 2 files changed, 135 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/jsoup/safety/Safelist.java b/src/main/java/org/jsoup/safety/Safelist.java index cb5038d06d..85d15e0ee2 100644 --- a/src/main/java/org/jsoup/safety/Safelist.java +++ b/src/main/java/org/jsoup/safety/Safelist.java @@ -12,12 +12,8 @@ Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/ import org.jsoup.nodes.Attributes; import org.jsoup.nodes.Element; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Map; -import java.util.Objects; -import java.util.Set; +import java.util.*; +import java.util.regex.Pattern; import static org.jsoup.internal.Normalizer.lowerCase; @@ -67,11 +63,13 @@ XSS attack examples (that jsoup will safegaurd against the default Cleaner and S */ public class Safelist { private static final String All = ":all"; + private final Set tagNames; // tags allowed, lower case. e.g. [p, br, span] private final Map> attributes; // tag -> attribute[]. allowed attributes [href] for a tag. private final Map> enforcedAttributes; // always set these attribute values private final Map>> protocols; // allowed URL protocols for attributes private boolean preserveRelativeLinks; // option to preserve relative links + private Map> attributeWildcards = new LinkedHashMap<>(); /** This safelist allows only text nodes: any HTML Element or any Node other than a TextNode will be removed. @@ -237,6 +235,12 @@ public Safelist(Safelist copy) { protocols.put(protocolsEntry.getKey(), attributeProtocolsCopy); } preserveRelativeLinks = copy.preserveRelativeLinks; + + // create deep-ish copy. (The 'Pattern' is not deep-copied.) + attributeWildcards = new LinkedHashMap<>(copy.attributeWildcards.size()); + for (Map.Entry> entry : copy.attributeWildcards.entrySet()) { + attributeWildcards.put(entry.getKey(), new LinkedHashMap<>(entry.getValue())); + } } /** @@ -408,6 +412,93 @@ public Safelist removeEnforcedAttribute(String tag, String attribute) { return this; } + /** + * Add wildcard attributes + *

+ * The wildcard should be recognized by java.text.Pattern. Multiple calls + * will result in only the last one being used. + *

+ *

+ * Examples: + *

    + *
  • data-.+ - HTML 5
  • + *
  • aria-.+ - a widely used library
  • + *
+ *

+ * + * @param tag The tag the attributes are for. + * @param wildcards wildcard pattern recognized by java.text.Pattern + * @return this Safelist, for chaining. + */ + public Safelist addWildcardAttributes(String tag, String... wildcards) { + String ltag = tag.toLowerCase(); + for (String wildcard : wildcards) { + if (!attributeWildcards.containsKey(ltag)) { + attributeWildcards.put(ltag, new LinkedHashMap<>()); + } + attributeWildcards.get(ltag).put(wildcard, Pattern.compile("^" + wildcard + "$", + Pattern.CASE_INSENSITIVE + Pattern.UNICODE_CASE)); + } + + return this; + } + + /** + * Remove wildcard attributes + * + * @param tag The tag the attributes are for. + * @param wildcards wildcards pattern recognized by java.text.Pattern + * @return this Safelist, for chaining. + */ + public Safelist removeWildcardAttributes(String tag, String... wildcards) { + String ltag = tag.toLowerCase(); + for (String wildcard : wildcards) { + if (attributeWildcards.containsKey(ltag)) { + if (attributeWildcards.get(ltag).containsKey(wildcard)) { + attributeWildcards.remove(wildcard); + } + + // remove any empty entries + if (attributeWildcards.get(ltag).isEmpty()) { + attributeWildcards.remove(ltag); + } + } + } + + return this; + } + + /** + * Add wildcard global attributes + *

+ * The wildcard should be recognized by java.text.Pattern. Multiple calls + * will result in only the last pattern being used. + *

+ *

+ * Examples: + *

    + *
  • data-.+ - HTML 5
  • + *
  • aria-.+ - a widely used library
  • + *
+ *

+ * + * @param wildcards wildcard pattern recognized by java.text.Pattern + * @return this Safelist, for chaining. + */ + public Safelist addWildcardGlobalAttributes(String... wildcards) { + return addWildcardAttributes(All, wildcards); + } + + /** + * Remove wildcard global attributes + * + * @param wildcards wildcard pattern recognized by java.text.Pattern + * @return this Safelist, for chaining. + */ + public Safelist removeWildcardGlobalAttributes(String wildcards) { + return removeWildcardAttributes(All, wildcards); + } + /** * Configure this Safelist to preserve relative links in an element's URL attribute, or convert them to absolute * links. By default, this is false: URLs will be made absolute (e.g. start with an allowed protocol, like @@ -541,6 +632,23 @@ public boolean isSafeAttribute(String tagName, Element el, Attribute attr) { return expect.getIgnoreCase(attrKey).equals(attr.getValue()); } } + // might be a wildcard, e.g., "data-.+"? + String ltag = tagName.toLowerCase(); + if (attributeWildcards.containsKey(ltag)) { + for (Pattern pattern : attributeWildcards.get(ltag).values()) { + if (pattern.matcher(attr.getKey()).matches()) { + return true; + } + } + } + // might be a global wildcard, e.g., "data-.+"? + if (attributeWildcards.containsKey(All)) { + for (Pattern pattern : attributeWildcards.get(All).values()) { + if (pattern.matcher(attr.getKey()).matches()) { + return true; + } + } + } // no attributes defined for tag, try :all tag return !tagName.equals(All) && isSafeAttribute(All, el, attr); } diff --git a/src/test/java/org/jsoup/safety/SafelistTest.java b/src/test/java/org/jsoup/safety/SafelistTest.java index 796ddc7225..03c58fd255 100644 --- a/src/test/java/org/jsoup/safety/SafelistTest.java +++ b/src/test/java/org/jsoup/safety/SafelistTest.java @@ -12,6 +12,7 @@ public class SafelistTest { private static final String TEST_TAG = "testTag"; private static final String TEST_ATTRIBUTE = "testAttribute"; + private static final String TEST_DATA_ATTRIBUTE = "data-" + TEST_ATTRIBUTE; private static final String TEST_SCHEME = "valid-scheme"; private static final String TEST_VALUE = TEST_SCHEME + "://testValue"; @@ -75,5 +76,25 @@ void noscriptIsBlocked() { assertNull(safelist); } + @Test + public void testAttributeWildcard() { + Safelist safelist1 = Safelist.none(); + Safelist safelist2 = new Safelist(safelist1).addWildcardAttributes(TEST_TAG, "data-.+"); + Attribute attr = new Attribute(TEST_DATA_ATTRIBUTE, TEST_VALUE); + + assertFalse(safelist1.isSafeAttribute(TEST_TAG, null, attr)); + assertTrue(safelist2.isSafeAttribute(TEST_TAG, null, attr)); + assertFalse(safelist1.isSafeAttribute(TEST_TAG + "1", null, attr)); + } + @Test + public void test8GlobalAttributeWildcard() { + Safelist safelist1 = Safelist.none(); + Safelist safelist2 = new Safelist(safelist1).addWildcardGlobalAttributes("data-.+"); + Attribute attr = new Attribute(TEST_DATA_ATTRIBUTE, TEST_VALUE); + + assertFalse(safelist1.isSafeAttribute(TEST_TAG, null, attr)); + assertTrue(safelist2.isSafeAttribute(TEST_TAG, null, attr)); + assertTrue(safelist2.isSafeAttribute(TEST_TAG + "1", null, attr)); + } } From 748b267c305337cddae58415b52bd8d8bf483dec Mon Sep 17 00:00:00 2001 From: Bear Giles Date: Fri, 15 Nov 2024 18:12:59 -0700 Subject: [PATCH 2/3] Fixed package name --- src/main/java/org/jsoup/safety/Safelist.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/jsoup/safety/Safelist.java b/src/main/java/org/jsoup/safety/Safelist.java index 85d15e0ee2..109c07a5c0 100644 --- a/src/main/java/org/jsoup/safety/Safelist.java +++ b/src/main/java/org/jsoup/safety/Safelist.java @@ -415,7 +415,7 @@ public Safelist removeEnforcedAttribute(String tag, String attribute) { /** * Add wildcard attributes *

- * The wildcard should be recognized by java.text.Pattern. Multiple calls + * The wildcard should be recognized by java.util.regex.Pattern. Multiple calls * will result in only the last one being used. *

*

@@ -427,7 +427,7 @@ public Safelist removeEnforcedAttribute(String tag, String attribute) { *

* * @param tag The tag the attributes are for. - * @param wildcards wildcard pattern recognized by java.text.Pattern + * @param wildcards wildcard pattern recognized by java.util.regex.Pattern * @return this Safelist, for chaining. */ public Safelist addWildcardAttributes(String tag, String... wildcards) { @@ -447,7 +447,7 @@ public Safelist addWildcardAttributes(String tag, String... wildcards) { * Remove wildcard attributes * * @param tag The tag the attributes are for. - * @param wildcards wildcards pattern recognized by java.text.Pattern + * @param wildcards wildcards pattern recognized by java.util.regex.Pattern * @return this Safelist, for chaining. */ public Safelist removeWildcardAttributes(String tag, String... wildcards) { @@ -471,7 +471,7 @@ public Safelist removeWildcardAttributes(String tag, String... wildcards) { /** * Add wildcard global attributes *

- * The wildcard should be recognized by java.text.Pattern. Multiple calls + * The wildcard should be recognized by java.util.regex.Pattern. Multiple calls * will result in only the last pattern being used. *

*

@@ -482,7 +482,7 @@ public Safelist removeWildcardAttributes(String tag, String... wildcards) { * *

* - * @param wildcards wildcard pattern recognized by java.text.Pattern + * @param wildcards wildcard pattern recognized by java.util.regex.Pattern * @return this Safelist, for chaining. */ public Safelist addWildcardGlobalAttributes(String... wildcards) { @@ -492,7 +492,7 @@ public Safelist addWildcardGlobalAttributes(String... wildcards) { /** * Remove wildcard global attributes * - * @param wildcards wildcard pattern recognized by java.text.Pattern + * @param wildcards wildcard pattern recognized by java.util.regex.Pattern * @return this Safelist, for chaining. */ public Safelist removeWildcardGlobalAttributes(String wildcards) { From 0ffb476d89a26febd8edce0c3e4b099f4319ef42 Mon Sep 17 00:00:00 2001 From: Bear Giles Date: Fri, 15 Nov 2024 20:59:23 -0700 Subject: [PATCH 3/3] [JSOUP-2224] fxed removeTags() Fixed removeTags() - it hadn't removed the wildcard attributes. Also changed the Map so the key is a TagName, not a String, in order to avoid any uncertainty regarding capitalization. --- src/main/java/org/jsoup/safety/Safelist.java | 39 ++++++++++---------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/src/main/java/org/jsoup/safety/Safelist.java b/src/main/java/org/jsoup/safety/Safelist.java index 109c07a5c0..c1a48edf26 100644 --- a/src/main/java/org/jsoup/safety/Safelist.java +++ b/src/main/java/org/jsoup/safety/Safelist.java @@ -63,13 +63,14 @@ XSS attack examples (that jsoup will safegaurd against the default Cleaner and S */ public class Safelist { private static final String All = ":all"; + private static final TagName AllTagName = new TagName(All); private final Set tagNames; // tags allowed, lower case. e.g. [p, br, span] private final Map> attributes; // tag -> attribute[]. allowed attributes [href] for a tag. private final Map> enforcedAttributes; // always set these attribute values private final Map>> protocols; // allowed URL protocols for attributes private boolean preserveRelativeLinks; // option to preserve relative links - private Map> attributeWildcards = new LinkedHashMap<>(); + private Map> wildcardAttributes = new LinkedHashMap<>(); /** This safelist allows only text nodes: any HTML Element or any Node other than a TextNode will be removed. @@ -237,9 +238,9 @@ public Safelist(Safelist copy) { preserveRelativeLinks = copy.preserveRelativeLinks; // create deep-ish copy. (The 'Pattern' is not deep-copied.) - attributeWildcards = new LinkedHashMap<>(copy.attributeWildcards.size()); - for (Map.Entry> entry : copy.attributeWildcards.entrySet()) { - attributeWildcards.put(entry.getKey(), new LinkedHashMap<>(entry.getValue())); + wildcardAttributes = new LinkedHashMap<>(copy.wildcardAttributes.size()); + for (Map.Entry> entry : copy.wildcardAttributes.entrySet()) { + wildcardAttributes.put(entry.getKey(), new LinkedHashMap<>(entry.getValue())); } } @@ -278,6 +279,7 @@ public Safelist removeTags(String... tags) { attributes.remove(tagName); enforcedAttributes.remove(tagName); protocols.remove(tagName); + wildcardAttributes.remove(tagName); } } return this; @@ -431,12 +433,12 @@ public Safelist removeEnforcedAttribute(String tag, String attribute) { * @return this Safelist, for chaining. */ public Safelist addWildcardAttributes(String tag, String... wildcards) { - String ltag = tag.toLowerCase(); + TagName tagName = TagName.valueOf(tag); for (String wildcard : wildcards) { - if (!attributeWildcards.containsKey(ltag)) { - attributeWildcards.put(ltag, new LinkedHashMap<>()); + if (!wildcardAttributes.containsKey(tagName)) { + wildcardAttributes.put(tagName, new LinkedHashMap<>()); } - attributeWildcards.get(ltag).put(wildcard, Pattern.compile("^" + wildcard + "$", + wildcardAttributes.get(tagName).put(wildcard, Pattern.compile("^" + wildcard + "$", Pattern.CASE_INSENSITIVE + Pattern.UNICODE_CASE)); } @@ -451,16 +453,16 @@ public Safelist addWildcardAttributes(String tag, String... wildcards) { * @return this Safelist, for chaining. */ public Safelist removeWildcardAttributes(String tag, String... wildcards) { - String ltag = tag.toLowerCase(); + TagName tagName = TagName.valueOf(tag); for (String wildcard : wildcards) { - if (attributeWildcards.containsKey(ltag)) { - if (attributeWildcards.get(ltag).containsKey(wildcard)) { - attributeWildcards.remove(wildcard); + if (wildcardAttributes.containsKey(tagName)) { + if (wildcardAttributes.get(tagName).containsKey(wildcard)) { + wildcardAttributes.get(tagName).remove(wildcard); } // remove any empty entries - if (attributeWildcards.get(ltag).isEmpty()) { - attributeWildcards.remove(ltag); + if (wildcardAttributes.get(tagName).isEmpty()) { + wildcardAttributes.remove(tagName); } } } @@ -633,17 +635,16 @@ public boolean isSafeAttribute(String tagName, Element el, Attribute attr) { } } // might be a wildcard, e.g., "data-.+"? - String ltag = tagName.toLowerCase(); - if (attributeWildcards.containsKey(ltag)) { - for (Pattern pattern : attributeWildcards.get(ltag).values()) { + if (wildcardAttributes.containsKey(tag)) { + for (Pattern pattern : wildcardAttributes.get(tag).values()) { if (pattern.matcher(attr.getKey()).matches()) { return true; } } } // might be a global wildcard, e.g., "data-.+"? - if (attributeWildcards.containsKey(All)) { - for (Pattern pattern : attributeWildcards.get(All).values()) { + if (wildcardAttributes.containsKey(AllTagName)) { + for (Pattern pattern : wildcardAttributes.get(AllTagName).values()) { if (pattern.matcher(attr.getKey()).matches()) { return true; }