From 86538203c49c8f1c884f472cab440d9c7ffb6f60 Mon Sep 17 00:00:00 2001
From: Bear Giles
Date: Fri, 15 Nov 2024 17:18:11 -0700
Subject: [PATCH 1/3] [JSOUP-2224] Added wildcards
It is common for applications to use custom attributes, e.g., 'aria-*', and
in fact with HTML 5(?) this is offically recognized with 'data-*' wildcards.
It is often impossible to list all possible custom attributes, yet we want
to be more selective than using ':all'.
This patch adds support for both per-tag and global attribute wildcards.
The wildcard must be recognized by `java.text.Pattern`, e.g., `data-.+`.
The patch will quietly add the '^' and '$' delimiters to the provided
pattern.
---
src/main/java/org/jsoup/safety/Safelist.java | 120 +++++++++++++++++-
.../java/org/jsoup/safety/SafelistTest.java | 21 +++
2 files changed, 135 insertions(+), 6 deletions(-)
diff --git a/src/main/java/org/jsoup/safety/Safelist.java b/src/main/java/org/jsoup/safety/Safelist.java
index cb5038d06d..85d15e0ee2 100644
--- a/src/main/java/org/jsoup/safety/Safelist.java
+++ b/src/main/java/org/jsoup/safety/Safelist.java
@@ -12,12 +12,8 @@ Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Element;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Objects;
-import java.util.Set;
+import java.util.*;
+import java.util.regex.Pattern;
import static org.jsoup.internal.Normalizer.lowerCase;
@@ -67,11 +63,13 @@ XSS attack examples (that jsoup will safegaurd against the default Cleaner and S
*/
public class Safelist {
private static final String All = ":all";
+
private final Set tagNames; // tags allowed, lower case. e.g. [p, br, span]
private final Map> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.
private final Map> enforcedAttributes; // always set these attribute values
private final Map>> protocols; // allowed URL protocols for attributes
private boolean preserveRelativeLinks; // option to preserve relative links
+ private Map> attributeWildcards = new LinkedHashMap<>();
/**
This safelist allows only text nodes: any HTML Element or any Node other than a TextNode will be removed.
@@ -237,6 +235,12 @@ public Safelist(Safelist copy) {
protocols.put(protocolsEntry.getKey(), attributeProtocolsCopy);
}
preserveRelativeLinks = copy.preserveRelativeLinks;
+
+ // create deep-ish copy. (The 'Pattern' is not deep-copied.)
+ attributeWildcards = new LinkedHashMap<>(copy.attributeWildcards.size());
+ for (Map.Entry> entry : copy.attributeWildcards.entrySet()) {
+ attributeWildcards.put(entry.getKey(), new LinkedHashMap<>(entry.getValue()));
+ }
}
/**
@@ -408,6 +412,93 @@ public Safelist removeEnforcedAttribute(String tag, String attribute) {
return this;
}
+ /**
+ * Add wildcard attributes
+ *
+ * The wildcard should be recognized by java.text.Pattern. Multiple calls
+ * will result in only the last one being used.
+ *
+ *
+ * Examples:
+ *
+ * data-.+
- HTML 5
+ * aria-.+
- a widely used library
+ *
+ *
+ *
+ * @param tag The tag the attributes are for.
+ * @param wildcards wildcard pattern recognized by java.text.Pattern
+ * @return this Safelist, for chaining.
+ */
+ public Safelist addWildcardAttributes(String tag, String... wildcards) {
+ String ltag = tag.toLowerCase();
+ for (String wildcard : wildcards) {
+ if (!attributeWildcards.containsKey(ltag)) {
+ attributeWildcards.put(ltag, new LinkedHashMap<>());
+ }
+ attributeWildcards.get(ltag).put(wildcard, Pattern.compile("^" + wildcard + "$",
+ Pattern.CASE_INSENSITIVE + Pattern.UNICODE_CASE));
+ }
+
+ return this;
+ }
+
+ /**
+ * Remove wildcard attributes
+ *
+ * @param tag The tag the attributes are for.
+ * @param wildcards wildcards pattern recognized by java.text.Pattern
+ * @return this Safelist, for chaining.
+ */
+ public Safelist removeWildcardAttributes(String tag, String... wildcards) {
+ String ltag = tag.toLowerCase();
+ for (String wildcard : wildcards) {
+ if (attributeWildcards.containsKey(ltag)) {
+ if (attributeWildcards.get(ltag).containsKey(wildcard)) {
+ attributeWildcards.remove(wildcard);
+ }
+
+ // remove any empty entries
+ if (attributeWildcards.get(ltag).isEmpty()) {
+ attributeWildcards.remove(ltag);
+ }
+ }
+ }
+
+ return this;
+ }
+
+ /**
+ * Add wildcard global attributes
+ *
+ * The wildcard should be recognized by java.text.Pattern. Multiple calls
+ * will result in only the last pattern being used.
+ *
+ *
+ * Examples:
+ *
+ * data-.+
- HTML 5
+ * aria-.+
- a widely used library
+ *
+ *
+ *
+ * @param wildcards wildcard pattern recognized by java.text.Pattern
+ * @return this Safelist, for chaining.
+ */
+ public Safelist addWildcardGlobalAttributes(String... wildcards) {
+ return addWildcardAttributes(All, wildcards);
+ }
+
+ /**
+ * Remove wildcard global attributes
+ *
+ * @param wildcards wildcard pattern recognized by java.text.Pattern
+ * @return this Safelist, for chaining.
+ */
+ public Safelist removeWildcardGlobalAttributes(String wildcards) {
+ return removeWildcardAttributes(All, wildcards);
+ }
+
/**
* Configure this Safelist to preserve relative links in an element's URL attribute, or convert them to absolute
* links. By default, this is false: URLs will be made absolute (e.g. start with an allowed protocol, like
@@ -541,6 +632,23 @@ public boolean isSafeAttribute(String tagName, Element el, Attribute attr) {
return expect.getIgnoreCase(attrKey).equals(attr.getValue());
}
}
+ // might be a wildcard, e.g., "data-.+"?
+ String ltag = tagName.toLowerCase();
+ if (attributeWildcards.containsKey(ltag)) {
+ for (Pattern pattern : attributeWildcards.get(ltag).values()) {
+ if (pattern.matcher(attr.getKey()).matches()) {
+ return true;
+ }
+ }
+ }
+ // might be a global wildcard, e.g., "data-.+"?
+ if (attributeWildcards.containsKey(All)) {
+ for (Pattern pattern : attributeWildcards.get(All).values()) {
+ if (pattern.matcher(attr.getKey()).matches()) {
+ return true;
+ }
+ }
+ }
// no attributes defined for tag, try :all tag
return !tagName.equals(All) && isSafeAttribute(All, el, attr);
}
diff --git a/src/test/java/org/jsoup/safety/SafelistTest.java b/src/test/java/org/jsoup/safety/SafelistTest.java
index 796ddc7225..03c58fd255 100644
--- a/src/test/java/org/jsoup/safety/SafelistTest.java
+++ b/src/test/java/org/jsoup/safety/SafelistTest.java
@@ -12,6 +12,7 @@
public class SafelistTest {
private static final String TEST_TAG = "testTag";
private static final String TEST_ATTRIBUTE = "testAttribute";
+ private static final String TEST_DATA_ATTRIBUTE = "data-" + TEST_ATTRIBUTE;
private static final String TEST_SCHEME = "valid-scheme";
private static final String TEST_VALUE = TEST_SCHEME + "://testValue";
@@ -75,5 +76,25 @@ void noscriptIsBlocked() {
assertNull(safelist);
}
+ @Test
+ public void testAttributeWildcard() {
+ Safelist safelist1 = Safelist.none();
+ Safelist safelist2 = new Safelist(safelist1).addWildcardAttributes(TEST_TAG, "data-.+");
+ Attribute attr = new Attribute(TEST_DATA_ATTRIBUTE, TEST_VALUE);
+
+ assertFalse(safelist1.isSafeAttribute(TEST_TAG, null, attr));
+ assertTrue(safelist2.isSafeAttribute(TEST_TAG, null, attr));
+ assertFalse(safelist1.isSafeAttribute(TEST_TAG + "1", null, attr));
+ }
+ @Test
+ public void test8GlobalAttributeWildcard() {
+ Safelist safelist1 = Safelist.none();
+ Safelist safelist2 = new Safelist(safelist1).addWildcardGlobalAttributes("data-.+");
+ Attribute attr = new Attribute(TEST_DATA_ATTRIBUTE, TEST_VALUE);
+
+ assertFalse(safelist1.isSafeAttribute(TEST_TAG, null, attr));
+ assertTrue(safelist2.isSafeAttribute(TEST_TAG, null, attr));
+ assertTrue(safelist2.isSafeAttribute(TEST_TAG + "1", null, attr));
+ }
}
From 748b267c305337cddae58415b52bd8d8bf483dec Mon Sep 17 00:00:00 2001
From: Bear Giles
Date: Fri, 15 Nov 2024 18:12:59 -0700
Subject: [PATCH 2/3] Fixed package name
---
src/main/java/org/jsoup/safety/Safelist.java | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/src/main/java/org/jsoup/safety/Safelist.java b/src/main/java/org/jsoup/safety/Safelist.java
index 85d15e0ee2..109c07a5c0 100644
--- a/src/main/java/org/jsoup/safety/Safelist.java
+++ b/src/main/java/org/jsoup/safety/Safelist.java
@@ -415,7 +415,7 @@ public Safelist removeEnforcedAttribute(String tag, String attribute) {
/**
* Add wildcard attributes
*
- * The wildcard should be recognized by java.text.Pattern. Multiple calls
+ * The wildcard should be recognized by java.util.regex.Pattern. Multiple calls
* will result in only the last one being used.
*
*
@@ -427,7 +427,7 @@ public Safelist removeEnforcedAttribute(String tag, String attribute) {
*
*
* @param tag The tag the attributes are for.
- * @param wildcards wildcard pattern recognized by java.text.Pattern
+ * @param wildcards wildcard pattern recognized by java.util.regex.Pattern
* @return this Safelist, for chaining.
*/
public Safelist addWildcardAttributes(String tag, String... wildcards) {
@@ -447,7 +447,7 @@ public Safelist addWildcardAttributes(String tag, String... wildcards) {
* Remove wildcard attributes
*
* @param tag The tag the attributes are for.
- * @param wildcards wildcards pattern recognized by java.text.Pattern
+ * @param wildcards wildcards pattern recognized by java.util.regex.Pattern
* @return this Safelist, for chaining.
*/
public Safelist removeWildcardAttributes(String tag, String... wildcards) {
@@ -471,7 +471,7 @@ public Safelist removeWildcardAttributes(String tag, String... wildcards) {
/**
* Add wildcard global attributes
*
- * The wildcard should be recognized by java.text.Pattern. Multiple calls
+ * The wildcard should be recognized by java.util.regex.Pattern. Multiple calls
* will result in only the last pattern being used.
*
*
@@ -482,7 +482,7 @@ public Safelist removeWildcardAttributes(String tag, String... wildcards) {
*
*
*
- * @param wildcards wildcard pattern recognized by java.text.Pattern
+ * @param wildcards wildcard pattern recognized by java.util.regex.Pattern
* @return this Safelist, for chaining.
*/
public Safelist addWildcardGlobalAttributes(String... wildcards) {
@@ -492,7 +492,7 @@ public Safelist addWildcardGlobalAttributes(String... wildcards) {
/**
* Remove wildcard global attributes
*
- * @param wildcards wildcard pattern recognized by java.text.Pattern
+ * @param wildcards wildcard pattern recognized by java.util.regex.Pattern
* @return this Safelist, for chaining.
*/
public Safelist removeWildcardGlobalAttributes(String wildcards) {
From 0ffb476d89a26febd8edce0c3e4b099f4319ef42 Mon Sep 17 00:00:00 2001
From: Bear Giles
Date: Fri, 15 Nov 2024 20:59:23 -0700
Subject: [PATCH 3/3] [JSOUP-2224] fxed removeTags()
Fixed removeTags() - it hadn't removed the wildcard attributes.
Also changed the Map so the key is a TagName, not a String, in
order to avoid any uncertainty regarding capitalization.
---
src/main/java/org/jsoup/safety/Safelist.java | 39 ++++++++++----------
1 file changed, 20 insertions(+), 19 deletions(-)
diff --git a/src/main/java/org/jsoup/safety/Safelist.java b/src/main/java/org/jsoup/safety/Safelist.java
index 109c07a5c0..c1a48edf26 100644
--- a/src/main/java/org/jsoup/safety/Safelist.java
+++ b/src/main/java/org/jsoup/safety/Safelist.java
@@ -63,13 +63,14 @@ XSS attack examples (that jsoup will safegaurd against the default Cleaner and S
*/
public class Safelist {
private static final String All = ":all";
+ private static final TagName AllTagName = new TagName(All);
private final Set tagNames; // tags allowed, lower case. e.g. [p, br, span]
private final Map> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.
private final Map> enforcedAttributes; // always set these attribute values
private final Map>> protocols; // allowed URL protocols for attributes
private boolean preserveRelativeLinks; // option to preserve relative links
- private Map> attributeWildcards = new LinkedHashMap<>();
+ private Map> wildcardAttributes = new LinkedHashMap<>();
/**
This safelist allows only text nodes: any HTML Element or any Node other than a TextNode will be removed.
@@ -237,9 +238,9 @@ public Safelist(Safelist copy) {
preserveRelativeLinks = copy.preserveRelativeLinks;
// create deep-ish copy. (The 'Pattern' is not deep-copied.)
- attributeWildcards = new LinkedHashMap<>(copy.attributeWildcards.size());
- for (Map.Entry> entry : copy.attributeWildcards.entrySet()) {
- attributeWildcards.put(entry.getKey(), new LinkedHashMap<>(entry.getValue()));
+ wildcardAttributes = new LinkedHashMap<>(copy.wildcardAttributes.size());
+ for (Map.Entry> entry : copy.wildcardAttributes.entrySet()) {
+ wildcardAttributes.put(entry.getKey(), new LinkedHashMap<>(entry.getValue()));
}
}
@@ -278,6 +279,7 @@ public Safelist removeTags(String... tags) {
attributes.remove(tagName);
enforcedAttributes.remove(tagName);
protocols.remove(tagName);
+ wildcardAttributes.remove(tagName);
}
}
return this;
@@ -431,12 +433,12 @@ public Safelist removeEnforcedAttribute(String tag, String attribute) {
* @return this Safelist, for chaining.
*/
public Safelist addWildcardAttributes(String tag, String... wildcards) {
- String ltag = tag.toLowerCase();
+ TagName tagName = TagName.valueOf(tag);
for (String wildcard : wildcards) {
- if (!attributeWildcards.containsKey(ltag)) {
- attributeWildcards.put(ltag, new LinkedHashMap<>());
+ if (!wildcardAttributes.containsKey(tagName)) {
+ wildcardAttributes.put(tagName, new LinkedHashMap<>());
}
- attributeWildcards.get(ltag).put(wildcard, Pattern.compile("^" + wildcard + "$",
+ wildcardAttributes.get(tagName).put(wildcard, Pattern.compile("^" + wildcard + "$",
Pattern.CASE_INSENSITIVE + Pattern.UNICODE_CASE));
}
@@ -451,16 +453,16 @@ public Safelist addWildcardAttributes(String tag, String... wildcards) {
* @return this Safelist, for chaining.
*/
public Safelist removeWildcardAttributes(String tag, String... wildcards) {
- String ltag = tag.toLowerCase();
+ TagName tagName = TagName.valueOf(tag);
for (String wildcard : wildcards) {
- if (attributeWildcards.containsKey(ltag)) {
- if (attributeWildcards.get(ltag).containsKey(wildcard)) {
- attributeWildcards.remove(wildcard);
+ if (wildcardAttributes.containsKey(tagName)) {
+ if (wildcardAttributes.get(tagName).containsKey(wildcard)) {
+ wildcardAttributes.get(tagName).remove(wildcard);
}
// remove any empty entries
- if (attributeWildcards.get(ltag).isEmpty()) {
- attributeWildcards.remove(ltag);
+ if (wildcardAttributes.get(tagName).isEmpty()) {
+ wildcardAttributes.remove(tagName);
}
}
}
@@ -633,17 +635,16 @@ public boolean isSafeAttribute(String tagName, Element el, Attribute attr) {
}
}
// might be a wildcard, e.g., "data-.+"?
- String ltag = tagName.toLowerCase();
- if (attributeWildcards.containsKey(ltag)) {
- for (Pattern pattern : attributeWildcards.get(ltag).values()) {
+ if (wildcardAttributes.containsKey(tag)) {
+ for (Pattern pattern : wildcardAttributes.get(tag).values()) {
if (pattern.matcher(attr.getKey()).matches()) {
return true;
}
}
}
// might be a global wildcard, e.g., "data-.+"?
- if (attributeWildcards.containsKey(All)) {
- for (Pattern pattern : attributeWildcards.get(All).values()) {
+ if (wildcardAttributes.containsKey(AllTagName)) {
+ for (Pattern pattern : wildcardAttributes.get(AllTagName).values()) {
if (pattern.matcher(attr.getKey()).matches()) {
return true;
}