Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[JSOUP-2224] add wildcards #2225

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 115 additions & 6 deletions src/main/java/org/jsoup/safety/Safelist.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,8 @@ Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Element;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.*;
import java.util.regex.Pattern;

import static org.jsoup.internal.Normalizer.lowerCase;

Expand Down Expand Up @@ -67,11 +63,14 @@ XSS attack examples (that jsoup will safeguard against the default Cleaner and S
*/
public class Safelist {
private static final String All = ":all"; // pseudo tag name under which settings that apply to every tag are stored
private static final TagName AllTagName = new TagName(All); // key used to look up the global (":all") wildcard patterns

private final Set<TagName> tagNames; // tags allowed, lower case. e.g. [p, br, span]
private final Map<TagName, Set<AttributeKey>> attributes; // tag -> attribute[]. allowed attributes [href] for a tag.
private final Map<TagName, Map<AttributeKey, AttributeValue>> enforcedAttributes; // always set these attribute values
private final Map<TagName, Map<AttributeKey, Set<Protocol>>> protocols; // allowed URL protocols for attributes
private boolean preserveRelativeLinks; // option to preserve relative links
// tag -> (wildcard regex as supplied by the caller -> compiled pattern). An attribute whose name matches one of
// the patterns registered for its tag (or for ":all") is allowed. Not final: the copy constructor reassigns it.
private Map<TagName, Map<String, Pattern>> wildcardAttributes = new LinkedHashMap<>();

/**
This safelist allows only text nodes: any HTML Element or any Node other than a TextNode will be removed.
Expand Down Expand Up @@ -237,6 +236,12 @@ public Safelist(Safelist copy) {
protocols.put(protocolsEntry.getKey(), attributeProtocolsCopy);
}
preserveRelativeLinks = copy.preserveRelativeLinks;

// create deep-ish copy. (The 'Pattern' is not deep-copied.)
wildcardAttributes = new LinkedHashMap<>(copy.wildcardAttributes.size());
for (Map.Entry<TagName, Map<String, Pattern>> entry : copy.wildcardAttributes.entrySet()) {
wildcardAttributes.put(entry.getKey(), new LinkedHashMap<>(entry.getValue()));
}
}

/**
Expand Down Expand Up @@ -274,6 +279,7 @@ public Safelist removeTags(String... tags) {
attributes.remove(tagName);
enforcedAttributes.remove(tagName);
protocols.remove(tagName);
wildcardAttributes.remove(tagName);
}
}
return this;
Expand Down Expand Up @@ -408,6 +414,93 @@ public Safelist removeEnforcedAttribute(String tag, String attribute) {
return this;
}

/**
 * Allow attributes on a tag by name pattern (a "wildcard") instead of by exact name.
 * <p>
 * Each wildcard is a regular expression as understood by {@link java.util.regex.Pattern}. It must match the
 * <b>whole</b> attribute name, and matching is case-insensitive. Wildcards accumulate across calls; registering a
 * wildcard string that is already present for the tag just replaces that one entry.
 * </p>
 * <p>Examples:</p>
 * <ul>
 * <li><code>data-.+</code> - HTML5 custom data attributes</li>
 * <li><code>aria-.+</code> - WAI-ARIA accessibility attributes</li>
 * </ul>
 * <p>
 * Keep patterns narrow: the wildcard check in <code>isSafeAttribute</code> accepts an attribute on a name match
 * alone, without looking at its value, so a broad pattern such as <code>.+</code> would also admit names like
 * <code>onclick</code>.
 * </p>
 *
 * @param tag The tag the attributes are for.
 * @param wildcards regular expressions (java.util.regex.Pattern syntax) matched against attribute names
 * @return this Safelist, for chaining.
 * @throws IllegalArgumentException if a wildcard is null
 * @throws java.util.regex.PatternSyntaxException if a wildcard is not a valid regular expression
 */
public Safelist addWildcardAttributes(String tag, String... wildcards) {
    TagName tagName = TagName.valueOf(tag);
    for (String wildcard : wildcards) {
        if (wildcard == null) {
            // string concatenation would otherwise silently register the pattern "^null$"
            throw new IllegalArgumentException("Wildcard pattern must not be null");
        }

        // Compile first: if the expression is invalid nothing has been registered yet, so no empty per-tag map
        // is left behind. The non-capturing group keeps the anchors bound to the whole expression even when the
        // wildcard contains a top-level alternation (e.g. "data-.+|aria-.+").
        Pattern compiled = Pattern.compile("^(?:" + wildcard + ")$",
                Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

        Map<String, Pattern> patterns = wildcardAttributes.get(tagName);
        if (patterns == null) {
            patterns = new LinkedHashMap<>();
            wildcardAttributes.put(tagName, patterns);
        }
        patterns.put(wildcard, compiled);
    }

    return this;
}

/**
 * Withdraw wildcard attribute patterns previously registered for a tag.
 * <p>
 * A wildcard is identified by the exact string that was used to register it. Strings that were never registered
 * are ignored. Once the last wildcard of a tag is gone, the tag's entry is dropped as well.
 * </p>
 *
 * @param tag The tag the attributes are for.
 * @param wildcards the wildcard patterns (java.util.regex.Pattern syntax) to remove
 * @return this Safelist, for chaining.
 */
public Safelist removeWildcardAttributes(String tag, String... wildcards) {
    final TagName key = TagName.valueOf(tag);
    for (String candidate : wildcards) {
        final Map<String, Pattern> registered = wildcardAttributes.get(key);
        if (registered == null) {
            // nothing (left) for this tag, so the remaining wildcards cannot match anything either
            break;
        }

        registered.remove(candidate);

        // do not keep an empty per-tag entry around
        if (registered.isEmpty()) {
            wildcardAttributes.remove(key);
        }
    }

    return this;
}

/**
* Add wildcard attributes that apply to every tag.
* <p>
* Delegates to addWildcardAttributes using the ":all" pseudo tag, so the same rules apply: each wildcard
* is a regular expression recognized by java.util.regex.Pattern, and registering a wildcard string that is
* already present replaces that entry while other wildcards are kept.
* </p>
* <p>Examples:</p>
* <ul>
* <li><code>data-.+</code> - HTML5 custom data attributes</li>
* <li><code>aria-.+</code> - WAI-ARIA accessibility attributes</li>
* </ul>
*
* @param wildcards wildcard patterns recognized by java.util.regex.Pattern
* @return this Safelist, for chaining.
*/
public Safelist addWildcardGlobalAttributes(String... wildcards) {
return addWildcardAttributes(All, wildcards);
}

/**
 * Remove wildcard attributes that were registered for every tag.
 * <p>
 * Delegates to {@code removeWildcardAttributes} using the ":all" pseudo tag. Accepts any number of wildcards,
 * mirroring {@code addWildcardGlobalAttributes}; existing single-argument calls keep compiling unchanged.
 * </p>
 *
 * @param wildcards the wildcard patterns (java.util.regex.Pattern syntax) to remove, given exactly as registered
 * @return this Safelist, for chaining.
 */
public Safelist removeWildcardGlobalAttributes(String... wildcards) {
    return removeWildcardAttributes(All, wildcards);
}

/**
* Configure this Safelist to preserve relative links in an element's URL attribute, or convert them to absolute
* links. By default, this is <b>false</b>: URLs will be made absolute (e.g. start with an allowed protocol, like
Expand Down Expand Up @@ -541,6 +634,22 @@ public boolean isSafeAttribute(String tagName, Element el, Attribute attr) {
return expect.getIgnoreCase(attrKey).equals(attr.getValue());
}
}
// might be a wildcard, e.g., "data-.+"?
if (wildcardAttributes.containsKey(tag)) {
for (Pattern pattern : wildcardAttributes.get(tag).values()) {
if (pattern.matcher(attr.getKey()).matches()) {
return true;
}
}
}
// might be a global wildcard, e.g., "data-.+"?
if (wildcardAttributes.containsKey(AllTagName)) {
for (Pattern pattern : wildcardAttributes.get(AllTagName).values()) {
if (pattern.matcher(attr.getKey()).matches()) {
return true;
}
}
}
// no attributes defined for tag, try :all tag
return !tagName.equals(All) && isSafeAttribute(All, el, attr);
}
Expand Down
21 changes: 21 additions & 0 deletions src/test/java/org/jsoup/safety/SafelistTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
public class SafelistTest {
private static final String TEST_TAG = "testTag";
private static final String TEST_ATTRIBUTE = "testAttribute";
private static final String TEST_DATA_ATTRIBUTE = "data-" + TEST_ATTRIBUTE;
private static final String TEST_SCHEME = "valid-scheme";
private static final String TEST_VALUE = TEST_SCHEME + "://testValue";

Expand Down Expand Up @@ -75,5 +76,25 @@ void noscriptIsBlocked() {
assertNull(safelist);
}

@Test
public void testAttributeWildcard() {
    // safelist1 allows nothing; safelist2 additionally allows data-* attributes, but only on TEST_TAG
    Safelist safelist1 = Safelist.none();
    Safelist safelist2 = new Safelist(safelist1).addWildcardAttributes(TEST_TAG, "data-.+");
    Attribute attr = new Attribute(TEST_DATA_ATTRIBUTE, TEST_VALUE);

    assertFalse(safelist1.isSafeAttribute(TEST_TAG, null, attr));
    assertTrue(safelist2.isSafeAttribute(TEST_TAG, null, attr));
    // Query safelist2 here: the empty safelist1 rejects everything, so asserting on it would not show that a
    // per-tag wildcard stays scoped to its own tag.
    assertFalse(safelist2.isSafeAttribute(TEST_TAG + "1", null, attr));
}

@Test
public void test8GlobalAttributeWildcard() {
    // a global wildcard must allow the attribute on any tag, while the untouched baseline still rejects it
    final Safelist baseline = Safelist.none();
    final Safelist withGlobalWildcard = new Safelist(baseline).addWildcardGlobalAttributes("data-.+");
    final Attribute dataAttribute = new Attribute(TEST_DATA_ATTRIBUTE, TEST_VALUE);
    final String otherTag = TEST_TAG + "1";

    assertFalse(baseline.isSafeAttribute(TEST_TAG, null, dataAttribute));
    assertTrue(withGlobalWildcard.isSafeAttribute(TEST_TAG, null, dataAttribute));
    assertTrue(withGlobalWildcard.isSafeAttribute(otherTag, null, dataAttribute));
}
}
Loading