Skip to content

Commit

Permalink
Normalize and dedupe attribute source ranges
Browse files Browse the repository at this point in the history
Fixes #2067
  • Loading branch information
jhy committed Nov 28, 2023
1 parent de0e749 commit fc4b175
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 16 deletions.
6 changes: 6 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

## 1.17.2 (Pending)

### Bug Fixes

* When tracking the source position of attributes, if source attribute name was mix-cased but the parser was
lower-case normalizing attribute names, the source position for that attribute was not tracked
correctly. [2067](https://github.com/jhy/jsoup/issues/2067)

---
Older changes for versions 0.1.1 (2010-Jan-31) through 1.17.1 (2023-Nov-27) may be found in
[change-archive.txt](./change-archive.txt).
18 changes: 18 additions & 0 deletions src/main/java/org/jsoup/nodes/Attributes.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import org.jsoup.SerializationException;
import org.jsoup.helper.Validate;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.SharedConstants;
import org.jsoup.internal.StringUtil;
import org.jsoup.parser.ParseSettings;
Expand Down Expand Up @@ -524,6 +525,23 @@ public void normalize() {
if (!isInternalKey(keys[i]))
keys[i] = lowerCase(keys[i]);
}

// if we are tracking attribute source ranges, normalize those keys also
//noinspection unchecked
Map<String, Range.AttributeRange> ranges = (Map<String, Range.AttributeRange>) userData(AttrRangeKey);
if (ranges != null) {
Object[] names = ranges.keySet().toArray(); // copy to array to avoid CMEs during put
for (Object name : names) {
String normal = lowerCase((String) name);
if (normal.equals(name)) continue;
if (ranges.containsKey(normal)) {
ranges.remove(name); // dedupe now that we have normalized
} else {
Range.AttributeRange range = ranges.remove(name);
ranges.put(normal, range);
}
}
}
}

/**
Expand Down
118 changes: 102 additions & 16 deletions src/test/java/org/jsoup/parser/PositionTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.LeafNode;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.Range;
import org.jsoup.nodes.TextNode;
Expand All @@ -26,23 +27,24 @@
Functional tests for the Position tracking behavior (across nodes, treebuilder, etc.)
*/
class PositionTest {
static Parser TrackingParser = Parser.htmlParser().setTrackPosition(true);
static Parser TrackingHtmlParser = Parser.htmlParser().setTrackPosition(true);
static Parser TrackingXmlParser = Parser.xmlParser().setTrackPosition(true);

@Test void parserTrackDefaults() {
Parser htmlParser = Parser.htmlParser();
assertFalse(htmlParser.isTrackPosition());
htmlParser.setTrackPosition(true);
assertTrue(htmlParser.isTrackPosition());

Parser xmlParser = Parser.htmlParser();
Parser xmlParser = Parser.xmlParser();
assertFalse(xmlParser.isTrackPosition());
xmlParser.setTrackPosition(true);
assertTrue(xmlParser.isTrackPosition());
}

@Test void tracksPosition() {
String content = "<p id=1\n class=foo>\n<span>Hello\n &reg;\n there &copy.</span> now.\n <!-- comment --> ";
Document doc = Jsoup.parse(content, TrackingParser);
Document doc = Jsoup.parse(content, TrackingHtmlParser);

Element html = doc.expectFirst("html");
Element body = doc.expectFirst("body");
Expand Down Expand Up @@ -115,7 +117,7 @@ class PositionTest {
@Test void tracksExpectedPoppedElements() {
// When TreeBuilder hits a direct .pop(), vs popToClose(..)
String html = "<html><head><meta></head><body><img><p>One</p><p>Two</p></body></html>";
Document doc = Jsoup.parse(html, TrackingParser);
Document doc = Jsoup.parse(html, TrackingHtmlParser);

StringBuilder track = new StringBuilder();
doc.expectFirst("html").stream().forEach(el -> {
Expand Down Expand Up @@ -154,7 +156,7 @@ static void accumulatePositions(Node node, StringBuilder sb) {
@Test void tracksImplicitPoppedElements() {
// When TreeBuilder hits a direct .pop(), vs popToClose(..)
String html = "<meta><img><p>One<p>Two<p>Three";
Document doc = Jsoup.parse(html, TrackingParser);
Document doc = Jsoup.parse(html, TrackingHtmlParser);

StringBuilder track = new StringBuilder();
doc.expectFirst("html").stream().forEach(el -> {
Expand Down Expand Up @@ -184,7 +186,7 @@ private void printRange(Node node) {

@Test void tracksMarkup() {
String html = "<!doctype\nhtml>\n<title>jsoup &copy;\n2022</title><body>\n<![CDATA[\n<jsoup>\n]]>";
Document doc = Jsoup.parse(html, TrackingParser);
Document doc = Jsoup.parse(html, TrackingHtmlParser);

DocumentType doctype = doc.documentType();
assertNotNull(doctype);
Expand All @@ -206,7 +208,7 @@ private void printRange(Node node) {

@Test void tracksDataNodes() {
String html = "<head>\n<script>foo;\nbar()\n5 <= 4;</script>";
Document doc = Jsoup.parse(html, TrackingParser);
Document doc = Jsoup.parse(html, TrackingHtmlParser);

Element script = doc.expectFirst("script");
assertNotNull(script);
Expand All @@ -218,7 +220,7 @@ private void printRange(Node node) {

@Test void tracksXml() {
String xml = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!doctype html>\n<rss url=foo>\nXML\n</rss>\n<!-- comment -->";
Document doc = Jsoup.parse(xml, Parser.xmlParser().setTrackPosition(true));
Document doc = Jsoup.parse(xml, TrackingXmlParser);

XmlDeclaration decl = (XmlDeclaration) doc.childNode(0);
assertEquals("1,1:0-1,39:38", decl.sourceRange().toString());
Expand All @@ -241,7 +243,7 @@ private void printRange(Node node) {

@Test void tracksFromFetch() throws IOException {
String url = FileServlet.urlTo("/htmltests/large.html"); // 280 K
Document doc = Jsoup.connect(url).parser(TrackingParser).get();
Document doc = Jsoup.connect(url).parser(TrackingHtmlParser).get();

Element firstP = doc.expectFirst("p");
assertNotNull(firstP);
Expand All @@ -259,7 +261,7 @@ private void printRange(Node node) {

@Test void tracksFromXmlFetch() throws IOException {
String url = FileServlet.urlTo("/htmltests/test-rss.xml");
Document doc = Jsoup.connect(url).parser(Parser.xmlParser().setTrackPosition(true)).get();
Document doc = Jsoup.connect(url).parser(TrackingXmlParser).get();

Element item = doc.expectFirst("item + item");
assertNotNull(item);
Expand All @@ -269,7 +271,7 @@ private void printRange(Node node) {

@Test void tracksTableMovedText() {
String html = "<table>foo<tr>bar<td>baz</td>qux</tr>coo</table>";
Document doc = Jsoup.parse(html, TrackingParser);
Document doc = Jsoup.parse(html, TrackingHtmlParser);

StringBuilder track = new StringBuilder();
List<TextNode> textNodes = doc.nodeStream(TextNode.class)
Expand All @@ -289,7 +291,7 @@ private void printRange(Node node) {
@Test void tracksClosingHtmlTagsInXml() {
// verifies https://github.com/jhy/jsoup/issues/1935
String xml = "<p>One</p><title>Two</title><data>Three</data>";
Document doc = Jsoup.parse(xml, Parser.xmlParser().setTrackPosition(true));
Document doc = Jsoup.parse(xml, TrackingXmlParser);
Elements els = doc.children();
for (Element el : els) {
assertTrue(el.sourceRange().isTracked());
Expand All @@ -300,7 +302,7 @@ private void printRange(Node node) {
@Test void tracksClosingHeadingTags() {
// https://github.com/jhy/jsoup/issues/1987
String html = "<h1>One</h1><h2>Two</h2><h10>Ten</h10>";
Document doc = Jsoup.parse(html, TrackingParser);
Document doc = Jsoup.parse(html, TrackingHtmlParser);

Elements els = doc.body().children();
for (Element el : els) {
Expand All @@ -315,7 +317,7 @@ private void printRange(Node node) {

@Test void tracksAttributes() {
String html = "<div one=\"Hello there\" id=1 class=foo attr1 = \"bar &amp; qux\" attr2='val &gt x' attr3=\"\" attr4 attr5>Text";
Document doc = Jsoup.parse(html, TrackingParser);
Document doc = Jsoup.parse(html, TrackingHtmlParser);

Element div = doc.expectFirst("div");

Expand All @@ -336,13 +338,12 @@ private void printRange(Node node) {
accumulatePositions(attr, track);
}

System.out.println(track);
assertEquals("one:5-8=10-21; id:23-25=26-27; class:28-33=34-37; attr1:38-43=47-60; attr2:62-67=69-78; attr3:80-85=85-85; attr4:89-94=94-94; attr5:95-100=100-100; ", track.toString());
}

@Test void tracksAttributesAcrossLines() {
String html = "<div one=\"Hello\nthere\" \nid=1 \nclass=\nfoo\nattr5>Text";
Document doc = Jsoup.parse(html, TrackingParser);
Document doc = Jsoup.parse(html, TrackingHtmlParser);

Element div = doc.expectFirst("div");

Expand All @@ -368,6 +369,91 @@ private void printRange(Node node) {
assertEquals("one:5-8=10-21; id:24-26=27-28; class:30-35=37-40; attr5:41-46=46-46; ", track.toString());
}

@Test void trackAttributePositionInFirstElement() {
String html = "<html lang=en class=dark><p hidden></p></html>";

Document htmlDoc = Jsoup.parse(html, TrackingHtmlParser);
StringBuilder htmlPos = new StringBuilder();
htmlDoc.expectFirst("html").nodeStream().forEach(node -> {
accumulatePositions(node, htmlPos);
accumulateAttributePositions(node, htmlPos);
});

assertEquals("html:0-25~39-46; lang:6-10=11-13; class:14-19=20-24; head:25-25~25-25; body:25-25~46-46; p:25-35~35-39; hidden:28-34=34-34; ", htmlPos.toString());

Document xmlDoc = Jsoup.parse(html, TrackingXmlParser);
StringBuilder xmlPos = new StringBuilder();
xmlDoc.expectFirst("html").nodeStream().forEach(node -> {
accumulatePositions(node, xmlPos);
accumulateAttributePositions(node, xmlPos);
});

assertEquals("html:0-25~39-46; lang:6-10=11-13; class:14-19=20-24; p:25-35~35-39; hidden:28-34=34-34; ", xmlPos.toString());
}

@Test void trackAttributePositionWithCase() {
String pomXml = "<project xmlns=\"http://maven.apache.org/POM/4.0.0\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd\">\n" +
" <modelVersion>4.0.0</modelVersion>";

Document htmlDoc = Jsoup.parse(pomXml, TrackingHtmlParser);
StringBuilder htmlPos = new StringBuilder();
htmlDoc.expectFirst("html").nodeStream().forEach(node -> {
accumulatePositions(node, htmlPos);
accumulateAttributePositions(node, htmlPos);
});

assertEquals("html:0-0~243-243; head:0-0~0-0; body:0-0~243-243; project:0-204~243-243; xmlns:9-14=16-49; xmlns:xsi:51-60=62-103; xsi:schemalocation:105-123=125-202; #text:204-209; modelversion:209-223~228-243; #text:223-228; ", htmlPos.toString());

Document xmlDoc = Jsoup.parse(pomXml, TrackingXmlParser);
StringBuilder xmlPos = new StringBuilder();
xmlDoc.expectFirst("project").nodeStream().forEach(node -> {
accumulatePositions(node, xmlPos);
accumulateAttributePositions(node, xmlPos);
});

assertEquals("project:0-204~243-243; xmlns:9-14=16-49; xmlns:xsi:51-60=62-103; xsi:schemaLocation:105-123=125-202; #text:204-209; modelVersion:209-223~228-243; #text:223-228; ", xmlPos.toString());

Document xmlDocLc = Jsoup.parse(pomXml, Parser.xmlParser().setTrackPosition(true).settings(new ParseSettings(false, false)));
StringBuilder xmlPosLc = new StringBuilder();
xmlDocLc.expectFirst("project").nodeStream().forEach(node -> {
accumulatePositions(node, xmlPosLc);
accumulateAttributePositions(node, xmlPosLc);
});

assertEquals("project:0-204~243-243; xmlns:9-14=16-49; xmlns:xsi:51-60=62-103; xsi:schemalocation:105-123=125-202; #text:204-209; modelversion:209-223~228-243; #text:223-228; ", xmlPosLc.toString());
}


@Test void trackAttributesPositionsDedupes() {
String html = "<p id=1 id=2 Id=3 Id=4 id=5 Id=6>";
Document htmlDoc = Jsoup.parse(html, TrackingHtmlParser);
Document htmlDocUc = Jsoup.parse(html, Parser.htmlParser().setTrackPosition(true).settings(new ParseSettings(true, true)));
Document xmlDoc = Jsoup.parse(html, TrackingXmlParser);
Document xmlDocLc = Jsoup.parse(html, Parser.xmlParser().setTrackPosition(true).settings(new ParseSettings(false, false)));

StringBuilder htmlPos = new StringBuilder();
StringBuilder htmlUcPos = new StringBuilder();
StringBuilder xmlPos = new StringBuilder();
StringBuilder xmlLcPos = new StringBuilder();

accumulateAttributePositions(htmlDoc .expectFirst("p"), htmlPos);
accumulateAttributePositions(htmlDocUc .expectFirst("p"), htmlUcPos);
accumulateAttributePositions(xmlDoc .expectFirst("p"), xmlPos);
accumulateAttributePositions(xmlDocLc .expectFirst("p"), xmlLcPos);

assertEquals("id:3-5=6-7; ", htmlPos .toString());
assertEquals("id:3-5=6-7; Id:13-15=16-17; ", htmlUcPos .toString());
assertEquals("id:3-5=6-7; Id:13-15=16-17; ", xmlPos .toString());
assertEquals("id:3-5=6-7; ", xmlLcPos .toString());
}

static void accumulateAttributePositions(Node node, StringBuilder sb) {
if (node instanceof LeafNode) return; // leafnode pseudo attributes are not tracked
for (Attribute attribute : node.attributes()) {
accumulatePositions(attribute, sb);
}
}

static void accumulatePositions(Attribute attr, StringBuilder sb) {
Range.AttributeRange range = attr.sourceRange();

Expand Down

0 comments on commit fc4b175

Please sign in to comment.