Skip to content

Commit

Permalink
Fix O(n!) tag name processing. Fixes dkpro#27.
Browse files Browse the repository at this point in the history
Also moves all initialization into constructor and simplifies it.
  • Loading branch information
tfmorris committed Apr 10, 2016
1 parent a5ea75b commit 775abef
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 50 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -458,11 +458,8 @@ public String getMinimalHtml(String html, Locale locale)
for (Paragraph p : paragraphs) {
if (!p.isBoilerplate()) {
// get the tag name
String tagNameOrig = p.getTagName();
String tag = p.getTagName();

// extract the tag name from a path such as "html.body.div.div.div.div.div.div.p."
String[] split = tagNameOrig.split("\\.");
String tag = split[split.length - 1];
// edited: the tag is sometimes empty because the element is a div or br; fall back to "p"
if (tag.trim().isEmpty()) {
tag = "p";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,38 +32,37 @@
public class Paragraph
extends LinkedList<Node>
{
private static final long serialVersionUID = 1L;

// private ArrayList<String> textNodes;
int charsCountInLinks = 0;
private String classType = "";
private String contextFreeClass = "";
private String tagName = "";
private String rawText = "";
private boolean isHeading = false;

public Paragraph(Node firstNode)
public Paragraph(Node firstNode, boolean heading)
{
add(firstNode);
}

public void initRawInfo()
{
StringBuilder sb = new StringBuilder();
for (Node n : this) {
// NodeHelper.cleanEmptyElements(n);
if (n instanceof TextNode) {
this.setTagName(getPath(n));
String nodeRawText = ((TextNode) n).text();
sb.append(Utils.normalizeBreaks(nodeRawText).trim());
Node node = firstNode;
while (NodeHelper.isInnerText(node) || node instanceof TextNode) {
node = node.parent();
}
if (node != null) {
this.tagName = node.nodeName();
}
this.isHeading = heading;
if (firstNode instanceof TextNode) {
String nodeRawText = ((TextNode) firstNode).text();
this.rawText = Utils.normalizeBreaks(nodeRawText).trim();

if (NodeHelper.isLink(n)) {
charsCountInLinks += nodeRawText.length();
}
if (NodeHelper.isLink(firstNode)) {
charsCountInLinks += nodeRawText.length();
}
}

rawText = sb.toString();
}


public int getLinksLength()
{
return this.charsCountInLinks;
Expand Down Expand Up @@ -94,29 +93,6 @@ public String getTagName()
return this.tagName;
}

public String getPath(Node n)
{
String nodePath = "";
while (n != null) {
if (n instanceof TextNode) {
n = n.parent();
}
if (NodeHelper.isInnerText(n)) {
n = n.parent();
}
String parentNodeName = n.nodeName();
nodePath = parentNodeName + "." + nodePath;

if (!parentNodeName.equalsIgnoreCase("html")) {
n = n.parent();
}
else {
break;
}
}

return nodePath;
}

public void setTagName(String name)
{
Expand All @@ -125,7 +101,7 @@ public void setTagName(String name)

public boolean isHeading()
{
return this.getTagName().matches(".*\\.h\\d\\.");
return isHeading;
}

public boolean isBoilerplate()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,34 @@
package de.tudarmstadt.ukp.dkpro.c4corpus.boilerplate.impl;

import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor;

import java.security.InvalidParameterException;
import java.util.LinkedList;
import java.util.regex.Pattern;

/**
* Extracts a list of paragraphs from an HTML page. A paragraph here means a block
* of the document that may or may not be boilerplate.
*
* @author Omnia Zayed; the original code was written by Phu-Hiep DUONG (found
* online, with some parts edited)
*
* FIXME: The repo referenced above appears to be https://github.com/duongphuhiep/justext
* but it includes no license.
*/
public class ParagraphsExplorer
implements NodeVisitor
{

private static final Pattern HEADING_PATTERN = Pattern.compile("h[1-6]");
private final LinkedList<Paragraph> paragraphs;
private final LinkedList<Node> nodes;
private boolean inHeading = false;
private int headingDepth = 0;

public enum AncestorState
{
Expand All @@ -49,12 +57,18 @@ public enum AncestorState
public ParagraphsExplorer()
{
this.paragraphs = new LinkedList<>();
nodes = new LinkedList<>();
this.nodes = new LinkedList<>();
}

@Override
public void head(Node node, int depth)
{
if (!inHeading && node instanceof Element) {
inHeading = HEADING_PATTERN.matcher(((Element) node).tagName()).matches();
if (inHeading) {
headingDepth = depth;
}
}
if (node.childNodeSize() == 0) {
if (node instanceof TextNode && StringUtil.isBlank(node.outerHtml())) {
return;
Expand All @@ -67,7 +81,10 @@ public void head(Node node, int depth)
/**
 * Visitor callback that clears the heading flag once the traversal is back
 * at the depth at which the current heading was recorded.
 *
 * @param node  the node being left (not inspected here)
 * @param depth the depth of {@code node}, compared against the recorded heading depth
 */
@Override
public void tail(Node node, int depth)
{
    if (headingDepth != depth) {
        // Not at the depth where the heading was recorded; keep the current state.
        return;
    }
    // Headings can't be nested, so reaching this depth again ends the heading.
    inHeading = false;
}

/**
Expand Down Expand Up @@ -101,6 +118,10 @@ private void mergeToResult(Node node)
return;
case INNERTEXT_ONLY:
appendToLastParagraph(node);
case UNKNOW:
break;
default:
break;
}
}

Expand Down Expand Up @@ -141,8 +162,7 @@ public static AncestorState getAncestorState(Node lastNode, Node currentNode)

private void insertAsNewParagraph(Node node)
{
Paragraph p = new Paragraph(node);
p.initRawInfo();
Paragraph p = new Paragraph(node, inHeading);
// if (!p.getRawText().isEmpty()) {
paragraphs.add(p);
// }
Expand Down

0 comments on commit 775abef

Please sign in to comment.