-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[kbss-cvut/termi-ui#451] Implement support for getting file content without unconfirmed term occurrences.
- Loading branch information
Showing
7 changed files
with
141 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
15 changes: 15 additions & 0 deletions
15
src/main/java/cz/cvut/kbss/termit/exception/FileContentProcessingException.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
package cz.cvut.kbss.termit.exception; | ||
|
||
/** | ||
* Indicates an error when processing file content. | ||
*/ | ||
public class FileContentProcessingException extends TermItException { | ||
|
||
public FileContentProcessingException(String message) { | ||
super(message); | ||
} | ||
|
||
public FileContentProcessingException(String message, Throwable cause) { | ||
super(message, cause); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
51 changes: 51 additions & 0 deletions
51
...main/java/cz/cvut/kbss/termit/service/document/html/UnconfirmedTermOccurrenceRemover.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
package cz.cvut.kbss.termit.service.document.html; | ||
|
||
import cz.cvut.kbss.termit.exception.FileContentProcessingException; | ||
import cz.cvut.kbss.termit.service.export.util.TypeAwareByteArrayResource; | ||
import cz.cvut.kbss.termit.util.TypeAwareResource; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Node; | ||
import org.jsoup.select.Elements; | ||
|
||
import java.io.BufferedWriter; | ||
import java.io.ByteArrayOutputStream; | ||
import java.io.IOException; | ||
import java.io.OutputStreamWriter; | ||
import java.nio.charset.StandardCharsets; | ||
|
||
/** | ||
* Removes unconfirmed term occurrences from content. | ||
*/ | ||
public class UnconfirmedTermOccurrenceRemover { | ||
|
||
|
||
/** | ||
* Removes unconfirmed term occurrences from the specified input. | ||
* <p> | ||
* Removing such occurrences means the corresponding elements are replaced with their text content. | ||
* <p> | ||
* An occurrence is considered unconfirmed when it has a confidence score, confirmed occurrences do not have | ||
* scores. | ||
* | ||
* @param input Input to process | ||
* @return Processed content | ||
*/ | ||
public TypeAwareResource removeUnconfirmedOccurrences(TypeAwareResource input) { | ||
try { | ||
final Document doc = Jsoup.parse(input.getInputStream(), StandardCharsets.UTF_8.name(), ""); | ||
doc.outputSettings().prettyPrint(false); | ||
final Elements spanElements = doc.select("span[score]"); | ||
spanElements.forEach(Node::unwrap); | ||
|
||
final ByteArrayOutputStream out = new ByteArrayOutputStream(); | ||
final BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out)); | ||
writer.write(doc.toString()); // Write modified HTML to output stream | ||
writer.close(); | ||
return new TypeAwareByteArrayResource(out.toByteArray(), input.getMediaType().orElse(null), | ||
input.getFileExtension().orElse(null)); | ||
} catch (IOException e) { | ||
throw new FileContentProcessingException("Unable to read resource for unconfirmed occurrence removal.", e); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
42 changes: 42 additions & 0 deletions
42
.../java/cz/cvut/kbss/termit/service/document/html/UnconfirmedTermOccurrenceRemoverTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
package cz.cvut.kbss.termit.service.document.html; | ||
|
||
import cz.cvut.kbss.termit.environment.Environment; | ||
import cz.cvut.kbss.termit.service.export.util.TypeAwareByteArrayResource; | ||
import cz.cvut.kbss.termit.util.TypeAwareResource; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.junit.jupiter.api.Test; | ||
import org.springframework.http.MediaType; | ||
|
||
import java.io.InputStream; | ||
import java.nio.charset.StandardCharsets; | ||
|
||
import static org.junit.jupiter.api.Assertions.assertFalse; | ||
import static org.junit.jupiter.api.Assertions.assertTrue; | ||
|
||
class UnconfirmedTermOccurrenceRemoverTest { | ||
|
||
@Test | ||
void removeUnconfirmedOccurrencesReturnsContentWithoutSpansWithScoreThatIndicateOccurrenceIsUnconfirmed() | ||
throws Exception { | ||
TypeAwareResource input; | ||
try (final InputStream is = Environment.loadFile("data/rdfa-simple.html")) { | ||
input = new TypeAwareByteArrayResource(is.readAllBytes(), MediaType.TEXT_HTML_VALUE, ".html"); | ||
} | ||
final TypeAwareResource result = new UnconfirmedTermOccurrenceRemover().removeUnconfirmedOccurrences(input); | ||
final Document doc = Jsoup.parse(result.getInputStream(), StandardCharsets.UTF_8.name(), ""); | ||
assertTrue(doc.select("span[score]").isEmpty()); | ||
} | ||
|
||
@Test | ||
void removeUnconfirmedOccurrencesPreservesSpansWithoutScoreRepresentingConfirmedOccurrences() throws Exception { | ||
TypeAwareResource input; | ||
try (final InputStream is = Environment.loadFile("data/rdfa-simple-no-score.html")) { | ||
input = new TypeAwareByteArrayResource(is.readAllBytes(), MediaType.TEXT_HTML_VALUE, ".html"); | ||
} | ||
final TypeAwareResource result = new UnconfirmedTermOccurrenceRemover().removeUnconfirmedOccurrences(input); | ||
final Document doc = Jsoup.parse(result.getInputStream(), StandardCharsets.UTF_8.name(), ""); | ||
assertFalse( | ||
doc.select("span[resource='http://onto.fel.cvut.cz/ontologies/mpp/domains/uzemni-plan']").isEmpty()); | ||
} | ||
} |