Skip to content

Commit

Permalink
[kbss-cvut/termi-ui#451] Implement support for getting file content w…
Browse files Browse the repository at this point in the history
…ithout unconfirmed term occurrences.
  • Loading branch information
ledsoft committed Apr 16, 2024
1 parent 88af8e5 commit 94ae67a
Show file tree
Hide file tree
Showing 7 changed files with 141 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
/**
* Indicates a failure during document annotation generation.
*/
public class AnnotationGenerationException extends TermItException {
public class AnnotationGenerationException extends FileContentProcessingException {

public AnnotationGenerationException(String message) {
super(message);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package cz.cvut.kbss.termit.exception;

/**
 * Signals that the content of a file could not be processed.
 */
public class FileContentProcessingException extends TermItException {

    /**
     * Creates the exception with a message and the error that triggered it.
     *
     * @param message Description of the processing failure
     * @param cause   Underlying error
     */
    public FileContentProcessingException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates the exception with a message only.
     *
     * @param message Description of the processing failure
     */
    public FileContentProcessingException(String message) {
        super(message);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import cz.cvut.kbss.termit.service.document.DocumentManager;
import cz.cvut.kbss.termit.service.document.ResourceRetrievalSpecification;
import cz.cvut.kbss.termit.service.document.TextAnalysisService;
import cz.cvut.kbss.termit.service.document.html.UnconfirmedTermOccurrenceRemover;
import cz.cvut.kbss.termit.service.repository.ChangeRecordService;
import cz.cvut.kbss.termit.service.repository.ResourceRepositoryService;
import cz.cvut.kbss.termit.util.TypeAwareResource;
Expand Down Expand Up @@ -146,8 +147,13 @@ public TypeAwareResource getContent(Resource resource, ResourceRetrievalSpecific
Objects.requireNonNull(resource);
verifyFileOperationPossible(resource, "Content retrieval");
final File file = (File) resource;
return retrievalSpecification.at().map(instant -> documentManager.getAsResource(file, instant))
.orElseGet(() -> documentManager.getAsResource(file));
TypeAwareResource result = retrievalSpecification.at()
.map(instant -> documentManager.getAsResource(file, instant))
.orElseGet(() -> documentManager.getAsResource(file));
if (retrievalSpecification.withoutUnconfirmedOccurrences()) {
result = new UnconfirmedTermOccurrenceRemover().removeUnconfirmedOccurrences(result);
}
return result;
}

private void verifyFileOperationPossible(Resource resource, String operation) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package cz.cvut.kbss.termit.service.document.html;

import cz.cvut.kbss.termit.exception.FileContentProcessingException;
import cz.cvut.kbss.termit.service.export.util.TypeAwareByteArrayResource;
import cz.cvut.kbss.termit.util.TypeAwareResource;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;

/**
 * Removes unconfirmed term occurrences from content.
 */
public class UnconfirmedTermOccurrenceRemover {

    /**
     * Removes unconfirmed term occurrences from the specified input.
     * <p>
     * Removing such occurrences means the corresponding {@code span} elements are unwrapped, i.e., each element is
     * replaced with its child nodes (its content stays in the document, only the annotation markup is dropped).
     * <p>
     * An occurrence is considered unconfirmed when its element has a {@code score} attribute. Confirmed occurrences
     * do not have scores.
     * <p>
     * The input is read as UTF-8 and the result is encoded as UTF-8 as well, independently of the platform default
     * charset.
     *
     * @param input Input to process
     * @return Processed content with the same media type and file extension as the input
     * @throws FileContentProcessingException When the input cannot be read or the output cannot be written
     */
    public TypeAwareResource removeUnconfirmedOccurrences(TypeAwareResource input) {
        // The input stream is closed here explicitly so that it is released regardless of how the parser treats it
        try (final var is = input.getInputStream()) {
            final Document doc = Jsoup.parse(is, StandardCharsets.UTF_8.name(), "");
            // Keep whitespace as it was in the input, pretty printing would reformat the content
            doc.outputSettings().prettyPrint(false);
            final Elements spanElements = doc.select("span[score]");
            spanElements.forEach(Node::unwrap);

            final ByteArrayOutputStream out = new ByteArrayOutputStream();
            // Encode explicitly as UTF-8 (matching the charset used for parsing) instead of the platform default
            try (final BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
                writer.write(doc.toString()); // Write modified HTML to output stream
            }
            return new TypeAwareByteArrayResource(out.toByteArray(), input.getMediaType().orElse(null),
                                                  input.getFileExtension().orElse(null));
        } catch (IOException e) {
            throw new FileContentProcessingException("Unable to read resource for unconfirmed occurrence removal.", e);
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,12 @@ public boolean equals(Object o) {
if (this == o) {
return true;
}
if (!(o instanceof TypeAwareFileSystemResource)) {
if (!(o instanceof TypeAwareFileSystemResource that)) {
return false;
}
if (!super.equals(o)) {
return false;
}
TypeAwareFileSystemResource that = (TypeAwareFileSystemResource) o;
return Objects.equals(mediaType, that.mediaType);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
*/
package cz.cvut.kbss.termit.service.business;

import cz.cvut.kbss.termit.environment.Environment;
import cz.cvut.kbss.termit.environment.Generator;
import cz.cvut.kbss.termit.event.DocumentRenameEvent;
import cz.cvut.kbss.termit.event.FileRenameEvent;
Expand All @@ -33,19 +34,25 @@
import cz.cvut.kbss.termit.service.document.DocumentManager;
import cz.cvut.kbss.termit.service.document.ResourceRetrievalSpecification;
import cz.cvut.kbss.termit.service.document.TextAnalysisService;
import cz.cvut.kbss.termit.service.export.util.TypeAwareByteArrayResource;
import cz.cvut.kbss.termit.service.repository.ChangeRecordService;
import cz.cvut.kbss.termit.service.repository.ResourceRepositoryService;
import cz.cvut.kbss.termit.util.TypeAwareResource;
import cz.cvut.kbss.termit.util.Utils;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.*;
import org.mockito.junit.jupiter.MockitoExtension;
import org.springframework.context.ApplicationEventPublisher;
import org.springframework.http.MediaType;
import org.springframework.transaction.TransactionSystemException;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.time.Instant;
import java.util.*;

Expand Down Expand Up @@ -479,4 +486,20 @@ void getContentAtTimestampLoadsContentOfFileAtTimestampFromDocumentManager() {
sut.getContent(file, new ResourceRetrievalSpecification(Optional.of(at), false));
verify(documentManager).getAsResource(file, at);
}

@Test
void getContentWithoutUnconfirmedOccurrencesRemovesUnconfirmedOccurrencesFromFileContentBeforeReturningIt()
        throws Exception {
    // Fixture file name; the content itself is supplied by the mocked document manager below
    final File file = Generator.generateFileWithId("test.html");
    final TypeAwareResource content;
    try (final InputStream is = Environment.loadFile("data/rdfa-simple.html")) {
        content = new TypeAwareByteArrayResource(is.readAllBytes(), MediaType.TEXT_HTML_VALUE, ".html");
    }
    when(documentManager.getAsResource(file)).thenReturn(content);

    // No timestamp is specified, so the current content is retrieved and then post-processed
    final TypeAwareResource result = sut.getContent(file,
                                                    new ResourceRetrievalSpecification(Optional.empty(), true));
    final org.jsoup.nodes.Document doc = Jsoup.parse(result.getInputStream(), StandardCharsets.UTF_8.name(), "");
    // Spans carrying a score attribute must no longer be present in the returned content
    assertTrue(doc.select("span[score]").isEmpty());
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package cz.cvut.kbss.termit.service.document.html;

import cz.cvut.kbss.termit.environment.Environment;
import cz.cvut.kbss.termit.service.export.util.TypeAwareByteArrayResource;
import cz.cvut.kbss.termit.util.TypeAwareResource;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.jupiter.api.Test;
import org.springframework.http.MediaType;

import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

class UnconfirmedTermOccurrenceRemoverTest {

    @Test
    void removeUnconfirmedOccurrencesReturnsContentWithoutSpansWithScoreThatIndicateOccurrenceIsUnconfirmed()
            throws Exception {
        final Document processed = removeAndParse("data/rdfa-simple.html");

        assertTrue(processed.select("span[score]").isEmpty());
    }

    @Test
    void removeUnconfirmedOccurrencesPreservesSpansWithoutScoreRepresentingConfirmedOccurrences() throws Exception {
        final Document processed = removeAndParse("data/rdfa-simple-no-score.html");

        assertFalse(
                processed.select("span[resource='http://onto.fel.cvut.cz/ontologies/mpp/domains/uzemni-plan']")
                         .isEmpty());
    }

    /**
     * Loads the specified test file as an HTML resource, passes it through the remover and parses the result.
     *
     * @param fixturePath Path of the test file to load
     * @return Parsed document produced by the remover
     */
    private static Document removeAndParse(String fixturePath) throws Exception {
        final byte[] html;
        try (final InputStream is = Environment.loadFile(fixturePath)) {
            html = is.readAllBytes();
        }
        final TypeAwareResource input = new TypeAwareByteArrayResource(html, MediaType.TEXT_HTML_VALUE, ".html");
        final TypeAwareResource result = new UnconfirmedTermOccurrenceRemover().removeUnconfirmedOccurrences(input);
        return Jsoup.parse(result.getInputStream(), StandardCharsets.UTF_8.name(), "");
    }
}

0 comments on commit 94ae67a

Please sign in to comment.