Skip to content

Commit

Permalink
avoid problem when empty values are provided
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Oct 13, 2023
1 parent 5b4389a commit 9f97e06
Showing 1 changed file with 36 additions and 5 deletions.
41 changes: 36 additions & 5 deletions src/main/java/org/grobid/core/engines/MaterialParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@
import org.slf4j.LoggerFactory;

import javax.inject.Inject;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.*;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import static org.apache.commons.collections4.CollectionUtils.isEmpty;
import static org.apache.commons.collections4.CollectionUtils.isNotEmpty;
Expand Down Expand Up @@ -90,8 +90,39 @@ public List<Material> process(String text) {
}

/**
 * Processes a batch of texts and returns one result list per input text, in input order.
 *
 * Texts for which {@code StringUtils.isBlank} returns true are not forwarded to
 * {@link #processParallelLT(List)}; their position in the returned list holds a new empty list
 * instead, so the output always has the same size as {@code texts}.
 *
 * @param texts the texts to process; individual entries may be blank
 * @return a new list with exactly {@code texts.size()} entries, aligned with {@code texts}
 */
public List<List<Material>> processParallel(List<String> texts) {
    // Only non-blank texts are tokenized and sent to the batch processing.
    List<List<LayoutToken>> asLayoutTokens = texts.stream()
        .filter(text -> !StringUtils.isBlank(text))
        .map(SuperconductorsParser::textToLayoutTokens)
        .collect(Collectors.toList());

    List<List<Material>> processed = processParallelLT(asLayoutTokens);

    // Re-align the results with the original input: walk the inputs once, consuming the next
    // processed result for each non-blank text and emitting an empty list for each blank one.
    // The list returned by processParallelLT is only read, never modified.
    List<List<Material>> output = new ArrayList<>(texts.size());
    int nextProcessed = 0;
    for (String text : texts) {
        if (StringUtils.isBlank(text)) {
            output.add(new ArrayList<>());
        } else {
            output.add(processed.get(nextProcessed));
            nextProcessed++;
        }
    }

    return output;
}

public List<List<Material>> processParallelLT(List<List<LayoutToken>> layoutTokensBatch) {
Expand Down

0 comments on commit 9f97e06

Please sign in to comment.