From 5b4389a3691a679c29bcc0016b6f4c69e2298013 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Fri, 13 Oct 2023 13:11:28 +0900 Subject: [PATCH] add specialised request for parsing material names in parallel --- .../grobid/core/engines/MaterialParser.java | 49 +++++++++++++++++++ .../controller/MaterialController.java | 23 +++++++++ 2 files changed, 72 insertions(+) diff --git a/src/main/java/org/grobid/core/engines/MaterialParser.java b/src/main/java/org/grobid/core/engines/MaterialParser.java index 52ea4700..44648f0d 100644 --- a/src/main/java/org/grobid/core/engines/MaterialParser.java +++ b/src/main/java/org/grobid/core/engines/MaterialParser.java @@ -7,11 +7,13 @@ import org.apache.commons.text.StringEscapeUtils; import org.grobid.core.GrobidModel; import org.grobid.core.analyzers.DeepAnalyzer; +import org.grobid.core.data.document.Span; import org.grobid.core.data.material.ChemicalComposition; import org.grobid.core.data.material.Formula; import org.grobid.core.data.material.Material; import org.grobid.core.engines.label.TaggingLabel; import org.grobid.core.exceptions.GrobidException; +import org.grobid.core.exceptions.GrobidExceptionStatus; import org.grobid.core.features.FeaturesVectorMaterial; import org.grobid.core.layout.BoundingBox; import org.grobid.core.layout.LayoutToken; @@ -87,6 +89,53 @@ public List process(String text) { return process(SuperconductorsParser.textToLayoutTokens(text)); } + public List> processParallel(List texts) { + List> asLayoutTokens = texts.stream().map(SuperconductorsParser::textToLayoutTokens).collect(Collectors.toList()); + return processParallelLT(asLayoutTokens); + } + + public List> processParallelLT(List> layoutTokensBatch) { + + List> entities = new ArrayList<>(); + + //Normalisation + List> normalisedTokens = layoutTokensBatch.stream() + .map(SuperconductorsParser::normalizeAndRetokenizeLayoutTokens) + .toList(); + + try { + List tokensWithFeatures = normalisedTokens.stream().map(nt -> addFeatures(nt) + "\n").toList(); + + String labellingResult = null; + try { + labellingResult = label(tokensWithFeatures); + } catch (Exception e) { + throw new GrobidException("CRF labeling for superconductors parsing failed.", e); + } + + List resultingBlocks = Arrays.asList(labellingResult.split("\n\n")); + List> localEntities = extractParallelResults(normalisedTokens, resultingBlocks); + + entities.addAll(localEntities); + } catch (Exception e) { + throw new GrobidException("An exception occurred while running Grobid.", e); + } + + return entities; + } + + public List> extractParallelResults(List> tokens, List results) { + List> spans = new ArrayList<>(); + if (tokens.size() != results.size()) { + throw new GrobidException("One of the text provided is invalid or empty and cannot be tagged. Please provide a clean input.", GrobidExceptionStatus.BAD_INPUT_DATA); + } + for (int i = 0; i < tokens.size(); i++) { + spans.add(extractResults(tokens.get(i), results.get(i))); + } + + return spans; + } + public List process(List tokens) { diff --git a/src/main/java/org/grobid/service/controller/MaterialController.java b/src/main/java/org/grobid/service/controller/MaterialController.java index c9155478..87b892ce 100644 --- a/src/main/java/org/grobid/service/controller/MaterialController.java +++ b/src/main/java/org/grobid/service/controller/MaterialController.java @@ -12,6 +12,7 @@ import javax.ws.rs.Path; import javax.ws.rs.Produces; import javax.ws.rs.core.MediaType; +import java.util.Arrays; import java.util.List; @Singleton @@ -42,10 +43,32 @@ public List processTextSuperconductorsGet(@FormDataParam("text") Strin return parseMaterial(text); } + + @Path("parse2") + @Produces(MediaType.APPLICATION_JSON) + @POST + public List> processTextSuperconductorsPost2(@FormDataParam("texts") String texts) { + return parseMaterials(texts); + } + + @Path("parse2") + @Produces(MediaType.APPLICATION_JSON) + @GET + public List> processTextSuperconductorsGet2(@FormDataParam("texts") String texts) { + return parseMaterials(texts); + } + private List parseMaterial(@FormDataParam("text") String text) { String textPreprocessed = text.replace("\r\n", "\n"); return materialParser.process(textPreprocessed); } + private List> parseMaterials(@FormDataParam("text") String text) { + String textPreprocessed = text.replace("\r\n", "\n"); + + List list = Arrays.asList(textPreprocessed.split("\n")); + return materialParser.processParallel(list); + } + }