-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#57 - Create docker images for components and applications
- Created Docker image for the PDF-to-XMI pipeline
- Renamed XmiNER... files to XmiNer...
- Loading branch information
Showing
7 changed files
with
195 additions
and
59 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
<!--
  Galaxy tool wrapper for the PDF to XMI conversion pipeline.

  Copies every dataset of the input PDF collection into a scratch directory
  (named after its element identifier), runs PdfXmiPipeline on that directory
  inside the omtd-ss-pdf-xmi-docker container, and lets Galaxy discover the
  files written to the "out" directory as a list collection of XMI datasets.

  NOTE(review): id and name read "SS_XMI-NER" although the description says
  this is the PDF to XMI component; this looks like a copy/paste leftover.
  They are left unchanged here because the id is the public identifier of the
  tool. Confirm and rename if a separate XMI-NER tool uses the same id.
-->
<tool id="SS_XMI-NER" name="SS_XMI-NER" version="1.0">
    <description>PDF to XMI conversion Component</description>
    <requirements>
        <container type="docker">omtd-ss-pdf-xmi-docker</container>
    </requirements>
    <!--
      Every step is chained explicitly with a shell "and" operator: the lines
      of the template end up on one shell command line, so a step without a
      separator would swallow the following line as extra arguments (this
      previously happened to the final cleanup). Chaining also stops at the
      first failing step instead of hiding its exit status.
      Paths are single-quoted so element identifiers containing spaces work.
    -->
    <command><![CDATA[
        mkdir -p tmp &&
        #for $file in $pdf_files
            cp '$file' 'tmp/$file.element_identifier' &&
        #end for
        java -cp /opt/ss-io-pdf/ss-io-pdf-1.0.1-SNAPSHOT-pdf-xmi-pipeline-standalone.jar eu.openminted.uc.socialsciences.io.docker.PdfXmiPipeline tmp '$output.job_working_directory/working/out/' &&
        rm -r tmp
    ]]></command>
    <inputs>
        <param type="data_collection" collection_type="list" name="pdf_files" label="Input PDF files" format="pdf" />
    </inputs>
    <outputs>
        <collection name="output" type="list" label="XMI files">
            <discover_datasets pattern="__designation__" directory="out" format="xmi" visible="true" />
        </collection>
    </outputs>
</tool>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Image for the PDF to XMI pipeline: Ubuntu 14.04 + Oracle Java 8 + xmlstarlet
# + the ss-io-pdf standalone pipeline jar under /opt/ss-io-pdf/.
#
# The build script invokes `docker build ... -f ./omtd-ss-pdf-xmi.dockerfile ..`,
# i.e. the build context is the parent directory of the directory holding this
# file (presumably the repository root - confirm). All COPY sources below are
# therefore written relative to that context.
FROM ubuntu:14.04
MAINTAINER Masoud Kiaeeha <[email protected]>

RUN locale-gen en_US.UTF-8
ENV LANG='en_US.UTF-8' LANGUAGE='en_US:en' LC_ALL='en_US.UTF-8'

#ENV LANG C.UTF-8

# Install java
# -- -- --- - -- -- -- --- - --
RUN apt-get update && apt-get -y upgrade && apt-get -y install software-properties-common && add-apt-repository ppa:webupd8team/java -y && apt-get update
RUN (echo oracle-java8-installer shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections) && apt-get install -y oracle-java8-installer oracle-java8-set-default
ENV JAVA_HOME /usr/lib/jvm/java-8-oracle
ENV PATH $JAVA_HOME/bin:$PATH

# Install xmlstarlet
# The package index is refreshed in the same layer so that a cached, stale
# index from an earlier layer cannot break the install.
RUN apt-get update && apt-get -y install xmlstarlet

# Create the directory the jar is actually copied into (the previous version
# created /opt/ss-module-ner/, which this image does not use).
RUN mkdir -p /opt/ss-io-pdf/
# COPY sources must lie inside the build context, so the path must not start
# with "../"; it is resolved relative to the context root.
COPY ss-io-pdf/target/ss-io-pdf-1.0.1-SNAPSHOT-pdf-xmi-pipeline-standalone.jar /opt/ss-io-pdf/ss-io-pdf-1.0.1-SNAPSHOT-pdf-xmi-pipeline-standalone.jar
# -- -- --- - -- -- -- --- - --
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,4 @@ | ||
#!/bin/bash | ||
# Builds the project's Docker images. Each `docker build` call tags the image
# (-t), selects a dockerfile from the current directory (-f) and uses the
# parent directory (..) as the build context.
|
||
# NOTE(review): the hunk header (-1,8 +1,4) suggests that the variable-based
# lines below are the removed version of this script and that the two literal
# `docker build` lines at the end are the replacement - confirm against the
# repository before running this text as-is (it would build the NER image twice).
Dockerfile="./omtd-ss-xmi-ner.dockerfile" | ||
DockerImg="omtd-ss-xmi-ner-docker" | ||
|
||
docker build -t $DockerImg -f $Dockerfile .. | ||
|
||
|
||
# Image for the XMI NER component.
docker build -t "omtd-ss-xmi-ner-docker" -f "./omtd-ss-xmi-ner.dockerfile" .. | ||
# Image for the PDF to XMI pipeline.
docker build -t "omtd-ss-pdf-xmi-docker" -f "./omtd-ss-pdf-xmi.dockerfile" .. |
81 changes: 81 additions & 0 deletions
81
ss-io-pdf/src/main/java/eu/openminted/uc/socialsciences/io/docker/PdfXmiPipeline.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
package eu.openminted.uc.socialsciences.io.docker; | ||
|
||
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; | ||
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
|
||
import org.apache.uima.UIMAException; | ||
import org.apache.uima.analysis_engine.AnalysisEngine; | ||
import org.apache.uima.collection.CollectionReader; | ||
import org.apache.uima.fit.pipeline.SimplePipeline; | ||
import org.apache.uima.resource.ResourceInitializationException; | ||
|
||
import de.tudarmstadt.ukp.dkpro.core.io.xmi.XmiWriter; | ||
import eu.openminted.share.annotations.api.Component; | ||
import eu.openminted.share.annotations.api.DataFormat; | ||
import eu.openminted.share.annotations.api.ResourceInput; | ||
import eu.openminted.share.annotations.api.ResourceOutput; | ||
import eu.openminted.share.annotations.api.constants.ComponentConstants; | ||
import eu.openminted.uc.socialsciences.io.pdf.cermine.CerminePdfReader; | ||
|
||
@Component(value = ComponentConstants.ComponentTypeConstants.reader) | ||
@ResourceInput(type = "corpus", dataFormat = @DataFormat(dataFormat = "pdf", fileExtension = ".pdf" | ||
, mimeType = "application/pdf"), encoding = "UTF-8", keyword = "pdf") | ||
@ResourceOutput(type = "corpus", dataFormat = @DataFormat(fileExtension = ".xmi"), encoding = "UTF-8", keyword = "xmi") | ||
public class PdfXmiPipeline | ||
{ | ||
public static void main(String args[]) throws Exception | ||
{ | ||
assertArguments(args); | ||
|
||
String inputDirectory = args[0]; | ||
String outputDirectory = args[1]; | ||
|
||
File inputDirectoryFile = new File(inputDirectory); | ||
File outputDirectoryFile = new File(outputDirectory); | ||
|
||
if (!outputDirectoryFile.exists()) { | ||
outputDirectoryFile.mkdirs(); | ||
} | ||
|
||
assertDirectory(inputDirectoryFile); | ||
assertDirectory(outputDirectoryFile); | ||
|
||
createAndRunPipeline(inputDirectory, outputDirectory); | ||
} | ||
|
||
private static void createAndRunPipeline(String inputFolder, String outputFolder) | ||
throws ResourceInitializationException, UIMAException, IOException | ||
{ | ||
CollectionReader reader = createReader(CerminePdfReader.class, | ||
CerminePdfReader.PARAM_SOURCE_LOCATION, inputFolder, | ||
CerminePdfReader.PARAM_PATTERNS, "[+]**/*.pdf", | ||
CerminePdfReader.PARAM_NORMALIZE_TEXT, true); | ||
|
||
AnalysisEngine engine = createEngine(XmiWriter.class, | ||
XmiWriter.PARAM_TARGET_LOCATION, outputFolder, | ||
XmiWriter.PARAM_OVERWRITE, true); | ||
|
||
SimplePipeline.runPipeline(reader, engine); | ||
} | ||
|
||
private static void assertDirectory(File folder) | ||
{ | ||
if (!folder.isDirectory()) { | ||
System.err.println("[" + folder + "] is not a directory!"); | ||
System.exit(1); | ||
} | ||
} | ||
|
||
private static void assertArguments(String[] args) | ||
{ | ||
if (args == null || args.length < 2) { | ||
System.err.println("Two arguments should be provided!"); | ||
System.err.println("args[0]: input directory"); | ||
System.err.println("args[1]: output directory"); | ||
System.exit(1); | ||
} | ||
} | ||
} |
47 changes: 0 additions & 47 deletions
47
ss-module-ner/src/main/java/eu/openminted/uc/socialsciences/ner/docker/XmiNERPipeline.java
This file was deleted.
Oops, something went wrong.
65 changes: 65 additions & 0 deletions
65
ss-module-ner/src/main/java/eu/openminted/uc/socialsciences/ner/docker/XmiNerPipeline.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
package eu.openminted.uc.socialsciences.ner.docker; | ||
|
||
import java.io.File; | ||
|
||
import org.apache.log4j.LogManager; | ||
import org.apache.log4j.Logger; | ||
|
||
import eu.openminted.share.annotations.api.Component; | ||
import eu.openminted.share.annotations.api.DataFormat; | ||
import eu.openminted.share.annotations.api.ResourceInput; | ||
import eu.openminted.share.annotations.api.ResourceOutput; | ||
import eu.openminted.share.annotations.api.constants.ComponentConstants; | ||
import eu.openminted.uc.socialsciences.ner.Pipeline; | ||
|
||
@Component(value = ComponentConstants.ComponentTypeConstants.namedEntityRecognizer) | ||
@ResourceInput(type = "corpus", dataFormat = @DataFormat(fileExtension = ".xmi"), encoding = "UTF-8", keyword = "xmi") | ||
@ResourceOutput(type = "corpus", dataFormat = @DataFormat(fileExtension = ".xmi"), encoding = "UTF-8", keyword = "xmi") | ||
public class XmiNerPipeline | ||
{ | ||
private static final Logger logger = LogManager.getLogger(XmiNerPipeline.class); | ||
|
||
public static void main(String[] args) | ||
{ | ||
assertArguments(args); | ||
|
||
String inputDirectory = args[0]; | ||
String outputDirectory = args[1]; | ||
|
||
File inputDirectoryFile = new File(inputDirectory); | ||
File outputDirectoryFile = new File(outputDirectory); | ||
|
||
if (!outputDirectoryFile.exists()) { | ||
outputDirectoryFile.mkdirs(); | ||
} | ||
|
||
assertDirectory(inputDirectoryFile); | ||
assertDirectory(outputDirectoryFile); | ||
logger.info("Setting parameters for NER"); | ||
Pipeline pipelineNER = new Pipeline(); | ||
pipelineNER.setInput(inputDirectory + "[+]**/*.xmi"); | ||
pipelineNER.setOutput(outputDirectory); | ||
pipelineNER.setUseStanfordModels(false); | ||
logger.info("Running NER"); | ||
pipelineNER.run(); | ||
logger.info("NER finished"); | ||
} | ||
|
||
private static void assertDirectory(File folder) | ||
{ | ||
if (!folder.isDirectory()) { | ||
System.err.println("[" + folder + "] is not a directory!"); | ||
System.exit(1); | ||
} | ||
} | ||
|
||
private static void assertArguments(String[] args) | ||
{ | ||
if (args == null || args.length < 2) { | ||
System.err.println("Two arguments should be provided!"); | ||
System.err.println("args[0]: input directory"); | ||
System.err.println("args[1]: output directory"); | ||
System.exit(1); | ||
} | ||
} | ||
} |