Skip to content

Commit

Permalink
#57 - Create docker images for components and applications
Browse files Browse the repository at this point in the history
- created docker image for Pdf to Xmi pipeline
- renamed XmiNER... files to XmiNer...
  • Loading branch information
maxxkia committed Nov 16, 2017
1 parent b8bfa4b commit fd610cd
Show file tree
Hide file tree
Showing 7 changed files with 195 additions and 59 deletions.
22 changes: 22 additions & 0 deletions docker/PdfXmiGalaxyWrapper.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<!--
Galaxy tool wrapper for the PDF to XMI conversion component.
Runs eu.openminted.uc.socialsciences.io.docker.PdfXmiPipeline inside the
"omtd-ss-pdf-xmi-docker" container: the input PDF collection is staged into a
scratch directory, converted, and the files written to "out" are collected as
the output list.

NOTE(review): the id and name below ("SS_XMI-NER") are the same as those of the
XMI NER wrapper, while the description is about PDF to XMI conversion. This looks
like a copy and paste leftover; confirm whether a distinct tool id is intended.
-->
<tool id="SS_XMI-NER" name="SS_XMI-NER" version="1.0">
<description>PDF to XMI conversion Component</description>
<!-- Docker image that provides the standalone pipeline jar under /opt/ss-io-pdf/. -->
<requirements>
<container type="docker">omtd-ss-pdf-xmi-docker</container>
</requirements>
<!--
Command template: creates "tmp", copies every element of $pdf_files into it
under its element identifier, runs the pipeline with "tmp" as input directory
and the job's working/out/ directory as output directory, then removes "tmp".
-->
<command>
mkdir tmp;
#for $file in $pdf_files
cp $file tmp/$file.element_identifier;
#end for
java -cp /opt/ss-io-pdf/ss-io-pdf-1.0.1-SNAPSHOT-pdf-xmi-pipeline-standalone.jar eu.openminted.uc.socialsciences.io.docker.PdfXmiPipeline tmp $output.job_working_directory/working/out/
rm -r tmp;
</command>
<!-- Single input: a list collection of datasets in pdf format. -->
<inputs>
<param type="data_collection" collection_type="list" name="pdf_files" label="Input PDF files" format="pdf" />
</inputs>
<!-- Output list collection populated from the files found in the "out" directory, typed as xmi. -->
<outputs>
<collection name="output" type="list" label="XMI files">
<discover_datasets pattern="__designation__" directory="out" format="xmi" visible="true" />
</collection>
</outputs>
</tool>
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<tool id="SS_XMI-NER" name="SS_XMI-NER" version="1.0">
<description>NER Pipeline for social sciences text</description>
<description>NER Component for social sciences text</description>
<requirements>
<container type="docker">omtd-ss-xmi-ner-docker</container>
</requirements>
Expand All @@ -8,17 +8,15 @@
#for $file in $xmi_files
cp $file tmp/$file.element_identifier;
#end for
java -cp /opt/ss-module-ner/ss-module-ner-1.0.1-SNAPSHOT-ss-ner-standalone.jar eu.openminted.uc.socialsciences.ner.eval.PerformanceMeasure tmp $output.job_working_directory/working/out/
java -cp /opt/ss-module-ner/ss-module-ner-1.0.1-SNAPSHOT-ss-ner-standalone.jar eu.openminted.uc.socialsciences.ner.docker.XmiNerPipeline tmp $output.job_working_directory/working/out/
rm -r tmp;
</command>
<inputs>
<param type="data_collection" collection_type="list" name="xmi_files"
label="Input XMI files" format="xmi" />
<param type="data_collection" collection_type="list" name="xmi_files" label="Input XMI files" format="xmi" />
</inputs>
<outputs>
<collection name="output" type="list" label="XMI files">
<discover_datasets pattern="__designation__"
directory="out" format="xmi" visible="true" />
<discover_datasets pattern="__designation__" directory="out" format="xmi" visible="true" />
</collection>
</outputs>
</tool>
21 changes: 21 additions & 0 deletions docker/omtd-ss-pdf-xmi.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Image for the PDF-to-XMI pipeline.
# Provides Oracle Java 8, xmlstarlet and the standalone pipeline jar under
# /opt/ss-io-pdf/, which is the path the Galaxy wrapper invokes.
# Build with the repository root as build context (the build script passes
# ".." from the docker/ directory).
FROM ubuntu:14.04
MAINTAINER Masoud Kiaeeha <[email protected]>

# Generate and select a UTF-8 locale so Java and the shell agree on encoding.
RUN locale-gen en_US.UTF-8
ENV LANG='en_US.UTF-8' LANGUAGE='en_US:en' LC_ALL='en_US.UTF-8'

#ENV LANG C.UTF-8

# Install java
# -- -- --- - -- -- -- --- - --
RUN apt-get update && apt-get -y upgrade && apt-get -y install software-properties-common && add-apt-repository ppa:webupd8team/java -y && apt-get update
# Pre-accept the Oracle license so the installer can run non-interactively.
RUN (echo oracle-java8-installer shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections) && apt-get install -y oracle-java8-installer oracle-java8-set-default
ENV JAVA_HOME /usr/lib/jvm/java-8-oracle
ENV PATH $JAVA_HOME/bin:$PATH

# Install xmlstarlet
# Refresh the package index in the same layer so a cached earlier layer cannot
# leave this install working against stale package lists.
RUN apt-get update && apt-get -y install xmlstarlet

# Create the directory the jar is actually copied to (was /opt/ss-module-ner/).
RUN mkdir -p /opt/ss-io-pdf/
# COPY sources are resolved against the build context and may not point outside
# of it, so the path is given relative to the repository root (no leading "../").
COPY ss-io-pdf/target/ss-io-pdf-1.0.1-SNAPSHOT-pdf-xmi-pipeline-standalone.jar /opt/ss-io-pdf/ss-io-pdf-1.0.1-SNAPSHOT-pdf-xmi-pipeline-standalone.jar
# -- -- --- - -- -- -- --- - --
8 changes: 2 additions & 6 deletions docker/omtd-ss-workflows-createDockerImg.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
#!/bin/bash

Dockerfile="./omtd-ss-xmi-ner.dockerfile"
DockerImg="omtd-ss-xmi-ner-docker"

docker build -t $DockerImg -f $Dockerfile ..


docker build -t "omtd-ss-xmi-ner-docker" -f "./omtd-ss-xmi-ner.dockerfile" ..
docker build -t "omtd-ss-pdf-xmi-docker" -f "./omtd-ss-pdf-xmi.dockerfile" ..
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package eu.openminted.uc.socialsciences.io.docker;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader;

import java.io.File;
import java.io.IOException;

import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.ukp.dkpro.core.io.xmi.XmiWriter;
import eu.openminted.share.annotations.api.Component;
import eu.openminted.share.annotations.api.DataFormat;
import eu.openminted.share.annotations.api.ResourceInput;
import eu.openminted.share.annotations.api.ResourceOutput;
import eu.openminted.share.annotations.api.constants.ComponentConstants;
import eu.openminted.uc.socialsciences.io.pdf.cermine.CerminePdfReader;

/**
 * Command-line entry point that converts the PDF files of an input directory
 * into XMI files in an output directory.
 *
 * <p>Usage: {@code PdfXmiPipeline <input directory> <output directory>}. The
 * process exits with status 1 when fewer than two arguments are given or when
 * either path is not a directory.
 */
@Component(value = ComponentConstants.ComponentTypeConstants.reader)
@ResourceInput(type = "corpus",
        dataFormat = @DataFormat(dataFormat = "pdf", fileExtension = ".pdf", mimeType = "application/pdf"),
        encoding = "UTF-8", keyword = "pdf")
@ResourceOutput(type = "corpus",
        dataFormat = @DataFormat(fileExtension = ".xmi"),
        encoding = "UTF-8", keyword = "xmi")
public class PdfXmiPipeline
{
    /**
     * Validates the arguments, creates the output directory when it does not
     * exist yet, and runs the conversion.
     *
     * @param args {@code args[0]} is the input directory, {@code args[1]} the
     *             output directory
     * @throws Exception if building or running the pipeline fails
     */
    public static void main(String args[]) throws Exception
    {
        requireTwoArguments(args);

        final String sourcePath = args[0];
        final String targetPath = args[1];

        // The output location is created on demand; the directory check below
        // still catches the case where creation did not succeed.
        final File targetDir = new File(targetPath);
        if (!targetDir.exists()) {
            targetDir.mkdirs();
        }

        requireDirectory(new File(sourcePath));
        requireDirectory(targetDir);

        convert(sourcePath, targetPath);
    }

    /**
     * Prints a usage message and terminates the JVM with status 1 unless at
     * least two arguments were supplied.
     */
    private static void requireTwoArguments(String[] args)
    {
        if (args != null && args.length >= 2) {
            return;
        }
        System.err.println("Two arguments should be provided!");
        System.err.println("args[0]: input directory");
        System.err.println("args[1]: output directory");
        System.exit(1);
    }

    /**
     * Prints an error and terminates the JVM with status 1 unless the given
     * path is a directory.
     */
    private static void requireDirectory(File folder)
    {
        if (folder.isDirectory()) {
            return;
        }
        System.err.println("[" + folder + "] is not a directory!");
        System.exit(1);
    }

    /**
     * Builds a {@link CerminePdfReader} over {@code inputFolder} (include
     * pattern {@code [+]**}{@code /*.pdf}, text normalization enabled) and an
     * {@link XmiWriter} targeting {@code outputFolder} (overwrite enabled),
     * then runs them as a simple pipeline.
     */
    private static void convert(String inputFolder, String outputFolder)
        throws ResourceInitializationException, UIMAException, IOException
    {
        final CollectionReader pdfReader = createReader(CerminePdfReader.class,
                CerminePdfReader.PARAM_SOURCE_LOCATION, inputFolder,
                CerminePdfReader.PARAM_PATTERNS, "[+]**/*.pdf",
                CerminePdfReader.PARAM_NORMALIZE_TEXT, true);

        final AnalysisEngine xmiWriter = createEngine(XmiWriter.class,
                XmiWriter.PARAM_TARGET_LOCATION, outputFolder,
                XmiWriter.PARAM_OVERWRITE, true);

        SimplePipeline.runPipeline(pdfReader, xmiWriter);
    }
}

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package eu.openminted.uc.socialsciences.ner.docker;

import java.io.File;

import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

import eu.openminted.share.annotations.api.Component;
import eu.openminted.share.annotations.api.DataFormat;
import eu.openminted.share.annotations.api.ResourceInput;
import eu.openminted.share.annotations.api.ResourceOutput;
import eu.openminted.share.annotations.api.constants.ComponentConstants;
import eu.openminted.uc.socialsciences.ner.Pipeline;

/**
 * Command-line entry point that runs the named-entity recognition
 * {@link Pipeline} over the XMI files of an input directory and writes the
 * results to an output directory.
 *
 * <p>Usage: {@code XmiNerPipeline <input directory> <output directory>}. The
 * process exits with status 1 when fewer than two arguments are given or when
 * either path is not a directory.
 */
@Component(value = ComponentConstants.ComponentTypeConstants.namedEntityRecognizer)
@ResourceInput(type = "corpus",
        dataFormat = @DataFormat(fileExtension = ".xmi"),
        encoding = "UTF-8", keyword = "xmi")
@ResourceOutput(type = "corpus",
        dataFormat = @DataFormat(fileExtension = ".xmi"),
        encoding = "UTF-8", keyword = "xmi")
public class XmiNerPipeline
{
    private static final Logger logger = LogManager.getLogger(XmiNerPipeline.class);

    /**
     * Validates the arguments, creates the output directory when it does not
     * exist yet, then configures and runs the NER pipeline with the Stanford
     * models switched off.
     *
     * @param args {@code args[0]} is the input directory, {@code args[1]} the
     *             output directory
     */
    public static void main(String[] args)
    {
        requireTwoArguments(args);

        final String sourcePath = args[0];
        final String targetPath = args[1];

        // The output location is created on demand; the directory check below
        // still catches the case where creation did not succeed.
        final File targetDir = new File(targetPath);
        if (!targetDir.exists()) {
            targetDir.mkdirs();
        }

        requireDirectory(new File(sourcePath));
        requireDirectory(targetDir);

        logger.info("Setting parameters for NER");
        final Pipeline ner = new Pipeline();
        // NOTE(review): the pattern is appended without a path separator, so an
        // input such as "tmp" yields "tmp[+]**/*.xmi" - confirm that
        // Pipeline.setInput expects exactly this form.
        ner.setInput(sourcePath + "[+]**/*.xmi");
        ner.setOutput(targetPath);
        ner.setUseStanfordModels(false);

        logger.info("Running NER");
        ner.run();
        logger.info("NER finished");
    }

    /**
     * Prints a usage message and terminates the JVM with status 1 unless at
     * least two arguments were supplied.
     */
    private static void requireTwoArguments(String[] args)
    {
        if (args != null && args.length >= 2) {
            return;
        }
        System.err.println("Two arguments should be provided!");
        System.err.println("args[0]: input directory");
        System.err.println("args[1]: output directory");
        System.exit(1);
    }

    /**
     * Prints an error and terminates the JVM with status 1 unless the given
     * path is a directory.
     */
    private static void requireDirectory(File folder)
    {
        if (folder.isDirectory()) {
            return;
        }
        System.err.println("[" + folder + "] is not a directory!");
        System.exit(1);
    }
}

0 comments on commit fd610cd

Please sign in to comment.