Commit

Merge branch 'release/1.2.0'
Julien Plu committed Sep 1, 2016
2 parents db4f515 + 2be3f2e commit d65491e
Showing 20 changed files with 336 additions and 118 deletions.
90 changes: 30 additions & 60 deletions README.md
@@ -10,7 +10,7 @@ to get results in NIF format. The REST API is created via [Dropwizard](http://ww

# Requirements

-Java 1.8 and Maven 3.0.3 minimum. Docker (1.6 or later) is optional.
+Java 1.8 and Maven 3.0.5 minimum. Docker (1.6 or later) is optional.

# Maven

@@ -49,7 +49,7 @@ mvn clean verify -P all-tests
# Usage

```
-usage: java -jar stanfordNLPRESTAPI-1.1.5.jar
+usage: java -jar stanfordNLPRESTAPI-1.2.0.jar
[-h] [-v] {server,check,pos,ner} ...
positional arguments:
@@ -73,7 +73,7 @@ The first way is via CLI with two possible sub-commands, **ner** and **pos**.
To use the **ner** CLI:

```
-usage: java -jar stanfordNLPRESTAPI-1.1.5.jar
+usage: java -jar stanfordNLPRESTAPI-1.2.0.jar
ner -t TEXT [-f FORMAT] [-h] [file]
NER command on text
@@ -92,7 +92,7 @@ optional arguments:
To use the **pos** CLI:

```
-usage: java -jar stanfordNLPRESTAPI-1.1.5.jar
+usage: java -jar stanfordNLPRESTAPI-1.2.0.jar
pos -t TEXT [-f FORMAT] [-h] [file]
POS command on text
@@ -112,7 +112,7 @@ optional arguments:
The second way is via a Web service:

```
-usage: java -jar stanfordNLPRESTAPI-1.1.5.jar
+usage: java -jar stanfordNLPRESTAPI-1.2.0.jar
server [-h] [file]
Runs the Dropwizard application as an HTTP server
@@ -141,7 +141,7 @@ mvn docker:build
Once the image is built, it is possible to run it with:

```
-docker run -d -p 7000:7000 -p 7001:7001 jplu/stanford-nlp-rest-api:1.1.5.jar
+docker run -d -p 7000:7000 -p 7001:7001 -v models:/maven/models -v conf:/maven/conf jplu/stanford-nlp-rest-api:1.2.0
```

Or with:
@@ -150,62 +150,25 @@ Or with:
mvn docker:start
```

The container needs at most 5 minutes (depending on the power of your machine) to come up,
because it has to load all the Stanford CoreNLP models.

## Configuration

-The CLI commands and the Web service use the same configuration file (*conf/config.yaml*):
-
-```yaml
-pos:
-  model: "models/english-bidirectional-distsim.tagger"
-ner:
-  model: "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
-  useSuTime: false
-  applyNumericClassifiers: false
-parse:
-  model: "models/englishRNN.ser.gz"
-coref:
-  mdType: "rule"
-  mode: "statistical"
-  doClustering: "true"
-
-logging:
-  level: INFO
-  appenders:
-    - type: console
-      threshold: ALL
-      timeZone: UTC
-      target: stdout
-    - type: file
-      currentLogFilename: /logs/stanford.log
-      threshold: ALL
-      archive: true
-      archivedLogFilenamePattern: /logs/stanford-%d.log
-      archivedFileCount: 5
-      timeZone: UTC
-
-server:
-  requestLog:
-    enabled: true
-    appenders:
-      - type: console
-        threshold: ALL
-        timeZone: UTC
-        target: stdout
-      - type: file
-        currentLogFilename: /logs/stanford-queries.log
-        threshold: ALL
-        archive: true
-        archivedLogFilenamePattern: /logs/stanford-queries-%d.log
-        archivedFileCount: 5
-        timeZone: UTC
-  applicationConnectors:
-    - type: http
-      port: 7000
-  adminConnectors:
-    - type: http
-      port: 7001
-```
+The CLI commands and the Web service use the same [configuration file](https://github.com/jplu/stanfordNLPRESTAPI/blob/master/conf/config.yaml).

## Used Models

This application contains by default all the English models provided by the Stanford CoreNLP
team. If you want to add models, download them and put them in the *models* folder. You can
also download the jar files provided by Stanford with models for other languages; to use them,
include them in the CLASSPATH. We provide two models:

* OKE2016[1]: NER model trained with the OKE2016 challenge training dataset.
* NEEL2016[2][3][4]: NER model for tweets trained with the NEEL2016 challenge training dataset.

The model for POS tagging can be found on the GATE [website](https://gate.ac.uk/wiki/twitter-postagger.html);
download it and put it in the *models* folder.

# How to contribute

@@ -236,4 +199,11 @@ only one, so others will find your issue helpful, too. To open an issue:

# License

-This project is licensed under the terms of the GPL v3 license.
+All the content of this repository is licensed under the terms of the GPL v3 license.

# References

* [1]: Plu J., Rizzo G., Troncy R. (2016) Enhancing Entity Linking by Combining NER Models. In: 13th Extended Semantic Web Conference (ESWC'16), Challenges Track, Heraklion, Greece.
* [2]: Rizzo G., van Erp M., Plu J., Troncy R. (2015), NEEL 2016: Named Entity rEcognition & Linking Challenge Report. In (WWW'16), 6th International Workshop on Making Sense of Microposts (#Microposts'16), Montréal, Québec, Canada.
* [3]: Rizzo G., Cano A.E., Pereira B., Varga A. (2015), Making Sense of Microposts (#Microposts2015) Named Entity rEcognition & Linking Challenge. In (WWW'15), 5th International Workshop on Making Sense of Microposts (#Microposts'15), Florence, Italy.
* [4]: Cano A.E., Rizzo G., Varga A., Rowe M., Stankovic M., Dadzie A.S. (2014), Making Sense of Microposts (#Microposts2014) Named Entity Extraction & Linking Challenge. In (WWW'14), 4th International Workshop on Making Sense of Microposts (#Microposts'14), Seoul, Korea.
20 changes: 10 additions & 10 deletions conf/config.yaml
@@ -17,17 +17,17 @@
#

pos:
-  model: "models/english-bidirectional-distsim.tagger"
+  model: edu/stanford/nlp/models/pos-tagger/english-bidirectional/english-bidirectional-distsim.tagger
ner:
-  model: "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz"
+  model: edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz
  useSuTime: false
  applyNumericClassifiers: false
parse:
-  model: "models/englishRNN.ser.gz"
+  model: edu/stanford/nlp/models/lexparser/englishRNN.ser.gz
coref:
-  mdType: "rule"
-  mode: "statistical"
-  doClustering: "true"
+  mdType: rule
+  mode: statistical
+  doClustering: true

logging:
level: INFO
@@ -37,10 +37,10 @@ logging:
      timeZone: UTC
      target: stdout
    - type: file
-      currentLogFilename: /logs/stanford.log
+      currentLogFilename: logs/stanford.log
      threshold: ALL
      archive: true
-      archivedLogFilenamePattern: /logs/stanford-%d.log
+      archivedLogFilenamePattern: logs/stanford-%d.log
      archivedFileCount: 5
      timeZone: UTC

@@ -53,10 +53,10 @@ server:
        timeZone: UTC
        target: stdout
      - type: file
-        currentLogFilename: /logs/stanford-queries.log
+        currentLogFilename: logs/stanford-queries.log
        threshold: ALL
        archive: true
-        archivedLogFilenamePattern: /logs/stanford-queries-%d.log
+        archivedLogFilenamePattern: logs/stanford-queries-%d.log
        archivedFileCount: 5
        timeZone: UTC
applicationConnectors:
4 changes: 4 additions & 0 deletions logs/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
1 change: 1 addition & 0 deletions models/.gitignore
@@ -0,0 +1 @@
+gate-EN-twitter.model
Binary file renamed models/englishRNN.ser.gz → models/NEEL2016.ser.gz
100755 → 100644
Binary file not shown.
Binary file added models/OKE2016.gz
Binary file not shown.
Binary file removed models/english-bidirectional-distsim.tagger
Binary file not shown.
18 changes: 14 additions & 4 deletions pom.xml
@@ -23,7 +23,7 @@
<groupId>fr.eurecom</groupId>
<artifactId>stanfordNLPRESTAPI</artifactId>
<name>StanfordNLPRESTAPI</name>
-<version>1.1.5</version>
+<version>1.2.0</version>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
@@ -343,7 +343,7 @@
<version>1.8</version>
</requireJavaVersion>
<requireMavenVersion>
-<version>3.0.3</version>
+<version>3.0.5</version>
</requireMavenVersion>
</rules>
</configuration>
@@ -485,16 +485,26 @@
<port>7000</port>
<port>7001</port>
</ports>
+<volumes>
+  <volume>/maven/conf</volume>
+  <volume>/maven/models</volume>
+</volumes>
<workdir>/maven</workdir>
<cmd>
-<shell>java -Xmx4g -jar ${project.build.finalName}.jar server config.yaml</shell>
+<shell>java -Xmx4g -jar ${project.build.finalName}.jar server conf/config.yaml</shell>
</cmd>
</build>
<run>
<ports>
<port>7000:7000</port>
<port>7001:7001</port>
</ports>
+<volumes>
+  <bind>
+    <volume>${project.basedir}/conf:/maven/conf</volume>
+    <volume>${project.basedir}/models:/maven/models</volume>
+  </bind>
+</volumes>
<wait>
<http>
<url>http://localhost:7001/healthcheck</url>
@@ -532,7 +542,7 @@
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.6.0</version>
-<classifier>models</classifier>
+<classifier>models-english</classifier>
<exclusions>
<exclusion>
<groupId>joda-time</groupId>
12 changes: 0 additions & 12 deletions src/main/docker/docker-assembly.xml
@@ -25,17 +25,5 @@
<source>target/${project.build.finalName}.jar</source>
<outputDirectory>/</outputDirectory>
</file>
-<file>
-  <source>conf/config.yaml</source>
-  <outputDirectory>/</outputDirectory>
-</file>
-<file>
-  <source>models/englishRNN.ser.gz</source>
-  <outputDirectory>/models/</outputDirectory>
-</file>
-<file>
-  <source>models/english-bidirectional-distsim.tagger</source>
-  <outputDirectory>/models/</outputDirectory>
-</file>
</files>
</assembly>
10 changes: 5 additions & 5 deletions src/main/java/fr/eurecom/stanfordnlprestapi/cli/NerCommand.java
@@ -79,13 +79,13 @@ protected final void run(final Bootstrap<T> newBootstrap, final Namespace

if (newNamespace.get("format") == null || "turtle".equals(newNamespace.get("format"))
|| !"jsonld".equals(newNamespace.get("format"))) {
-      NerCommand.LOGGER.info(this.pipeline.run(newNamespace.getString("text")).rdfString(
-          "stanfordnlp", RDFFormat.TURTLE_PRETTY, NlpProcess.NER));
+      NerCommand.LOGGER.info(System.lineSeparator() + this.pipeline.run(newNamespace.getString(
+          "text")).rdfString("stanfordnlp", RDFFormat.TURTLE_PRETTY, NlpProcess.NER));
}

if ("jsonld".equals(newNamespace.get("format"))) {
-      NerCommand.LOGGER.info(this.pipeline.run(newNamespace.getString("text")).rdfString(
-          "stanfordnlp", RDFFormat.JSONLD_PRETTY, NlpProcess.NER));
+      NerCommand.LOGGER.info(System.lineSeparator() + this.pipeline.run(newNamespace.getString(
+          "text")).rdfString("stanfordnlp", RDFFormat.JSONLD_PRETTY, NlpProcess.NER));
}
}

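The `System.lineSeparator()` change above is cosmetic: it pushes the multi-line RDF serialization onto its own lines instead of gluing its first line to the logger's message prefix. A minimal sketch of the effect (the `INFO NerCommand:` prefix is illustrative, not Dropwizard's actual log format):

```java
public class LineSeparatorDemo {

  // Old behaviour: the first line of the RDF output is glued to the log prefix.
  static String logOld(String prefix, String rdf) {
    return prefix + rdf;
  }

  // New behaviour: a leading separator pushes the RDF block onto fresh lines.
  static String logNew(String prefix, String rdf) {
    return prefix + System.lineSeparator() + rdf;
  }

  public static void main(String[] args) {
    String rdf = "@prefix nif: <http://example.org/nif#> ." + System.lineSeparator()
        + "<urn:doc> a nif:String .";
    System.out.println(logOld("INFO NerCommand: ", rdf));
    System.out.println("---");
    System.out.println(logNew("INFO NerCommand: ", rdf));
  }
}
```

Using `System.lineSeparator()` rather than a hard-coded `"\n"` keeps the output consistent with the platform's newline convention.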
src/main/java/fr/eurecom/stanfordnlprestapi/cli/PosCommand.java
@@ -79,13 +79,13 @@ protected final void run(final Bootstrap<T> newBootstrap, final Namespace

if (newNamespace.get("format") == null || "turtle".equals(newNamespace.get("format"))
|| !"jsonld".equals(newNamespace.get("format"))) {
-      PosCommand.LOGGER.info(this.pipeline.run(newNamespace.getString("text")).rdfString(
-          "stanfordnlp", RDFFormat.TURTLE_PRETTY, NlpProcess.POS));
+      PosCommand.LOGGER.info(System.lineSeparator() + this.pipeline.run(newNamespace.getString(
+          "text")).rdfString("stanfordnlp", RDFFormat.TURTLE_PRETTY, NlpProcess.POS));
}

if ("jsonld".equals(newNamespace.get("format"))) {
-      PosCommand.LOGGER.info(this.pipeline.run(newNamespace.getString("text")).rdfString(
-          "stanfordnlp", RDFFormat.JSONLD_PRETTY, NlpProcess.POS));
+      PosCommand.LOGGER.info(System.lineSeparator() + this.pipeline.run(newNamespace.getString(
+          "text")).rdfString("stanfordnlp", RDFFormat.JSONLD_PRETTY, NlpProcess.POS));
}
}

@@ -41,6 +41,7 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
+import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -190,15 +191,18 @@ private void buildEntitiesFromSentence(final CoreMap stanfordSentence,
if (!"O".equals(token.get(CoreAnnotations.NamedEntityTagAnnotation.class))
&& sb.toString().isEmpty()) {
          start = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);

-          sb.append(token.get(CoreAnnotations.TextAnnotation.class));
-
          type = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);

-          if (stanfordSentence.get(CoreAnnotations.TokensAnnotation.class).indexOf(token)
+          sb.append(token.get(CoreAnnotations.TextAnnotation.class));
+
+          if (Pattern.compile("@|#").matcher(sb.toString()).find()
+              || stanfordSentence.get(CoreAnnotations.TokensAnnotation.class).indexOf(token)
              == stanfordSentence.get(CoreAnnotations.TokensAnnotation.class).size() - 1) {
            sentence.addEntity(new Entity(sb.toString(), type, sentence, context, start,
                token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)));

            sb = new StringBuilder();
+            type = "";
          }
} else if (!"O".equals(token.get(CoreAnnotations.NamedEntityTagAnnotation.class))) {
sb.append(' ');
@@ -210,7 +214,6 @@
token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)));
}
        } else if (!sb.toString().isEmpty()) {
-
final int index = stanfordSentence.get(
CoreAnnotations.TokensAnnotation.class).indexOf(token);
final int end = stanfordSentence.get(CoreAnnotations.TokensAnnotation.class).get(
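The new guard in `buildEntitiesFromSentence` flushes the entity buffer as soon as it contains an `@` or `#`, so Twitter mentions and hashtags are emitted immediately as single entities instead of being merged with the tokens that follow them. A standalone sketch of just that check (the helper name is ours, not the project's):

```java
import java.util.regex.Pattern;

public class TwitterMarkerCheck {

  // Same regex as the diff; hoisted into a constant so it is compiled once
  // rather than on every token, as the inline Pattern.compile call does.
  private static final Pattern TWITTER_MARKER = Pattern.compile("@|#");

  // Returns true when the buffered entity text should be flushed immediately.
  static boolean shouldFlush(String buffered) {
    return TWITTER_MARKER.matcher(buffered).find();
  }

  public static void main(String[] args) {
    System.out.println(shouldFlush("#NEEL2016")); // true: flush now
    System.out.println(shouldFlush("@jplu"));     // true: flush now
    System.out.println(shouldFlush("Barack"));    // false: keep accumulating
  }
}
```

This matches the commit's NEEL2016 focus on tweets, where mentions and hashtags are typically one-token entities.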
@@ -104,12 +104,6 @@ public final Model rdfModel(final String tool, final NlpProcess process) {
final Model model = ModelFactory.createDefaultModel();
final Map<String, String> prefixes = new HashMap<>();

-    prefixes.put("nif", nif);
-    prefixes.put("local", base);
-    prefixes.put("xsd", "http://www.w3.org/2001/XMLSchema#");
-
-    model.setNsPrefixes(prefixes);

model.add(ResourceFactory.createResource(base + "char=" + this.start + ',' + this.end),
RDF.type, ResourceFactory.createResource(nif + "String"));
model.add(ResourceFactory.createResource(base + "char=" + this.start + ',' + this.end),
@@ -127,6 +121,12 @@
model.add(ResourceFactory.createResource(base + "char=" + this.start + ',' + this.end),
ResourceFactory.createProperty(nif + "isString"),
ResourceFactory.createTypedLiteral(this.text));

+    prefixes.put("nif", nif);
+    prefixes.put("local", base);
+    prefixes.put("xsd", "http://www.w3.org/2001/XMLSchema#");
+
+    model.setNsPrefixes(prefixes);

for (final Sentence sentence : this.sentences) {
model.add(sentence.rdfModel(tool, process));
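The `base + "char=" + this.start + ',' + this.end` expressions above build NIF-style resource URIs that address a text span by its character offsets; the surrounding change only moves `setNsPrefixes` after the `model.add` calls, which affects serialization prefixes rather than the statements themselves. A toy reconstruction of the URI scheme, with a hypothetical base namespace (the real one comes from the application):

```java
public class NifSpanUri {

  // Hypothetical base namespace, for illustration only.
  static final String BASE = "http://example.org/doc#";

  // Mirrors the diff's base + "char=" + start + ',' + end expression.
  static String spanUri(int start, int end) {
    return BASE + "char=" + start + ',' + end;
  }

  public static void main(String[] args) {
    // A span covering characters 0..12 of the annotated text.
    System.out.println(spanUri(0, 12)); // http://example.org/doc#char=0,12
  }
}
```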
@@ -67,12 +67,14 @@ public PipelineResource(final String annotators) {
final Properties props = new Properties();

props.setProperty("annotators", annotators);
-    props.setProperty("pos.model", "models/english-bidirectional-distsim.tagger");
+    props.setProperty("pos.model",
+        "edu/stanford/nlp/models/pos-tagger/english-bidirectional/"
+            + "english-bidirectional-distsim.tagger");
props.setProperty("ner.model",
"edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz");
props.setProperty("ner.useSUTime", "false");
props.setProperty("ner.applyNumericClassifiers", "false");
-    props.setProperty("parse.model", "models/englishRNN.ser.gz");
+    props.setProperty("parse.model", "edu/stanford/nlp/models/lexparser/englishRNN.ser.gz");
props.setProperty("coref.doClustering", "true");
props.setProperty("coref.md.type", "rule");
props.setProperty("coref.mode", "statistical");
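With the switch to the `models-english` classifier in pom.xml, the default models are resolved from the Stanford CoreNLP models jar on the classpath rather than from the local *models* folder, which is why these paths change to `edu/stanford/nlp/models/...` resources. The long `pos.model` value is split across adjacent string literals only to satisfy line-length checks; the compiler concatenates them into a single path:

```java
public class PosModelPath {

  // Same compile-time concatenation as in the diff above.
  static final String POS_MODEL =
      "edu/stanford/nlp/models/pos-tagger/english-bidirectional/"
          + "english-bidirectional-distsim.tagger";

  public static void main(String[] args) {
    System.out.println(POS_MODEL);
  }
}
```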