kbss-cvut · psiotwo · Jul 23, 2021 · Jul 23, 2021 · Jul 23, 2021 · Jul 23, 2021
diff --git a/.github/workflows/before-push-to-termit.yml b/.github/workflows/before-push-to-termit.yml
@@ -0,0 +1,24 @@
+name: Before merge to 'termit'
+# Builds the project for every push to 'termit' and every pull request targeting it.
+on:
+  push:
+    branches: [ termit ]
+  pull_request:
+    branches: [ termit ]
+
+jobs:
+  build:
+    # The Gradle invocation below excludes the build tasks of lemmatizer-morphodita and lemmatizer-tests (-x).
+    runs-on: ubuntu-latest
+    # Steps: check out the sources, install JDK 11, make the wrapper executable, run the build.
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up JDK 11
+        uses: actions/setup-java@v2
+        with:
+          java-version: '11'
+          distribution: 'adopt'
+      - name: Grant execute permission for gradlew
+        run: chmod +x gradlew
+      - name: Build with Gradle
+        run: ./gradlew build -x :lemmatizer-morphodita:build -x :lemmatizer-tests:build
diff --git a/.github/workflows/on-push-to-termit.yml b/.github/workflows/on-push-to-termit.yml
@@ -0,0 +1,46 @@
+name: On push to 'termit'
+# Builds the project, builds the Docker image and pushes it to the GitHub registry on every push to 'termit' (or on manual dispatch).
+on:
+  push:
+    branches: [ termit ]
+  workflow_dispatch:
+env:
+  IMAGE_NAME: annotace-spark
+  USERNAME: ${{ github.actor }}
+  TOKEN: ${{ secrets.GITHUB_TOKEN }}
+jobs:
+  build-and-publish:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      # Pin the JDK exactly like the pre-merge workflow does instead of relying on the runner's default Java.
+      - name: Set up JDK 11
+        uses: actions/setup-java@v2
+        with:
+          java-version: '11'
+          distribution: 'adopt'
+      - name: Grant execute permission for gradlew
+        run: chmod +x gradlew
+      - name: Build with Gradle
+        run: ./gradlew build -x :lemmatizer-morphodita:build -x :lemmatizer-tests:build
+      - name: Build image
+        run: docker build . --file Dockerfile --tag "$IMAGE_NAME"
+      - name: Log into registry
+        # Credentials are read from the workflow-level env, so no expression is spliced into the script text.
+        run: echo "$TOKEN" | docker login docker.pkg.github.com -u "$USERNAME" --password-stdin
+      - name: Push image
+        run: |
+          # GITHUB_REPOSITORY and GITHUB_REF are default runner variables; reading them as shell variables avoids script injection via ref names.
+          IMAGE_ID=docker.pkg.github.com/$GITHUB_REPOSITORY/$IMAGE_NAME
+          # Change all uppercase to lowercase
+          IMAGE_ID=$(echo "$IMAGE_ID" | tr '[:upper:]' '[:lower:]')
+          # Strip git ref prefix from version
+          VERSION=$(echo "$GITHUB_REF" | sed -e 's,.*/\(.*\),\1,')
+          # Strip "v" prefix from tag name
+          [[ "$GITHUB_REF" == "refs/tags/"* ]] && VERSION=$(echo "$VERSION" | sed -e 's/^v//')
+          # Use Docker `latest` tag convention
+          [ "$VERSION" == "termit" ] && VERSION=latest
+          echo "IMAGE_ID=$IMAGE_ID"
+          echo "VERSION=$VERSION"
+          docker tag "$IMAGE_NAME" "$IMAGE_ID:$VERSION"
+          docker push "$IMAGE_ID:$VERSION"
diff --git a/.gitignore b/.gitignore
@@ -1 +1,6 @@
-.idea/**
+.idea/**
+.gradle
+**/build/
+*.iml
+**/out/
+lib
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,12 @@
+FROM gradle:8.0.2-jdk11-alpine as build
+RUN mkdir annotace
+WORKDIR /annotace
+COPY . .
+RUN gradle bootJar -Pcore,lemmatizer-spark,keywordextractor-ker
+# Runtime stage: receives only the JAR produced under core/build/libs in the build stage.
+FROM eclipse-temurin:11-jdk-alpine as runtime
+COPY --from=build /annotace/core/build/libs/*.jar /
+RUN mv annotace*.jar annotace.jar
+# Port 8080 is declared for the application, which is launched from /annotace.jar.
+EXPOSE 8080
+ENTRYPOINT ["java","-jar","/annotace.jar"]
diff --git a/Dockerfile-morphodita b/Dockerfile-morphodita
@@ -0,0 +1,35 @@
+ARG MORPHODITA_TAGGERS
+ARG MORPHODITA_ZIP
+ARG MORPHODITA_ZIP_SO
+
+########################################################################################################################
+# Stage "unzip": copies the tagger files to /taggers and unpacks the MorphoDiTa ZIP under /morphodita.
+FROM alpine as unzip
+ARG MORPHODITA_TAGGERS
+ARG MORPHODITA_ZIP
+RUN mkdir taggers
+RUN mkdir morphodita
+COPY $MORPHODITA_TAGGERS /taggers
+COPY $MORPHODITA_ZIP /morphodita
+WORKDIR /morphodita
+RUN unzip *.zip
+# Stage "buildMaven": builds the application JAR with Gradle (despite the stage name); tests are skipped via -x test.
+FROM gradle:8.0.2-jdk11-alpine as buildMaven
+ARG MORPHODITA_ZIP_SO
+RUN mkdir annotace
+WORKDIR /annotace
+COPY . .
+RUN gradle clean bootJar -x test
+# Stage "runtime": final image holding the JAR, the taggers and the file named by MORPHODITA_ZIP_SO (copied into /lib).
+FROM eclipse-temurin:11-jdk-alpine as runtime
+ARG MORPHODITA_ZIP_SO
+# Work around an issue with missing library on Alpine Linux - https://www.svlada.com/fun-times-with-gcc-musl-alpine-linux/
+RUN apk add --update --no-cache libc6-compat
+RUN cp /lib64/ld-linux-x86-64.so.2 /lib/
+COPY --from=buildMaven /annotace/core/build/libs/annotace-*.jar /
+RUN mv *.jar annotace.jar
+COPY --from=unzip /taggers .
+COPY --from=unzip /morphodita/$MORPHODITA_ZIP_SO /lib
+
+EXPOSE 8080
+ENTRYPOINT ["java","-jar","/annotace.jar"]
diff --git a/README.md b/README.md
@@ -1,4 +1,48 @@
-In order to run MorphoDiTa JNI, it is necessary to
- - Download MorphoDiTa 1.9.2 binaries
- - Take system library (.so on linux, .dll on Win) and put it on the Java library path (java.library.path system var)
- - Download and put necessary MorphoDiTa language models into src/main/resources
+# Annotace
+
+Annotace is a text analysis service used e.g. by [TermIt](https://github.com/kbss-cvut/termit) and its [web annotation plugin](https://github.com/alanbuzek/termit-extension).
+
+## How to run it?
+
+- Install Java 11
+- Run `./gradlew bootRun` (on Linux/WSL) or `gradlew.bat bootRun` on Windows
+
+## Lemmatizers
+
+Annotace supports two lemmatizer implementations: 
+
+- [Spark](https://sparknlp.org/)-based lemmatizer is more suitable for annotation of English texts. This is the default lemmatizer
+- [MorphoDiTa](https://ufal.mff.cuni.cz/morphodita)-based lemmatizer is more suitable for annotation of Czech or Slovak texts. It comes in two variants:
+  - JNI-based - runs locally using the MorphoDiTa library itself
+  - Service-based - invokes a remote annotation service (needs to be configured)
+
+## Setup
+
+Spark-based Annotace setup does not require any additional configuration or files. Either run it directly with `./gradlew bootRun`
+or use Docker. There is an [image](https://github.com/kbss-cvut/annotace/pkgs/container/annotace%2Fannotace-spark) (`ghcr.io/kbss-cvut/annotace/annotace-spark:latest`) published in the GitHub package registry.
+
+Running Annotace with MorphoDiTa is a bit more complicated.
+
+### Annotace with MorphoDiTa Locally
+
+1. Download the MorphoDiTa [ZIP archive](https://github.com/ufal/morphodita/releases/download/v1.9.2/morphodita-1.9.2-bin.zip) and extract it.
+2. Find a file with JNI bindings corresponding to your platform in the extracted directory. For 64-bit Linux the file is `morphodita-1.9.2-bin/bin-linux64/java/libmorphodita_java.so`.
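+3. Add the path to the **directory containing this file** to the Java library path, e.g. via the `java.library.path` system property or, on Linux, the `LD_LIBRARY_PATH` environment variable (as in the example below).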
+4. Provide a mapping of taggers (language models) to Annotace, either by editing `application.yml` before the build or by passing them as environment variables.
+5. Run Annotace with the MorphoDiTa lemmatizer by setting `ANNOTACE_LEMMATIZER` to `morphodita-jni`.
+
+A complete command line example would be: 
+`ANNOTACE_LEMMATIZER=morphodita-jni ANNOTACE_MORPHODITA_TAGGERS_CS=/opt/annotace/lib/czech-morfflex2.0-pdtc1.0-220710.tagger LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/annotace/lib/morphodita-1.9.2-bin/bin-linux64/java ./gradlew bootRun`
+
+
+### Annotace with MorphoDiTa in Docker
+
+1. Download the MorphoDiTa [ZIP archive](https://github.com/ufal/morphodita/releases/download/v1.9.2/morphodita-1.9.2-bin.zip).
+2. Set `MORPHODITA_ZIP` in `docker-compose-morphodita.yml` to path to the downloaded MorphoDiTa ZIP file.
+3. Download and extract taggers (language models). Put them into a single directory.
+4. Set `MORPHODITA_TAGGERS` in `docker-compose-morphodita.yml` to path to the taggers' directory.
+5. Run `docker compose -f docker-compose-morphodita.yml up -d --build` to build and start Annotace with MorphoDiTa.
+
+## License
+
+Annotace is licensed under GPL v3.0, Spark and MorphoDiTa are distributed under their respective licenses.
diff --git a/api/build.gradle b/api/build.gradle
@@ -0,0 +1,3 @@
+dependencies {
+    implementation(libs.jackson.annotations) // NOTE(review): presumably for the @Json* annotations on the API model classes — confirm
+}
diff --git a/api/src/main/java/cz/cvut/kbss/textanalysis/keywordextractor/KeywordExtractorAPI.java b/api/src/main/java/cz/cvut/kbss/textanalysis/keywordextractor/KeywordExtractorAPI.java
@@ -0,0 +1,8 @@
+package cz.cvut.kbss.textanalysis.keywordextractor;
+
+import cz.cvut.kbss.textanalysis.keywordextractor.model.KeywordExtractorResult;
+
+public interface KeywordExtractorAPI {
+    /** Processes the given input and returns the keyword extraction result for it. */
+    KeywordExtractorResult process(final String input);
+}
diff --git a/.../cz/cvut/kbss/textanalysis/Stopwords.java → ...tractor/model/KeywordExtractorResult.java b/.../cz/cvut/kbss/textanalysis/Stopwords.java → ...tractor/model/KeywordExtractorResult.java
@@ -16,22 +16,23 @@
  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  * © 2019 GitHub, Inc.
  */
-package cz.cvut.kbss.textanalysis;
+package cz.cvut.kbss.textanalysis.keywordextractor.model;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
 
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Files;
 import java.util.Collections;
 import java.util.List;
+import lombok.Data;
+/** Result of keyword extraction; holds the list of keywords. Unknown JSON properties are ignored. */
+@JsonIgnoreProperties(ignoreUnknown = true)
+@Data
+public class KeywordExtractorResult {
 
-public class Stopwords {
+    private List<String> keywords; // the extracted keywords
 
-    public List<String> getStopwords(){
-        try {
-            return Files.readAllLines(new File(Stopwords.class.getClassLoader().getResource("stopwords-Czech.txt").getFile()).toPath());
-        } catch (IOException e) {
-            e.printStackTrace();
-            return Collections.emptyList();
-        }
+    public static KeywordExtractorResult createEmpty() { // factory for a result without any keywords
+        KeywordExtractorResult response = new KeywordExtractorResult();
+        response.setKeywords(Collections.emptyList()); // Collections.emptyList() is immutable
+        return response;
     }
 }
diff --git a/api/src/main/java/cz/cvut/kbss/textanalysis/lemmatizer/LemmatizerApi.java b/api/src/main/java/cz/cvut/kbss/textanalysis/lemmatizer/LemmatizerApi.java
@@ -0,0 +1,32 @@
+/**
+ * Annotace Copyright (C) 2019 Czech Technical University in Prague
+ *
+ * This program is free software: you can redistribute it and/or modify it under the terms of the
+ * GNU General Public License as published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
+ * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along with this program.  If
+ * not, see <https://www.gnu.org/licenses/>. © 2019 GitHub, Inc.
+ */
+
+package cz.cvut.kbss.textanalysis.lemmatizer;
+
+import cz.cvut.kbss.textanalysis.lemmatizer.model.LemmatizerResult;
+import cz.cvut.kbss.textanalysis.lemmatizer.model.SingleLemmaResult;
+import java.util.List;
+
+public interface LemmatizerApi {
+
+    /**
+     * Lemmatizes the given text w.r.t. the given language.
+     *
+     * @param text text to lemmatize
+     * @param lang language of the text; NOTE(review): the accepted format (e.g. a language code) is not visible here — confirm with the implementations
+     * @return result of the lemmatization
+     */
+    LemmatizerResult process(String text, String lang);
+}
diff --git a/.../textanalysis/model/MorphoDitaResult.java → ...is/lemmatizer/model/LemmatizerResult.java b/.../textanalysis/model/MorphoDitaResult.java → ...is/lemmatizer/model/LemmatizerResult.java
@@ -16,34 +16,21 @@
  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  * © 2019 GitHub, Inc.
  */
-package cz.cvut.kbss.textanalysis.model;
+package cz.cvut.kbss.textanalysis.lemmatizer.model;
 
 import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
 import com.fasterxml.jackson.annotation.JsonProperty;
 
 import java.util.List;
+import lombok.Data;
 
 @JsonIgnoreProperties(ignoreUnknown = true)
-public class MorphoDitaResult {
+@Data // Lombok generates getters, setters, equals/hashCode and toString
+public class LemmatizerResult {
 
     @JsonProperty
-    private List<List<MorphoDitaResultJson>> result;
+    private String lemmatizer; // NOTE(review): presumably identifies the lemmatizer that produced this result — confirm
 
-    public MorphoDitaResult() {
-    }
-
-    public List<List<MorphoDitaResultJson>> getResult() {
-        return result;
-    }
-
-    public void setResult(List<List<MorphoDitaResultJson>> result) {
-        this.result = result;
-    }
-
-    @Override
-    public String toString() {
-        return "MorphoDitaResult{" +
-                "result=" + result +
-                '}';
-    }
+    @JsonProperty
+    private List<List<SingleLemmaResult>> result; // nested lists of per-token results; NOTE(review): grouping of the outer list (sentences?) — confirm
 }
diff --git a/...vice/morphodita/MorphoDitaServiceAPI.java → ...s/lemmatizer/model/SingleLemmaResult.java b/...vice/morphodita/MorphoDitaServiceAPI.java → ...s/lemmatizer/model/SingleLemmaResult.java
@@ -16,12 +16,18 @@
  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  * © 2019 GitHub, Inc.
  */
-package cz.cvut.kbss.textanalysis.service.morphodita;
+package cz.cvut.kbss.textanalysis.lemmatizer.model;
 
-import cz.cvut.kbss.textanalysis.model.MorphoDitaResultJson;
-import java.util.List;
+import lombok.Data;
 
-public interface MorphoDitaServiceAPI {
+@Data // Lombok generates getters, setters, equals/hashCode and toString
+public class SingleLemmaResult {
 
-    List<List<MorphoDitaResultJson>> getMorphoDiteResultProcessed(String s);
+    private String token; // NOTE(review): presumably the token as it appears in the input text — confirm
+
+    private String lemma; // lemma assigned to the token
+
+    private String spaces; // NOTE(review): presumably the whitespace adjacent to the token — confirm
+
+    private boolean negated; // NOTE(review): presumably marks a negated word form — confirm
 }
diff --git a/build.gradle b/build.gradle
@@ -0,0 +1,65 @@
+plugins {
+    id "org.springframework.boot" version "2.7.10" apply false
+    id "io.spring.dependency-management" version "1.0.11.RELEASE" apply false
+}
+
+group "cz.cvut.kbss"
+description "Text analysis for Czech language and annotation recommendation service"
+
+// Runs a git command and returns its trimmed output, or the fallback when git is not
+// installed or the command fails (e.g. when building from a source archive without .git).
+def gitOutput = { List<String> args, String fallback ->
+    try {
+        def process = (["git"] + args).execute()
+        def output = process.text.trim()
+        return (process.waitFor() == 0 && output) ? output : fallback
+    } catch (IOException ignored) {
+        return fallback
+    }
+}
+
+// Version = base version + number of commits on HEAD + short hash of HEAD.
+def baseVersion = "0.0.1"
+def revision = gitOutput(["rev-list", "--count", "HEAD"], "0")
+def hash = gitOutput(["rev-parse", "--short", "HEAD"], "unknown")
+version = "${baseVersion}.r${revision}.${hash}"
+
+ext {
+    junitVersion = "5.9.2"
+}
+
+subprojects {
+    apply plugin: "java"
+    apply plugin: "java-library"
+
+    compileJava {
+        sourceCompatibility = "11"
+        targetCompatibility = "11"
+    }
+
+    test {
+        useJUnitPlatform()
+    }
+
+    group parent.group
+    version parent.version
+
+    repositories {
+        mavenCentral()
+        maven {
+            name = "kbss-private"
+            url = uri("https://kbss.felk.cvut.cz/m2repo-private")
+        }
+    }
+
+    dependencies {
+        implementation "org.slf4j:slf4j-api:1.7.36"
+        implementation "ch.qos.logback:logback-classic:1.2.11"
+        compileOnly "org.projectlombok:lombok:1.18.20"
+
+        annotationProcessor "org.projectlombok:lombok:1.18.20"
+
+        testImplementation(libs.junit.api)
+        testRuntimeOnly(libs.junit.engine)
+    }
+}