From 359a95db82f824f5d3a310c6299680addbbe609d Mon Sep 17 00:00:00 2001
From: Alexander Zagniotov <azagniotov@box.com>
Date: Sun, 17 Nov 2024 15:41:48 -0800
Subject: [PATCH 01/11] Renamed Gradle Kotlin default scripts

---
 build.gradle.kts => build.gradle.kts.retired       | 0
 settings.gradle.kts => settings.gradle.kts.retired | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename build.gradle.kts => build.gradle.kts.retired (100%)
 rename settings.gradle.kts => settings.gradle.kts.retired (100%)

diff --git a/build.gradle.kts b/build.gradle.kts.retired
similarity index 100%
rename from build.gradle.kts
rename to build.gradle.kts.retired
diff --git a/settings.gradle.kts b/settings.gradle.kts.retired
similarity index 100%
rename from settings.gradle.kts
rename to settings.gradle.kts.retired

From 786087e15437936d8db0133fac354d18396ed115 Mon Sep 17 00:00:00 2001
From: Alexander Zagniotov <azagniotov@box.com>
Date: Mon, 18 Nov 2024 14:12:38 -0800
Subject: [PATCH 02/11] Added a build.gradle that builds and tests

---
 build.gradle | 329 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 329 insertions(+)
 create mode 100644 build.gradle

diff --git a/build.gradle b/build.gradle
new file mode 100644
index 00000000..5909f9a4
--- /dev/null
+++ b/build.gradle
@@ -0,0 +1,329 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar
+//import org.jetbrains.dokka.gradle.DokkaTask
+//import org.jetbrains.kotlin.gradle.tasks.KotlinCompile
+
+plugins {
+    id 'java'
+    id 'org.jlleitschuh.gradle.ktlint' version '12.1.1'
+    id 'org.jetbrains.dokka' version '1.9.20'
+    id 'com.github.johnrengelman.shadow' version '8.1.1'
+    id 'io.github.gradle-nexus.publish-plugin' version '2.0.0'
+    id 'com.github.sherter.google-java-format' version '0.9' // Last versions that are compatible with Java 8
+    id 'maven-publish'
+    id 'signing'
+    id 'jacoco'
+    id 'com.gradleup.shadow' version '8.3.0'
+}
+
+
+group = project.hasProperty("linguaGroupId") ? project.property("linguaGroupId") : 'default.group.id'
+description = project.hasProperty("linguaDescription") ? project.property("linguaDescription") : 'Default description'
+
+java {
+    sourceCompatibility = JavaVersion.VERSION_1_8
+    targetCompatibility = JavaVersion.VERSION_1_8
+}
+
+jacoco {
+    toolVersion = "0.8.8"
+}
+
+sourceSets {
+    main {
+        resources {
+            exclude 'training-data/**'
+        }
+    }
+    create("accuracyReport") {
+        compileClasspath += sourceSets.main.output
+        runtimeClasspath += sourceSets.main.output
+    }
+}
+
+configurations {
+    accuracyReportImplementation {
+        extendsFrom(configurations.testImplementation)
+    }
+    accuracyReportRuntimeOnly {
+        extendsFrom(configurations.runtimeOnly)
+    }
+}
+
+tasks.withType(Test).configureEach {
+    useJUnitPlatform {
+        failFast = true
+    }
+}
+
+tasks.named("jacocoTestReport", JacocoReport).configure {
+    dependsOn "test"
+    reports {
+        xml.required.set(true)
+        csv.required.set(false)
+        html.required.set(true)
+    }
+    classDirectories.setFrom(files(classDirectories.files.collect {
+        fileTree(it) {
+            exclude '**/app/**'
+        }
+    }))
+}
+
+tasks.register("accuracyReport", Test) {
+    group = project.hasProperty("linguaTaskGroup") ? project.property("linguaTaskGroup") : 'defaultGroup'
+    description = "Runs Lingua on provided test data, and writes detection accuracy reports for each language."
+    testClassesDirs = sourceSets["accuracyReport"].output.classesDirs
+    classpath = sourceSets["accuracyReport"].runtimeClasspath
+    // The project properties 'detectors', 'languages' and 'cpuCores' are validated when the task executes.
+    doFirst {
+        def allowedDetectors = project.hasProperty("linguaSupportedDetectors") ? project.property("linguaSupportedDetectors").split(',') : []
+        def detectors = project.hasProperty('detectors') ? project.property('detectors').split(',') : allowedDetectors
+
+        detectors.each {
+            if (!allowedDetectors.contains(it)) {
+                throw new GradleException("detector '$it' does not exist, supported detectors: ${allowedDetectors.join(', ')}")
+            }
+        }
+
+        def allowedLanguages = project.hasProperty("linguaSupportedLanguages") ? project.property("linguaSupportedLanguages").split(',') : []
+        def languages = project.hasProperty('languages') ? project.property('languages').split(',') : allowedLanguages
+
+        languages.each {
+            if (!allowedLanguages.contains(it)) {
+                throw new GradleException("language '$it' is not supported")
+            }
+        }
+
+        // Validate CPU cores (defaults to a single fork when 'cpuCores' is not given)
+        def availableCpuCores = Runtime.getRuntime().availableProcessors()
+        def cpuCoresRepr = project.hasProperty('cpuCores') ? project.property('cpuCores').toString() : "1"
+        def cpuCores = cpuCoresRepr.toInteger()
+
+        if (cpuCores < 1 || cpuCores > availableCpuCores) {
+            throw new GradleException("$cpuCores cpu cores are not supported. Min: 1, Max: $availableCpuCores")
+        }
+
+        maxHeapSize = '4096m'
+        maxParallelForks = cpuCores
+        reports.html.required.set(false)
+        reports.junitXml.required.set(false)
+        // Restrict the run to the report classes of the selected detector/language pairs.
+        filter {
+            detectors.each { detector ->
+                languages.each { language ->
+                    includeTestsMatching("${project.property('linguaGroupId')}.${project.property('linguaArtifactId')}.report.${detector.toLowerCase()}.${language}DetectionAccuracyReport")
+                }
+            }
+        }
+    }
+}
+
+tasks.register("writeAggregatedAccuracyReport") {
+    group = project.hasProperty("linguaTaskGroup") ? project.property("linguaTaskGroup") : 'defaultGroup'
+    description = "Creates a table from all accuracy detection reports and writes it to a CSV file."
+    // Reads accuracy-reports/<detector>/<language>.txt and writes one CSV row per language.
+    doLast {
+        def accuracyReportsDirectoryName = 'accuracy-reports'
+        def accuracyReportsDirectory = file(accuracyReportsDirectoryName)
+        if (!accuracyReportsDirectory.exists()) {
+            throw new GradleException("directory '$accuracyReportsDirectoryName' does not exist")
+        }
+
+        def detectors = project.hasProperty("linguaSupportedDetectors") ? project.property("linguaSupportedDetectors").split(',') : []
+        def languages = project.hasProperty("linguaSupportedLanguages") ? project.property("linguaSupportedLanguages").split(',') : []
+        def csvFile = file("$accuracyReportsDirectoryName/aggregated-accuracy-values.csv")
+        def stringToSplitAt = ">> Exact values:"
+
+        if (csvFile.exists()) csvFile.delete()
+        csvFile.createNewFile()
+        csvFile.append(project.hasProperty("linguaCsvHeader") ? project.property("linguaCsvHeader") : "")
+        csvFile.append("\n")
+
+        languages.each { language ->
+            csvFile.append(language)
+
+            detectors.each { detector ->
+                def languageReportFileName = "$accuracyReportsDirectoryName/${detector.toLowerCase()}/$language.txt"
+                def languageReportFile = file(languageReportFileName)
+                def sliceLength = detector == "Lingua" ? (1..8) : (1..4) // inclusive index range of the tokens to keep
+
+                if (languageReportFile.exists()) {
+                    languageReportFile.readLines().each { line ->
+                        if (line.startsWith(stringToSplitAt)) {
+                            def accuracyValues = line.split(stringToSplitAt)[1].split(' ')[sliceLength].join(',')
+                            csvFile.append(",${accuracyValues}")
+                        }
+                    }
+                } else {
+                    csvFile.append(detector == "Lingua" ? ",NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN" : ",NaN,NaN,NaN,NaN")
+                }
+            }
+
+            csvFile.append("\n")
+        }
+
+        println("file 'aggregated-accuracy-values.csv' written successfully")
+    }
+}
+
+//tasks.named("compileAccuracyReportKotlin", KotlinCompile).configure {
+//    kotlinOptions.jvmTarget = "17"
+//}
+
+tasks.named("compileAccuracyReportJava", JavaCompile).configure {
+    sourceCompatibility = "17"
+    targetCompatibility = "17"
+}
+
+//tasks.withType(DokkaTask).configureEach {
+//    dokkaSourceSets.configureEach {
+//        jdkVersion.set(8)
+//        reportUndocumented.set(false)
+//        perPackageOption {
+//            matchingRegex.set(".*\\.(app|internal).*")
+//            suppress.set(true)
+//        }
+//    }
+//}
+
+tasks.register("dokkaJavadocJar", Jar).configure {
+    dependsOn "dokkaJavadoc"
+    group = "Build"
+    description = "Assembles a jar archive containing Javadoc documentation."
+    archiveClassifier.set("javadoc")
+    from(layout.buildDirectory.dir("dokka/javadoc")) // buildDirectory is a DirectoryProperty; interpolating it into a string does not yield a path
+}
+
+tasks.register("sourcesJar", Jar).configure {
+    group = "Build"
+    description = "Assembles a jar archive containing the main source code."
+    archiveClassifier.set("sources")
+    from("src/main/kotlin")
+}
+
+tasks.register("jarWithDependencies", ShadowJar).configure {
+    group = "Build"
+    description = "Assembles a jar archive containing the main classes and all external dependencies."
+    archiveClassifier.set("with-dependencies")
+    from(sourceSets.main.output)
+    configurations = [project.configurations.runtimeClasspath]
+    manifest {
+        attributes "Main-Class": project.property("linguaMainClass")
+    }
+}
+
+tasks.register("runLinguaOnConsole", JavaExec).configure {
+    group = project.hasProperty("linguaTaskGroup") ? project.property("linguaTaskGroup") : 'defaultGroup'
+    description = "Starts a REPL (read-evaluate-print loop) to try Lingua on the command line."
+    mainClass.set(project.property("linguaMainClass"))
+    standardInput = System.in
+    classpath = sourceSets.main.runtimeClasspath
+}
+
+dependencies {
+    implementation "com.squareup.moshi:moshi:1.15.1"
+    implementation "com.squareup.moshi:moshi-kotlin:1.15.1"
+    implementation "it.unimi.dsi:fastutil:8.5.15"
+
+    testImplementation "org.junit.jupiter:junit-jupiter:5.11.3"
+    testImplementation "org.assertj:assertj-core:3.26.3"
+    testImplementation "org.mockito:mockito-core:5.2.0"
+    testImplementation "org.mockito:mockito-junit-jupiter:5.2.0"
+
+    accuracyReportImplementation "com.optimaize.languagedetector:language-detector:0.6"
+    accuracyReportImplementation "org.apache.opennlp:opennlp-tools:2.4.0"
+    accuracyReportImplementation "org.apache.tika:tika-core:3.0.0"
+    accuracyReportImplementation "org.apache.tika:tika-langdetect-optimaize:3.0.0"
+    accuracyReportImplementation "org.slf4j:slf4j-nop:2.0.16"
+}
+
+publishing {
+    publications {
+        mavenJava(MavenPublication) {
+            groupId = project.findProperty("linguaGroupId").toString()
+            artifactId = project.findProperty("linguaArtifactId").toString()
+            version = project.version.toString()
+
+            from components.java
+
+            artifact sourcesJar
+            artifact jarWithDependencies
+            artifact dokkaJavadocJar
+
+            pom {
+                name.set(project.findProperty("linguaName").toString())
+                description.set(project.findProperty("linguaDescription").toString())
+                url.set(project.findProperty("linguaWebsiteUrl").toString())
+
+                licenses {
+                    license {
+                        name.set(project.findProperty("linguaLicenseName").toString())
+                        url.set(project.findProperty("linguaLicenseUrl").toString())
+                    }
+                }
+                developers {
+                    developer {
+                        id.set(project.findProperty("linguaDeveloperId").toString())
+                        name.set(project.findProperty("linguaDeveloperName").toString())
+                        email.set(project.findProperty("linguaDeveloperEmail").toString())
+                        url.set(project.findProperty("linguaDeveloperUrl").toString())
+                    }
+                }
+                scm {
+                    connection.set(project.findProperty("linguaScmConnection").toString())
+                    developerConnection.set(project.findProperty("linguaScmDeveloperConnection").toString())
+                    url.set(project.findProperty("linguaScmUrl").toString())
+                }
+            }
+        }
+    }
+
+    repositories {
+        maven {
+            name = "GitHubPackages"
+            url = uri(project.findProperty("githubPackagesUrl").toString())
+            credentials {
+                username = project.findProperty("linguaDeveloperId").toString()
+                password = project.findProperty("ghPackagesToken")?.toString() ?: ""
+            }
+        }
+    }
+}
+
+nexusPublishing {
+    repositories {
+        sonatype()
+    }
+}
+
+signing {
+    //sign(publishing.publications["lingua"])
+    sign publishing.publications.mavenJava
+}
+
+repositories {
+    mavenCentral()
+}
+
+googleJavaFormat {
+    toolVersion = '1.7' // Last versions that are compatible with Java 8
+    exclude '**/wrapper/dists/**'
+    exclude '**/src/*/resources/**'
+}
+verifyGoogleJavaFormat.dependsOn(tasks.googleJavaFormat)

From 673dc1e19d7b346c23a3370b4c70a31300b41671 Mon Sep 17 00:00:00 2001
From: Alexander Zagniotov <azagniotov@box.com>
Date: Mon, 18 Nov 2024 14:14:13 -0800
Subject: [PATCH 03/11] internal pkg: migrated Fraction and FractionTest

---
 .../pemistahl/lingua/internal/Fraction.java   | 305 ++++++++++++++++++
 .../lingua/internal/FractionTest.java         | 156 +++++++++
 2 files changed, 461 insertions(+)
 create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/Fraction.java
 create mode 100644 src/test/java/com/github/pemistahl/lingua/internal/FractionTest.java

diff --git a/src/main/java/com/github/pemistahl/lingua/internal/Fraction.java b/src/main/java/com/github/pemistahl/lingua/internal/Fraction.java
new file mode 100644
index 00000000..ab491a0c
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/internal/Fraction.java
@@ -0,0 +1,305 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.internal;
+
+import com.squareup.moshi.ToJson;
+import java.util.Objects;
+
+/**
+ * A class representing a fraction with a numerator and denominator. Provides methods for reducing
+ * fractions to their lowest terms, comparing fractions, and converting them to different numeric
+ * types.
+ *
+ * @author Peter M. Stahl <pemistahl@gmail.com>
+ * @author Migration to Java from Kotlin by Alexander Zagniotov <azagniotov@gmail.com>
+ */
+public class Fraction extends Number implements Comparable<Fraction> {
+
+  private final int numerator; // stored in lowest terms
+  private final int denominator; // stored in lowest terms
+
+  public Fraction(final int numerator, final int denominator) {
+    final int[] data = reduceToLowestTerms(numerator, denominator);
+    this.numerator = data[0];
+    this.denominator = data[1];
+  }
+
+  @Override
+  public int compareTo(final Fraction other) {
+    long n0d = (long) numerator * other.denominator;
+    long d0n = (long) denominator * other.numerator;
+    if (n0d < d0n) {
+      return -1;
+    } else if (n0d > d0n) {
+      return 1;
+    } else {
+      return 0;
+    }
+  }
+
+  @Override
+  public String toString() {
+    return numerator + "/" + denominator;
+  }
+
+  @Override
+  public byte byteValue() {
+    return (byte) (int) doubleValue();
+  }
+
+  @Override
+  public double doubleValue() {
+    return (double) numerator / denominator;
+  }
+
+  @Override
+  public float floatValue() {
+    return (float) doubleValue();
+  }
+
+  @Override
+  public int intValue() {
+    return (int) doubleValue();
+  }
+
+  @Override
+  public long longValue() {
+    return (long) doubleValue();
+  }
+
+  @Override
+  public short shortValue() {
+    return (short) (int) doubleValue();
+  }
+
+  /**
+   * Reduces the fraction to its lowest terms and normalizes the sign so that the denominator of
+   * the result is always positive.
+   *
+   * @param numerator The numerator of the fraction.
+   * @param denominator The denominator of the fraction.
+   * @return An array holding the reduced numerator at index 0 and denominator at index 1.
+   * @throws ArithmeticException if the denominator is zero, or if the denominator is negative and
+   *     either argument is Integer.MIN_VALUE, because the sign cannot be moved without overflow.
+   */
+  private int[] reduceToLowestTerms(final int numerator, final int denominator) {
+    int num = numerator;
+    int den = denominator;
+
+    if (den == 0) {
+      throw new ArithmeticException("zero denominator in fraction '" + num + "/" + den + "'");
+    }
+
+    if (den < 0) {
+      if (num == Integer.MIN_VALUE || den == Integer.MIN_VALUE) {
+        // The fields are not assigned yet at this point, so report the arguments, not "this".
+        throw new ArithmeticException(
+            "overflow in fraction " + num + "/" + den + ", cannot negate");
+      }
+      num = -num;
+      den = -den;
+    }
+
+    // den is strictly positive from here on; dividing by a positive gcd keeps it positive.
+    final int gcd = greatestCommonDenominator(num, den);
+
+    if (gcd > 1) {
+      num /= gcd;
+      den /= gcd;
+    }
+
+    return new int[] {num, den};
+  }
+
+  /**
+   * Calculates the greatest common denominator (GCD) of two integers.
+   *
+   * @param a The first integer.
+   * @param b The second integer.
+   * @return The GCD of the two integers.
+   * @throws ArithmeticException if an overflow occurs.
+   */
+  private int greatestCommonDenominator(final int a, final int b) {
+    if (a == 0 || b == 0) {
+      if (a == Integer.MIN_VALUE || b == Integer.MIN_VALUE) {
+        throw new ArithmeticException(
+            "overflow: greatestCommonDenominator(" + a + ", " + b + ") is 2^31");
+      }
+      return Math.abs(a + b);
+    }
+
+    int x = a;
+    int y = b;
+    long xl = x;
+    long yl = y;
+    boolean useLong = false;
+
+    if (x < 0) {
+      if (x == Integer.MIN_VALUE) {
+        useLong = true;
+      } else {
+        x = -x;
+      }
+      xl = -xl;
+    }
+
+    if (y < 0) {
+      if (y == Integer.MIN_VALUE) {
+        useLong = true;
+      } else {
+        y = -y;
+      }
+      yl = -yl;
+    }
+
+    if (useLong) {
+      if (xl == yl) {
+        throw new ArithmeticException(
+            "overflow: greatestCommonDenominator(" + a + ", " + b + ") is 2^31");
+      }
+      long ylyu = yl;
+      yl = xl;
+      xl = ylyu % xl;
+      if (xl == 0L) {
+        if (yl > Integer.MAX_VALUE) {
+          throw new ArithmeticException(
+              "overflow: greatestCommonDenominator(" + a + ", " + b + ") is 2^31");
+        }
+        return (int) yl;
+      }
+      ylyu = yl;
+
+      y = (int) xl;
+      x = (int) (ylyu % xl);
+    }
+
+    return greatestCommonDivisor(x, y);
+  }
+
+  /**
+   * Computes the greatest common divisor (GCD) of two non-negative integers using binary GCD
+   * algorithm.
+   *
+   * @param a The first integer.
+   * @param b The second integer.
+   * @return The GCD of the two integers.
+   */
+  private int greatestCommonDivisor(final int a, final int b) {
+    assert a >= 0;
+    assert b >= 0;
+
+    if (a == 0) return b;
+    if (b == 0) return a;
+
+    int x = a;
+    int y = b;
+
+    int xTwos = numberOfTrailingZeros(x);
+    int yTwos = numberOfTrailingZeros(y);
+    int shift = Math.min(xTwos, yTwos);
+
+    x = x >> xTwos;
+    y = y >> yTwos;
+
+    while (x != y) {
+      int delta = x - y;
+      y = Math.min(x, y);
+      x = Math.abs(delta);
+      x = x >> numberOfTrailingZeros(x);
+    }
+
+    return x << shift;
+  }
+
+  /**
+   * Counts the number of trailing zeros in the binary representation of the given integer.
+   *
+   * @param i The integer whose trailing zeros are to be counted.
+   * @return The number of trailing zeros in the binary representation of i.
+   */
+  private int numberOfTrailingZeros(final int i) {
+    if (i == 0) return 32;
+
+    int j = i;
+    int n = 31;
+
+    int y = j << 16;
+    if (y != 0) {
+      n -= 16;
+      j = y;
+    }
+
+    y = j << 8;
+    if (y != 0) {
+      n -= 8;
+      j = y;
+    }
+
+    y = j << 4;
+    if (y != 0) {
+      n -= 4;
+      j = y;
+    }
+
+    y = j << 2;
+    if (y != 0) {
+      n -= 2;
+      j = y;
+    }
+
+    return n - ((j << 1) >>> 31);
+  }
+
+  /**
+   * Computes the absolute value of an integer without branching.
+   *
+   * @param x The integer whose absolute value is to be computed.
+   * @return The absolute value of x (Integer.MIN_VALUE maps to itself, as with Math.abs).
+   */
+  private int abs(final int x) {
+    final int sign = x >>> 31; // 1 for negative x, 0 otherwise
+    return (x ^ -sign) + sign; // two's complement negation (~x + 1) only when x is negative
+  }
+
+  @Override
+  public boolean equals(final Object o) {
+    if (this == o) return true;
+    if (o == null || getClass() != o.getClass()) return false;
+    Fraction fraction = (Fraction) o;
+    return numerator == fraction.numerator && denominator == fraction.denominator;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(numerator, denominator);
+  }
+}
+
+/** A class to handle conversion of Fraction objects to JSON using Moshi. */
+class FractionAdapter {
+
+  /**
+   * Converts a Fraction object to its string representation.
+   *
+   * @param fraction The Fraction object to be converted.
+   * @return The string representation of the fraction.
+   */
+  @ToJson
+  public String toJson(final Fraction fraction) {
+    return fraction.toString();
+  }
+}
diff --git a/src/test/java/com/github/pemistahl/lingua/internal/FractionTest.java b/src/test/java/com/github/pemistahl/lingua/internal/FractionTest.java
new file mode 100644
index 00000000..116a6f1b
--- /dev/null
+++ b/src/test/java/com/github/pemistahl/lingua/internal/FractionTest.java
@@ -0,0 +1,156 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.internal;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatExceptionOfType;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests for the Fraction class.
+ *
+ * @author Peter M. Stahl <pemistahl@gmail.com>
+ * @author Migration to Java from Kotlin by Alexander Zagniotov <azagniotov@gmail.com>
+ */
+class FractionTest {
+
+  private final Fraction fraction1 = new Fraction(12, 144);
+  private final Fraction fraction2 = new Fraction(63, 27);
+  private final Fraction fraction3 = new Fraction(0, 1234);
+  private final Fraction fraction4 = new Fraction(-42, 210);
+  private final Fraction fraction5 = new Fraction(169, -65);
+
+  /** Test that Fraction is correctly reduced to lowest terms. */
+  @Test
+  void assertThatFractionIsCorrectlyReducedToLowestTerms() {
+    assertThat(fraction1).isEqualTo(new Fraction(1, 12));
+    assertThat(fraction2).isEqualTo(new Fraction(7, 3));
+    assertThat(fraction3).isEqualTo(new Fraction(0, 1));
+    assertThat(fraction4).isEqualTo(new Fraction(-1, 5));
+    assertThat(fraction5).isEqualTo(new Fraction(-13, 5));
+  }
+
+  /** Test that a Fraction with a zero denominator cannot be created. */
+  @Test
+  void assertThatFractionWithDenominatorZeroCannotBeCreated() {
+    assertThatExceptionOfType(ArithmeticException.class)
+        .isThrownBy(() -> new Fraction(1234, 0))
+        .withMessage("zero denominator in fraction '1234/0'");
+  }
+
+  /** Test the toString() implementation of Fraction. */
+  @Test
+  void assertThatToStringImplementationOfFractionIsCorrect() {
+    assertThat(fraction1.toString()).isEqualTo("1/12");
+    assertThat(fraction2.toString()).isEqualTo("7/3");
+    assertThat(fraction3.toString()).isEqualTo("0/1");
+    assertThat(fraction4.toString()).isEqualTo("-1/5");
+    assertThat(fraction5.toString()).isEqualTo("-13/5");
+  }
+
+  /** Test the doubleValue() implementation of Fraction. */
+  @Test
+  void assertThatToDoubleImplementationOfFractionIsCorrect() {
+    assertThat(fraction1.doubleValue()).isEqualTo(1.0 / 12);
+    assertThat(fraction2.doubleValue()).isEqualTo(7.0 / 3);
+    assertThat(fraction3.doubleValue()).isEqualTo(0.0);
+    assertThat(fraction4.doubleValue()).isEqualTo(-0.2);
+    assertThat(fraction5.doubleValue()).isEqualTo(-2.6);
+  }
+
+  /** Test the floatValue() implementation of Fraction. */
+  @Test
+  void assertThatToFloatImplementationOfFractionIsCorrect() {
+    assertThat(fraction1.floatValue()).isEqualTo(1.0f / 12);
+    assertThat(fraction2.floatValue()).isEqualTo(7.0f / 3);
+    assertThat(fraction3.floatValue()).isEqualTo(0.0f);
+    assertThat(fraction4.floatValue()).isEqualTo(-0.2f);
+    assertThat(fraction5.floatValue()).isEqualTo(-2.6f);
+  }
+
+  /** Test the intValue() implementation of Fraction. */
+  @Test
+  void assertThatToIntImplementationOfFractionIsCorrect() {
+    assertThat(fraction1.intValue()).isEqualTo(0);
+    assertThat(fraction2.intValue()).isEqualTo(2);
+    assertThat(fraction3.intValue()).isEqualTo(0);
+    assertThat(fraction4.intValue()).isEqualTo(0);
+    assertThat(fraction5.intValue()).isEqualTo(-2);
+  }
+
+  /** Test the longValue() implementation of Fraction. */
+  @Test
+  void assertThatToLongImplementationOfFractionIsCorrect() {
+    assertThat(fraction1.longValue()).isEqualTo(0);
+    assertThat(fraction2.longValue()).isEqualTo(2);
+    assertThat(fraction3.longValue()).isEqualTo(0);
+    assertThat(fraction4.longValue()).isEqualTo(0);
+    assertThat(fraction5.longValue()).isEqualTo(-2);
+  }
+
+  /** Test the shortValue() implementation of Fraction. */
+  @Test
+  void assertThatToShortImplementationOfFractionIsCorrect() {
+    assertThat(fraction1.shortValue()).isEqualTo((short) 0);
+    assertThat(fraction2.shortValue()).isEqualTo((short) 2);
+    assertThat(fraction3.shortValue()).isEqualTo((short) 0);
+    assertThat(fraction4.shortValue()).isEqualTo((short) 0);
+    assertThat(fraction5.shortValue()).isEqualTo((short) -2);
+  }
+
+  /** Test the byteValue() implementation of Fraction. */
+  @Test
+  void assertThatToByteImplementationOfFractionIsCorrect() {
+    assertThat(fraction1.byteValue()).isEqualTo((byte) 0);
+    assertThat(fraction2.byteValue()).isEqualTo((byte) 2);
+    assertThat(fraction3.byteValue()).isEqualTo((byte) 0);
+    assertThat(fraction4.byteValue()).isEqualTo((byte) 0);
+    assertThat(fraction5.byteValue()).isEqualTo((byte) -2);
+  }
+
+  /** Test that Fraction comparisons work correctly. */
+  @Test
+  void assertThatFractionComparisonsWorkCorrectly() {
+    boolean[] comparisons = {
+      fraction1.compareTo(fraction3) > 0,
+      fraction1.compareTo(fraction4) > 0,
+      fraction1.compareTo(fraction5) > 0,
+      fraction2.compareTo(fraction1) > 0,
+      fraction2.compareTo(fraction3) > 0,
+      fraction2.compareTo(fraction4) > 0,
+      fraction2.compareTo(fraction5) > 0,
+      fraction3.compareTo(fraction4) > 0,
+      fraction3.compareTo(fraction5) > 0,
+      fraction4.compareTo(fraction5) > 0,
+      fraction1.compareTo(fraction2) < 0,
+      fraction3.compareTo(fraction1) < 0,
+      fraction3.compareTo(fraction2) < 0,
+      fraction4.compareTo(fraction1) < 0,
+      fraction4.compareTo(fraction2) < 0,
+      fraction4.compareTo(fraction3) < 0,
+      fraction5.compareTo(fraction1) < 0,
+      fraction5.compareTo(fraction2) < 0,
+      fraction5.compareTo(fraction3) < 0,
+      fraction5.compareTo(fraction4) < 0
+    };
+
+    for (boolean comparison : comparisons) {
+      assertThat(comparison).isTrue();
+    }
+  }
+}

From 71b77ff171f0a8ab81b4a96300d87a8ecca64dec Mon Sep 17 00:00:00 2001
From: Alexander Zagniotov <azagniotov@box.com>
Date: Mon, 18 Nov 2024 14:15:03 -0800
Subject: [PATCH 04/11] internal pkg: migrated Ngram and NgramTest

---
 .../pemistahl/lingua/internal/Ngram.java      | 147 ++++++++++++++++
 .../lingua/internal/NgramIterator.java        |  64 +++++++
 .../pemistahl/lingua/internal/NgramRange.java |  78 +++++++++
 .../pemistahl/lingua/internal/NgramTest.java  | 157 ++++++++++++++++++
 4 files changed, 446 insertions(+)
 create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/Ngram.java
 create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/NgramIterator.java
 create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/NgramRange.java
 create mode 100644 src/test/java/com/github/pemistahl/lingua/internal/NgramTest.java

diff --git a/src/main/java/com/github/pemistahl/lingua/internal/Ngram.java b/src/main/java/com/github/pemistahl/lingua/internal/Ngram.java
new file mode 100644
index 00000000..7954d157
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/internal/Ngram.java
@@ -0,0 +1,147 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.internal;
+
+import java.util.Objects;
+
+/**
+ * This class represents an Ngram value, a string-based object with specific constraints on its
+ * length. The Ngram is comparable to other Ngrams based on its length.
+ *
+ * @author Peter M. Stahl <pemistahl@gmail.com>
+ * @author Migration to Java from Kotlin by Alexander Zagniotov <azagniotov@gmail.com>
+ */
+public final class Ngram implements Comparable<Ngram> {
+
+  private final String value;
+
+  /**
+   * Constructs an Ngram with a given string value.
+   *
+   * @param value the string value of the Ngram; whitespace counts towards its length
+   * @throws IllegalArgumentException if the value is null or longer than 5 characters
+   */
+  public Ngram(final String value) {
+    if (value == null || value.length() > 5) {
+      throw new IllegalArgumentException("Length of ngram '" + value + "' is not in range 0..5");
+    }
+    this.value = value;
+  }
+
+  public String getValue() {
+    return value;
+  }
+
+  /**
+   * Returns the string representation of the Ngram.
+   *
+   * @return the string value of the Ngram
+   */
+  @Override
+  public String toString() {
+    return value;
+  }
+
+  /**
+   * Compares the Ngram to another Ngram based on the length of their values.
+   *
+   * @param other the other Ngram to compare to
+   * @return a negative integer, zero, or a positive integer as this Ngram is less than, equal to,
+   *     or greater than the specified Ngram
+   */
+  @Override
+  public int compareTo(final Ngram other) {
+    return Integer.compare(this.value.length(), other.value.length());
+  }
+
+  /**
+   * Returns the range of lower order Ngrams that this Ngram can generate.
+   *
+   * @return the range of lower order Ngrams
+   */
+  public NgramRange rangeOfLowerOrderNgrams() {
+    return new NgramRange(this, new Ngram(String.valueOf(this.value.charAt(0))));
+  }
+
+  /**
+   * Decrements the Ngram by removing the last character, unless it is a zerogram.
+   *
+   * @return the decremented Ngram
+   * @throws IllegalStateException if the Ngram is a zerogram and cannot be decremented
+   */
+  public Ngram dec() {
+    if (value.isEmpty()) {
+      throw new IllegalStateException(
+          "Zerogram is ngram type of lowest order and can not be decremented");
+    } else if (value.length() == 1) {
+      return new Ngram("");
+    } else {
+      return new Ngram(value.substring(0, value.length() - 1));
+    }
+  }
+
+  /**
+   * Returns the name of the Ngram type based on the given length.
+   *
+   * @param ngramLength the length of the Ngram
+   * @return the name of the Ngram type (unigram, bigram, trigram, quadrigram, or fivegram)
+   * @throws IllegalArgumentException if the length is not between 1 and 5
+   */
+  public static String getNgramNameByLength(int ngramLength) {
+    switch (ngramLength) {
+      case 1:
+        return "unigram";
+      case 2:
+        return "bigram";
+      case 3:
+        return "trigram";
+      case 4:
+        return "quadrigram";
+      case 5:
+        return "fivegram";
+      default:
+        throw new IllegalArgumentException("Ngram length " + ngramLength + " is not in range 1..5");
+    }
+  }
+
+  @Override
+  public boolean equals(final Object o) {
+    if (this == o) return true;
+    if (o == null || getClass() != o.getClass()) return false;
+    Ngram ngram = (Ngram) o;
+    return Objects.equals(value, ngram.value);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hashCode(value);
+  }
+}
+
+/**
+ * A simple closed range interface: a start bound, an end bound and a membership check.
+ *
+ * @param <T> the type of the range elements, also accepted by {@link #contains}
+ */
+interface ClosedRange<T> {
+
+  boolean contains(final T value);
+
+  T getStart();
+
+  T getEndInclusive();
+}
diff --git a/src/main/java/com/github/pemistahl/lingua/internal/NgramIterator.java b/src/main/java/com/github/pemistahl/lingua/internal/NgramIterator.java
new file mode 100644
index 00000000..3d97bc0d
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/internal/NgramIterator.java
@@ -0,0 +1,64 @@
+package com.github.pemistahl.lingua.internal;
+
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.Objects;
+
+/**
+ * Iterator for iterating over Ngrams starting from a specific Ngram.
+ *
+ * @author Peter M. Stahl <pemistahl@gmail.com>
+ * @author Migration to Java from Kotlin by Alexander Zagniotov <azagniotov@gmail.com>
+ */
+public final class NgramIterator implements Iterator<Ngram> {
+
+  private Ngram current;
+
+  /**
+   * Constructs an NgramIterator starting at a specific Ngram.
+   *
+   * @param start the starting Ngram
+   */
+  public NgramIterator(final Ngram start) {
+    this.current = start;
+  }
+
+  /**
+   * Checks if there are more Ngrams to iterate over.
+   *
+   * @return true if there are more Ngrams, false otherwise
+   */
+  @Override
+  public boolean hasNext() {
+    return !current.toString().isEmpty();
+  }
+
+  /**
+   * Returns the next Ngram in the iteration.
+   *
+   * @return the next Ngram
+   * @throws NoSuchElementException if there are no more Ngrams to iterate over
+   */
+  @Override
+  public Ngram next() {
+    if (!hasNext()) {
+      throw new NoSuchElementException();
+    }
+    Ngram result = current;
+    current = current.dec();
+    return result;
+  }
+
+  @Override
+  public boolean equals(final Object o) {
+    if (this == o) return true;
+    if (o == null || getClass() != o.getClass()) return false;
+    NgramIterator that = (NgramIterator) o;
+    return Objects.equals(current, that.current);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hashCode(current);
+  }
+}
diff --git a/src/main/java/com/github/pemistahl/lingua/internal/NgramRange.java b/src/main/java/com/github/pemistahl/lingua/internal/NgramRange.java
new file mode 100644
index 00000000..39dc10bc
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/internal/NgramRange.java
@@ -0,0 +1,78 @@
+package com.github.pemistahl.lingua.internal;
+
+import java.util.Iterator;
+import java.util.Objects;
+
+/**
+ * Represents a closed range of Ngrams from a start Ngram to an end Ngram. The range includes all
+ * Ngrams from the start Ngram to the end Ngram.
+ *
+ * @author Peter M. Stahl <pemistahl@gmail.com>
+ * @author Migration to Java from Kotlin by Alexander Zagniotov <azagniotov@gmail.com>
+ */
+public final class NgramRange implements Iterable<Ngram>, ClosedRange<Ngram> {
+
+  private final Ngram start;
+  private final Ngram endInclusive;
+
+  /**
+   * Constructs an NgramRange with a start and an end Ngram.
+   *
+   * @param start the start Ngram
+   * @param endInclusive the end Ngram (inclusive)
+   * @throws IllegalArgumentException if the start Ngram is not of a higher order than the end Ngram
+   */
+  public NgramRange(final Ngram start, final Ngram endInclusive) {
+    if (start.compareTo(endInclusive) < 0) {
+      throw new IllegalArgumentException(
+          "'" + start + "' must be of higher order than '" + endInclusive + "'");
+    }
+    this.start = start;
+    this.endInclusive = endInclusive;
+  }
+
+  /**
+   * Checks if a given Ngram is within this range.
+   *
+   * @param value the Ngram to check
+   * @return true if the Ngram is within the range, false otherwise
+   */
+  @Override
+  public boolean contains(final Ngram value) {
+    return value.compareTo(endInclusive) >= 0 && value.compareTo(start) <= 0;
+  }
+
+  /**
+   * Returns an iterator over the Ngrams in the range.
+   *
+   * @return an iterator over the Ngrams
+   */
+  @Override
+  public Iterator<Ngram> iterator() {
+    return new NgramIterator(start);
+  }
+
+  @Override
+  public Ngram getStart() {
+    return start;
+  }
+
+  @Override
+  public Ngram getEndInclusive() {
+    return endInclusive;
+  }
+
+  @Override
+  public boolean equals(final Object o) {
+    if (this == o) return true;
+    if (o == null || getClass() != o.getClass()) return false;
+    NgramRange ngrams = (NgramRange) o;
+    return Objects.equals(getStart(), ngrams.getStart())
+        && Objects.equals(getEndInclusive(), ngrams.getEndInclusive());
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(getStart(), getEndInclusive());
+  }
+}
diff --git a/src/test/java/com/github/pemistahl/lingua/internal/NgramTest.java b/src/test/java/com/github/pemistahl/lingua/internal/NgramTest.java
new file mode 100644
index 00000000..d97bb248
--- /dev/null
+++ b/src/test/java/com/github/pemistahl/lingua/internal/NgramTest.java
@@ -0,0 +1,157 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.internal;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.List;
+import java.util.NoSuchElementException;
+import org.assertj.core.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for the Ngram class and related components like NgramRange and NgramIterator. These tests
+ * verify the expected behavior and correctness of the Ngram functionality.
+ *
+ * @author Peter M. Stahl pemistahl@gmail.com
+ * @author Alexander Zagniotov azagniotov@gmail.com
+ */
+public class NgramTest {
+
+  private final Ngram zerogram = new Ngram("");
+  private final Ngram unigram = new Ngram("q");
+  private final Ngram bigram = new Ngram("qw");
+  private final Ngram trigram = new Ngram("qwe");
+  private final Ngram quadrigram = new Ngram("qwer");
+  private final Ngram fivegram = new Ngram("qwert");
+  private final List<Ngram> ngrams = List.of(unigram, bigram, trigram, quadrigram, fivegram);
+
+  @Test
+  public void testToString() {
+    // Assert that the toString() implementation of Ngram is its value
+    assertThat(fivegram.toString()).isEqualTo("qwert");
+    assertThat(quadrigram.toString()).isEqualTo("qwer");
+    assertThat(trigram.toString()).isEqualTo("qwe");
+    assertThat(bigram.toString()).isEqualTo("qw");
+    assertThat(unigram.toString()).isEqualTo("q");
+    assertThat(zerogram.toString()).isEqualTo("");
+  }
+
+  @Test
+  public void testNgramComparisons() {
+    // Every pairwise ordering must hold; each entry is asserted to be true below
+    List<Boolean> comparisons =
+        List.of(
+            fivegram.compareTo(quadrigram) > 0,
+            fivegram.compareTo(trigram) > 0,
+            fivegram.compareTo(bigram) > 0,
+            fivegram.compareTo(unigram) > 0,
+            fivegram.compareTo(zerogram) > 0,
+            quadrigram.compareTo(trigram) > 0,
+            quadrigram.compareTo(bigram) > 0,
+            quadrigram.compareTo(unigram) > 0,
+            quadrigram.compareTo(zerogram) > 0,
+            trigram.compareTo(bigram) > 0,
+            trigram.compareTo(unigram) > 0,
+            trigram.compareTo(zerogram) > 0,
+            bigram.compareTo(unigram) > 0,
+            bigram.compareTo(zerogram) > 0,
+            unigram.compareTo(zerogram) > 0,
+            quadrigram.compareTo(fivegram) < 0,
+            trigram.compareTo(fivegram) < 0,
+            bigram.compareTo(fivegram) < 0,
+            unigram.compareTo(fivegram) < 0,
+            zerogram.compareTo(fivegram) < 0,
+            trigram.compareTo(quadrigram) < 0,
+            bigram.compareTo(quadrigram) < 0,
+            unigram.compareTo(quadrigram) < 0,
+            zerogram.compareTo(quadrigram) < 0,
+            bigram.compareTo(trigram) < 0,
+            unigram.compareTo(trigram) < 0,
+            zerogram.compareTo(trigram) < 0,
+            unigram.compareTo(bigram) < 0,
+            zerogram.compareTo(bigram) < 0,
+            zerogram.compareTo(unigram) < 0);
+
+    comparisons.forEach(comparison -> assertThat(comparison).isTrue());
+  }
+
+  @Test
+  public void testDecrement() {
+    // Locals shadow the fixture fields on purpose; expected values are read via this.<field>
+    Ngram quadrigram = fivegram.dec();
+    assertThat(quadrigram).isEqualTo(this.quadrigram);
+
+    Ngram trigram = quadrigram.dec();
+    assertThat(trigram).isEqualTo(this.trigram);
+
+    Ngram bigram = trigram.dec();
+    assertThat(bigram).isEqualTo(this.bigram);
+
+    Ngram unigram = bigram.dec();
+    assertThat(unigram).isEqualTo(this.unigram);
+
+    Ngram zerogram = unigram.dec();
+    assertThat(zerogram).isEqualTo(this.zerogram);
+
+    Assertions.assertThatIllegalStateException()
+        .isThrownBy(zerogram::dec)
+        .withMessage("Zerogram is ngram type of lowest order and can not be decremented");
+  }
+
+  @Test
+  public void testNgramRange() {
+    // Assert that NgramRange works correctly
+    NgramRange ngramRange = new NgramRange(fivegram, bigram);
+
+    assertThat(ngramRange.contains(fivegram)).isTrue();
+    assertThat(ngramRange.contains(quadrigram)).isTrue();
+    assertThat(ngramRange.contains(trigram)).isTrue();
+    assertThat(ngramRange.contains(bigram)).isTrue();
+
+    assertThat(ngramRange.contains(unigram)).isFalse();
+    assertThat(ngramRange.contains(zerogram)).isFalse();
+
+    assertThat(ngramRange.iterator()).isEqualTo(new NgramIterator(fivegram));
+
+    Assertions.assertThatThrownBy(() -> new NgramRange(bigram, fivegram))
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessage("'qw' must be of higher order than 'qwert'");
+  }
+
+  @Test
+  public void testLowerOrderNgrams() {
+    // Assert that range of lower order ngrams can be generated correctly
+    for (Ngram ngram : ngrams) {
+      assertThat(ngram.rangeOfLowerOrderNgrams()).isEqualTo(new NgramRange(ngram, unigram));
+    }
+  }
+
+  @Test
+  public void testNgramIterator() {
+    // Assert that NgramIterator works correctly
+    NgramIterator iterator = new NgramIterator(fivegram);
+
+    assertThat(iterator.next()).isEqualTo(fivegram);
+    assertThat(iterator.next()).isEqualTo(quadrigram);
+    assertThat(iterator.next()).isEqualTo(trigram);
+    assertThat(iterator.next()).isEqualTo(bigram);
+    assertThat(iterator.next()).isEqualTo(unigram);
+
+    Assertions.assertThatThrownBy(iterator::next).isInstanceOf(NoSuchElementException.class);
+  }
+}

From f1549b4bf0346f764ccaf027e857ee4affeab73d Mon Sep 17 00:00:00 2001
From: Alexander Zagniotov <azagniotov@box.com>
Date: Mon, 18 Nov 2024 14:15:59 -0800
Subject: [PATCH 05/11] api pkg: migrated IsoCode639_1 and IsoCode639_3

---
 .../pemistahl/lingua/api/IsoCode639_1.java    | 270 ++++++++++++++++++
 .../pemistahl/lingua/api/IsoCode639_3.java    | 270 ++++++++++++++++++
 2 files changed, 540 insertions(+)
 create mode 100644 src/main/java/com/github/pemistahl/lingua/api/IsoCode639_1.java
 create mode 100644 src/main/java/com/github/pemistahl/lingua/api/IsoCode639_3.java

diff --git a/src/main/java/com/github/pemistahl/lingua/api/IsoCode639_1.java b/src/main/java/com/github/pemistahl/lingua/api/IsoCode639_1.java
new file mode 100644
index 00000000..df04d1ec
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/api/IsoCode639_1.java
@@ -0,0 +1,270 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.api;
+
+/**
+ * The ISO 639-1 code representations for the supported languages.
+ *
+ * <p>ISO 639 is a standardized nomenclature used to classify languages.
+ */
+public enum IsoCode639_1 {
+
+  /** The ISO 639-1 code for [Afrikaans][Language.AFRIKAANS]. */
+  AF,
+
+  /** The ISO 639-1 code for [Amharic][Language.AMHARIC]. */
+  AM,
+
+  /** The ISO 639-1 code for [Arabic][Language.ARABIC]. */
+  AR,
+
+  /** The ISO 639-1 code for [Azerbaijani][Language.AZERBAIJANI]. */
+  AZ,
+
+  /** The ISO 639-1 code for [Belarusian][Language.BELARUSIAN]. */
+  BE,
+
+  /** The ISO 639-1 code for [Bulgarian][Language.BULGARIAN]. */
+  BG,
+
+  /** The ISO 639-1 code for [Bengali][Language.BENGALI]. */
+  BN,
+
+  /** The ISO 639-1 code for [Bosnian][Language.BOSNIAN]. */
+  BS,
+
+  /** The ISO 639-1 code for [Catalan][Language.CATALAN]. */
+  CA,
+
+  /** The ISO 639-1 code for [Czech][Language.CZECH]. */
+  CS,
+
+  /** The ISO 639-1 code for [Welsh][Language.WELSH]. */
+  CY,
+
+  /** The ISO 639-1 code for [Danish][Language.DANISH]. */
+  DA,
+
+  /** The ISO 639-1 code for [German][Language.GERMAN]. */
+  DE,
+
+  /** The ISO 639-1 code for [Greek][Language.GREEK]. */
+  EL,
+
+  /** The ISO 639-1 code for [English][Language.ENGLISH]. */
+  EN,
+
+  /** The ISO 639-1 code for [Esperanto][Language.ESPERANTO]. */
+  EO,
+
+  /** The ISO 639-1 code for [Spanish][Language.SPANISH]. */
+  ES,
+
+  /** The ISO 639-1 code for [Estonian][Language.ESTONIAN]. */
+  ET,
+
+  /** The ISO 639-1 code for [Basque][Language.BASQUE]. */
+  EU,
+
+  /** The ISO 639-1 code for [Persian][Language.PERSIAN]. */
+  FA,
+
+  /** The ISO 639-1 code for [Finnish][Language.FINNISH]. */
+  FI,
+
+  /** The ISO 639-1 code for [French][Language.FRENCH]. */
+  FR,
+
+  /** The ISO 639-1 code for [Irish][Language.IRISH]. */
+  GA,
+
+  /** The ISO 639-1 code for [Gujarati][Language.GUJARATI]. */
+  GU,
+
+  /** The ISO 639-1 code for [Hebrew][Language.HEBREW]. */
+  HE,
+
+  /** The ISO 639-1 code for [Hindi][Language.HINDI]. */
+  HI,
+
+  /** The ISO 639-1 code for [Croatian][Language.CROATIAN]. */
+  HR,
+
+  /** The ISO 639-1 code for [Hungarian][Language.HUNGARIAN]. */
+  HU,
+
+  /** The ISO 639-1 code for [Armenian][Language.ARMENIAN]. */
+  HY,
+
+  /** The ISO 639-1 code for [Indonesian][Language.INDONESIAN]. */
+  ID,
+
+  /** The ISO 639-1 code for [Icelandic][Language.ICELANDIC]. */
+  IS,
+
+  /** The ISO 639-1 code for [Italian][Language.ITALIAN]. */
+  IT,
+
+  /** The ISO 639-1 code for [Japanese][Language.JAPANESE]. */
+  JA,
+
+  /** The ISO 639-1 code for [Georgian][Language.GEORGIAN]. */
+  KA,
+
+  /** The ISO 639-1 code for [Kazakh][Language.KAZAKH]. */
+  KK,
+
+  /** The ISO 639-1 code for [Korean][Language.KOREAN]. */
+  KO,
+
+  /** The ISO 639-1 code for [Latin][Language.LATIN]. */
+  LA,
+
+  /** The ISO 639-1 code for [Ganda][Language.GANDA]. */
+  LG,
+
+  /** The ISO 639-1 code for [Lithuanian][Language.LITHUANIAN]. */
+  LT,
+
+  /** The ISO 639-1 code for [Latvian][Language.LATVIAN]. */
+  LV,
+
+  /** The ISO 639-1 code for [Maori][Language.MAORI]. */
+  MI,
+
+  /** The ISO 639-1 code for [Macedonian][Language.MACEDONIAN]. */
+  MK,
+
+  /** The ISO 639-1 code for [Mongolian][Language.MONGOLIAN]. */
+  MN,
+
+  /** The ISO 639-1 code for [Marathi][Language.MARATHI]. */
+  MR,
+
+  /** The ISO 639-1 code for [Malay][Language.MALAY]. */
+  MS,
+
+  /** The ISO 639-1 code for [Norwegian Bokmal][Language.BOKMAL]. */
+  NB,
+
+  /** The ISO 639-1 code for [Dutch][Language.DUTCH]. */
+  NL,
+
+  /** The ISO 639-1 code for [Norwegian Nynorsk][Language.NYNORSK]. */
+  NN,
+
+  /** The ISO 639-1 code for [Oromo][Language.OROMO]. */
+  OM,
+
+  /** The ISO 639-1 code for [Punjabi][Language.PUNJABI]. */
+  PA,
+
+  /** The ISO 639-1 code for [Polish][Language.POLISH]. */
+  PL,
+
+  /** The ISO 639-1 code for [Portuguese][Language.PORTUGUESE]. */
+  PT,
+
+  /** The ISO 639-1 code for [Romanian][Language.ROMANIAN]. */
+  RO,
+
+  /** The ISO 639-1 code for [Russian][Language.RUSSIAN]. */
+  RU,
+
+  /** The ISO 639-1 code for [Sinhala][Language.SINHALA]. */
+  SI,
+
+  /** The ISO 639-1 code for [Slovak][Language.SLOVAK]. */
+  SK,
+
+  /** The ISO 639-1 code for [Slovene][Language.SLOVENE]. */
+  SL,
+
+  /** The ISO 639-1 code for [Shona][Language.SHONA]. */
+  SN,
+
+  /** The ISO 639-1 code for [Somali][Language.SOMALI]. */
+  SO,
+
+  /** The ISO 639-1 code for [Albanian][Language.ALBANIAN]. */
+  SQ,
+
+  /** The ISO 639-1 code for [Serbian][Language.SERBIAN]. */
+  SR,
+
+  /** The ISO 639-1 code for [Southern Sotho][Language.SOTHO]. */
+  ST,
+
+  /** The ISO 639-1 code for [Swedish][Language.SWEDISH]. */
+  SV,
+
+  /** The ISO 639-1 code for [Swahili][Language.SWAHILI]. */
+  SW,
+
+  /** The ISO 639-1 code for [Tamil][Language.TAMIL]. */
+  TA,
+
+  /** The ISO 639-1 code for [Telugu][Language.TELUGU]. */
+  TE,
+
+  /** The ISO 639-1 code for [Thai][Language.THAI]. */
+  TH,
+
+  /** The ISO 639-1 code for [Tigrinya][Language.TIGRINYA]. */
+  TI,
+
+  /** The ISO 639-1 code for [Tagalog][Language.TAGALOG]. */
+  TL,
+
+  /** The ISO 639-1 code for [Tswana][Language.TSWANA]. */
+  TN,
+
+  /** The ISO 639-1 code for [Turkish][Language.TURKISH]. */
+  TR,
+
+  /** The ISO 639-1 code for [Tsonga][Language.TSONGA]. */
+  TS,
+
+  /** The ISO 639-1 code for [Ukrainian][Language.UKRAINIAN]. */
+  UK,
+
+  /** The ISO 639-1 code for [Urdu][Language.URDU]. */
+  UR,
+
+  /** The ISO 639-1 code for [Vietnamese][Language.VIETNAMESE]. */
+  VI,
+
+  /** The ISO 639-1 code for [Xhosa][Language.XHOSA]. */
+  XH,
+
+  /** The ISO 639-1 code for [Yoruba][Language.YORUBA]. */
+  YO,
+
+  /** The ISO 639-1 code for [Chinese][Language.CHINESE]. */
+  ZH,
+
+  /** The ISO 639-1 code for [Zulu][Language.ZULU]. */
+  ZU,
+
+  /** The ISO 639-1 code for [the imaginary unknown language][Language.UNKNOWN]. */
+  NONE;
+
+  @Override
+  public String toString() {
+    return name().toLowerCase(java.util.Locale.ROOT); // fixed locale: avoids Turkish dotless i
+  }
+}
diff --git a/src/main/java/com/github/pemistahl/lingua/api/IsoCode639_3.java b/src/main/java/com/github/pemistahl/lingua/api/IsoCode639_3.java
new file mode 100644
index 00000000..8bae3e78
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/api/IsoCode639_3.java
@@ -0,0 +1,270 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.api;
+
+/**
+ * The ISO 639-3 code representations for the supported languages.
+ *
+ * <p>ISO 639 is a standardized nomenclature used to classify languages.
+ */
+public enum IsoCode639_3 {
+
+  /** The ISO 639-3 code for [Afrikaans][Language.AFRIKAANS]. */
+  AFR,
+
+  /** The ISO 639-3 code for [Amharic][Language.AMHARIC]. */
+  AMH,
+
+  /** The ISO 639-3 code for [Arabic][Language.ARABIC]. */
+  ARA,
+
+  /** The ISO 639-3 code for [Azerbaijani][Language.AZERBAIJANI]. */
+  AZE,
+
+  /** The ISO 639-3 code for [Belarusian][Language.BELARUSIAN]. */
+  BEL,
+
+  /** The ISO 639-3 code for [Bengali][Language.BENGALI]. */
+  BEN,
+
+  /** The ISO 639-3 code for [Bosnian][Language.BOSNIAN]. */
+  BOS,
+
+  /** The ISO 639-3 code for [Bulgarian][Language.BULGARIAN]. */
+  BUL,
+
+  /** The ISO 639-3 code for [Catalan][Language.CATALAN]. */
+  CAT,
+
+  /** The ISO 639-3 code for [Czech][Language.CZECH]. */
+  CES,
+
+  /** The ISO 639-3 code for [Welsh][Language.WELSH]. */
+  CYM,
+
+  /** The ISO 639-3 code for [Danish][Language.DANISH]. */
+  DAN,
+
+  /** The ISO 639-3 code for [German][Language.GERMAN]. */
+  DEU,
+
+  /** The ISO 639-3 code for [Greek][Language.GREEK]. */
+  ELL,
+
+  /** The ISO 639-3 code for [English][Language.ENGLISH]. */
+  ENG,
+
+  /** The ISO 639-3 code for [Esperanto][Language.ESPERANTO]. */
+  EPO,
+
+  /** The ISO 639-3 code for [Estonian][Language.ESTONIAN]. */
+  EST,
+
+  /** The ISO 639-3 code for [Basque][Language.BASQUE]. */
+  EUS,
+
+  /** The ISO 639-3 code for [Persian][Language.PERSIAN]. */
+  FAS,
+
+  /** The ISO 639-3 code for [Finnish][Language.FINNISH]. */
+  FIN,
+
+  /** The ISO 639-3 code for [French][Language.FRENCH]. */
+  FRA,
+
+  /** The ISO 639-3 code for [Irish][Language.IRISH]. */
+  GLE,
+
+  /** The ISO 639-3 code for [Gujarati][Language.GUJARATI]. */
+  GUJ,
+
+  /** The ISO 639-3 code for [Hebrew][Language.HEBREW]. */
+  HEB,
+
+  /** The ISO 639-3 code for [Hindi][Language.HINDI]. */
+  HIN,
+
+  /** The ISO 639-3 code for [Croatian][Language.CROATIAN]. */
+  HRV,
+
+  /** The ISO 639-3 code for [Hungarian][Language.HUNGARIAN]. */
+  HUN,
+
+  /** The ISO 639-3 code for [Armenian][Language.ARMENIAN]. */
+  HYE,
+
+  /** The ISO 639-3 code for [Indonesian][Language.INDONESIAN]. */
+  IND,
+
+  /** The ISO 639-3 code for [Icelandic][Language.ICELANDIC]. */
+  ISL,
+
+  /** The ISO 639-3 code for [Italian][Language.ITALIAN]. */
+  ITA,
+
+  /** The ISO 639-3 code for [Japanese][Language.JAPANESE]. */
+  JPN,
+
+  /** The ISO 639-3 code for [Georgian][Language.GEORGIAN]. */
+  KAT,
+
+  /** The ISO 639-3 code for [Kazakh][Language.KAZAKH]. */
+  KAZ,
+
+  /** The ISO 639-3 code for [Korean][Language.KOREAN]. */
+  KOR,
+
+  /** The ISO 639-3 code for [Latin][Language.LATIN]. */
+  LAT,
+
+  /** The ISO 639-3 code for [Latvian][Language.LATVIAN]. */
+  LAV,
+
+  /** The ISO 639-3 code for [Lithuanian][Language.LITHUANIAN]. */
+  LIT,
+
+  /** The ISO 639-3 code for [Ganda][Language.GANDA]. */
+  LUG,
+
+  /** The ISO 639-3 code for [Marathi][Language.MARATHI]. */
+  MAR,
+
+  /** The ISO 639-3 code for [Macedonian][Language.MACEDONIAN]. */
+  MKD,
+
+  /** The ISO 639-3 code for [Mongolian][Language.MONGOLIAN]. */
+  MON,
+
+  /** The ISO 639-3 code for [Maori][Language.MAORI]. */
+  MRI,
+
+  /** The ISO 639-3 code for [Malay][Language.MALAY]. */
+  MSA,
+
+  /** The ISO 639-3 code for [Dutch][Language.DUTCH]. */
+  NLD,
+
+  /** The ISO 639-3 code for [Norwegian Nynorsk][Language.NYNORSK]. */
+  NNO,
+
+  /** The ISO 639-3 code for [Norwegian Bokmal][Language.BOKMAL]. */
+  NOB,
+
+  /** The ISO 639-3 code for [Oromo][Language.OROMO]. */
+  ORM,
+
+  /** The ISO 639-3 code for [Punjabi][Language.PUNJABI]. */
+  PAN,
+
+  /** The ISO 639-3 code for [Polish][Language.POLISH]. */
+  POL,
+
+  /** The ISO 639-3 code for [Portuguese][Language.PORTUGUESE]. */
+  POR,
+
+  /** The ISO 639-3 code for [Romanian][Language.ROMANIAN]. */
+  RON,
+
+  /** The ISO 639-3 code for [Russian][Language.RUSSIAN]. */
+  RUS,
+
+  /** The ISO 639-3 code for [Sinhala][Language.SINHALA]. */
+  SIN,
+
+  /** The ISO 639-3 code for [Slovak][Language.SLOVAK]. */
+  SLK,
+
+  /** The ISO 639-3 code for [Slovene][Language.SLOVENE]. */
+  SLV,
+
+  /** The ISO 639-3 code for [Shona][Language.SHONA]. */
+  SNA,
+
+  /** The ISO 639-3 code for [Somali][Language.SOMALI]. */
+  SOM,
+
+  /** The ISO 639-3 code for [Southern Sotho][Language.SOTHO]. */
+  SOT,
+
+  /** The ISO 639-3 code for [Spanish][Language.SPANISH]. */
+  SPA,
+
+  /** The ISO 639-3 code for [Albanian][Language.ALBANIAN]. */
+  SQI,
+
+  /** The ISO 639-3 code for [Serbian][Language.SERBIAN]. */
+  SRP,
+
+  /** The ISO 639-3 code for [Swahili][Language.SWAHILI]. */
+  SWA,
+
+  /** The ISO 639-3 code for [Swedish][Language.SWEDISH]. */
+  SWE,
+
+  /** The ISO 639-3 code for [Tamil][Language.TAMIL]. */
+  TAM,
+
+  /** The ISO 639-3 code for [Telugu][Language.TELUGU]. */
+  TEL,
+
+  /** The ISO 639-3 code for [Tagalog][Language.TAGALOG]. */
+  TGL,
+
+  /** The ISO 639-3 code for [Thai][Language.THAI]. */
+  THA,
+
+  /** The ISO 639-3 code for [Tigrinya][Language.TIGRINYA]. */
+  TIR,
+
+  /** The ISO 639-3 code for [Tswana][Language.TSWANA]. */
+  TSN,
+
+  /** The ISO 639-3 code for [Tsonga][Language.TSONGA]. */
+  TSO,
+
+  /** The ISO 639-3 code for [Turkish][Language.TURKISH]. */
+  TUR,
+
+  /** The ISO 639-3 code for [Ukrainian][Language.UKRAINIAN]. */
+  UKR,
+
+  /** The ISO 639-3 code for [Urdu][Language.URDU]. */
+  URD,
+
+  /** The ISO 639-3 code for [Vietnamese][Language.VIETNAMESE]. */
+  VIE,
+
+  /** The ISO 639-3 code for [Xhosa][Language.XHOSA]. */
+  XHO,
+
+  /** The ISO 639-3 code for [Yoruba][Language.YORUBA]. */
+  YOR,
+
+  /** The ISO 639-3 code for [Chinese][Language.CHINESE]. */
+  ZHO,
+
+  /** The ISO 639-3 code for [Zulu][Language.ZULU]. */
+  ZUL,
+
+  /** The ISO 639-3 code for [the imaginary unknown language][Language.UNKNOWN]. */
+  NONE;
+
+  @Override
+  public String toString() {
+    return name().toLowerCase(java.util.Locale.ROOT); // fixed locale: avoids Turkish dotless i
+  }
+}

From d2a22a9ec22582a0aa8df3743aa93bf7752e6e30 Mon Sep 17 00:00:00 2001
From: Alexander Zagniotov <azagniotov@box.com>
Date: Mon, 18 Nov 2024 14:16:22 -0800
Subject: [PATCH 06/11] internal pkg: migrated Alphabet

---
 .../pemistahl/lingua/internal/Alphabet.java   | 86 +++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/Alphabet.java

diff --git a/src/main/java/com/github/pemistahl/lingua/internal/Alphabet.java b/src/main/java/com/github/pemistahl/lingua/internal/Alphabet.java
new file mode 100644
index 00000000..d257ba88
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/internal/Alphabet.java
@@ -0,0 +1,86 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.internal;
+
+import com.github.pemistahl.lingua.api.Language;
+import java.lang.Character.UnicodeScript;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+public enum Alphabet {
+  ARABIC,
+  ARMENIAN,
+  BENGALI,
+  CYRILLIC,
+  DEVANAGARI,
+  ETHIOPIC,
+  GEORGIAN,
+  GREEK,
+  GUJARATI,
+  GURMUKHI,
+  HAN,
+  HANGUL,
+  HEBREW,
+  HIRAGANA,
+  KATAKANA,
+  LATIN,
+  SINHALA,
+  TAMIL,
+  TELUGU,
+  THAI,
+  NONE;
+
+  private UnicodeScript script;
+
+  Alphabet() {
+    try {
+      this.script = UnicodeScript.forName(this.name());
+    } catch (IllegalArgumentException e) {
+      this.script = null;
+    }
+  }
+
+  public boolean matches(char chr) {
+    return UnicodeScript.of(chr) == this.script;
+  }
+
+  public boolean matches(CharSequence input) {
+    return input.codePoints().allMatch(codePoint -> UnicodeScript.of(codePoint) == this.script);
+  }
+
+  private Set<Language> supportedLanguages() {
+    return EnumSet.allOf(Language.class).stream()
+        .filter(language -> language.getAlphabets().contains(this))
+        .collect(Collectors.toSet());
+  }
+
+  public static Map<Alphabet, Language> allSupportingExactlyOneLanguage() {
+    Map<Alphabet, Language> alphabets = new HashMap<>();
+    for (Alphabet alphabet : values()) {
+      if (alphabet != NONE) {
+        Set<Language> supportedLanguages = alphabet.supportedLanguages();
+        if (supportedLanguages.size() == 1) {
+          alphabets.put(alphabet, supportedLanguages.iterator().next());
+        }
+      }
+    }
+    return alphabets;
+  }
+}

From 62867b379f9e5b855b3867a2af2cf61b81179737 Mon Sep 17 00:00:00 2001
From: Alexander Zagniotov <azagniotov@box.com>
Date: Mon, 18 Nov 2024 14:17:27 -0800
Subject: [PATCH 07/11] api pkg: migrated Language and LanguageTest

---
 .../github/pemistahl/lingua/api/Language.java | 383 ++++++++++++++
 .../pemistahl/lingua/api/LanguageTest.java    | 499 ++++++++++++++++++
 2 files changed, 882 insertions(+)
 create mode 100644 src/main/java/com/github/pemistahl/lingua/api/Language.java
 create mode 100644 src/test/java/com/github/pemistahl/lingua/api/LanguageTest.java

diff --git a/src/main/java/com/github/pemistahl/lingua/api/Language.java b/src/main/java/com/github/pemistahl/lingua/api/Language.java
new file mode 100644
index 00000000..29c0a061
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/api/Language.java
@@ -0,0 +1,383 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.api;
+
+import static com.github.pemistahl.lingua.api.IsoCode639_1.AF;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.AM;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.AR;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.AZ;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.BE;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.BG;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.BN;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.BS;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.CA;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.CS;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.CY;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.DA;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.DE;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.EL;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.EN;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.EO;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.ES;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.ET;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.EU;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.FA;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.FI;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.FR;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.GA;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.GU;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.HE;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.HI;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.HR;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.HU;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.HY;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.ID;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.IS;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.IT;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.JA;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.KA;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.KK;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.KO;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.LA;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.LG;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.LT;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.LV;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.MI;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.MK;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.MN;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.MR;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.MS;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.NB;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.NL;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.NN;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.OM;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.PA;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.PL;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.PT;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.RO;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.RU;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.SI;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.SK;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.SL;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.SN;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.SO;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.SQ;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.SR;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.ST;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.SV;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.SW;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.TA;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.TE;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.TH;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.TI;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.TL;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.TN;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.TR;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.TS;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.UK;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.UR;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.VI;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.XH;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.YO;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.ZH;
+import static com.github.pemistahl.lingua.api.IsoCode639_1.ZU;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.AFR;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.AMH;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.ARA;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.AZE;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.BEL;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.BEN;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.BOS;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.BUL;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.CAT;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.CES;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.CYM;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.DAN;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.DEU;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.ELL;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.ENG;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.EPO;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.EST;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.EUS;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.FAS;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.FIN;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.FRA;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.GLE;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.GUJ;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.HEB;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.HIN;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.HRV;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.HUN;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.HYE;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.IND;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.ISL;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.ITA;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.JPN;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.KAT;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.KAZ;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.KOR;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.LAT;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.LAV;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.LIT;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.LUG;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.MAR;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.MKD;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.MON;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.MRI;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.MSA;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.NLD;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.NNO;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.NOB;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.ORM;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.PAN;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.POL;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.POR;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.RON;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.RUS;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.SIN;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.SLK;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.SLV;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.SNA;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.SOM;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.SOT;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.SPA;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.SQI;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.SRP;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.SWA;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.SWE;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.TAM;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.TEL;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.TGL;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.THA;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.TIR;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.TSN;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.TSO;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.TUR;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.UKR;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.URD;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.VIE;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.XHO;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.YOR;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.ZHO;
+import static com.github.pemistahl.lingua.api.IsoCode639_3.ZUL;
+import static com.github.pemistahl.lingua.internal.Alphabet.CYRILLIC;
+import static com.github.pemistahl.lingua.internal.Alphabet.DEVANAGARI;
+import static com.github.pemistahl.lingua.internal.Alphabet.GURMUKHI;
+import static com.github.pemistahl.lingua.internal.Alphabet.HAN;
+import static com.github.pemistahl.lingua.internal.Alphabet.HANGUL;
+import static com.github.pemistahl.lingua.internal.Alphabet.HIRAGANA;
+import static com.github.pemistahl.lingua.internal.Alphabet.KATAKANA;
+import static com.github.pemistahl.lingua.internal.Alphabet.NONE;
+import static com.github.pemistahl.lingua.internal.util.extension.EnumExtensions.enumSetOf;
+
+import com.github.pemistahl.lingua.internal.Alphabet;
+import java.util.Arrays;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.Optional;
+import java.util.stream.Collectors;
+
+/** The supported detectable languages. */
+public enum Language {
+  AFRIKAANS(AF, AFR, enumSetOf(Alphabet.LATIN), null),
+  ALBANIAN(SQ, SQI, enumSetOf(Alphabet.LATIN), null),
+  AMHARIC(AM, AMH, enumSetOf(Alphabet.ETHIOPIC), null),
+  ARABIC(AR, ARA, enumSetOf(Alphabet.ARABIC), null),
+  ARMENIAN(HY, HYE, enumSetOf(Alphabet.ARMENIAN), null),
+  AZERBAIJANI(AZ, AZE, enumSetOf(Alphabet.LATIN), "Əə"),
+  BASQUE(EU, EUS, enumSetOf(Alphabet.LATIN), null),
+  BELARUSIAN(BE, BEL, enumSetOf(CYRILLIC), null),
+  BENGALI(BN, BEN, enumSetOf(Alphabet.BENGALI), null),
+  BOKMAL(NB, NOB, enumSetOf(Alphabet.LATIN), null),
+  BOSNIAN(BS, BOS, enumSetOf(Alphabet.LATIN), null),
+  BULGARIAN(BG, BUL, enumSetOf(CYRILLIC), null),
+  CATALAN(CA, CAT, enumSetOf(Alphabet.LATIN), "Ïï"),
+  CHINESE(ZH, ZHO, enumSetOf(HAN), null),
+  CROATIAN(HR, HRV, enumSetOf(Alphabet.LATIN), null),
+  CZECH(CS, CES, enumSetOf(Alphabet.LATIN), "ĚěŘřŮů"),
+  DANISH(DA, DAN, enumSetOf(Alphabet.LATIN), null),
+  DUTCH(NL, NLD, enumSetOf(Alphabet.LATIN), null),
+  ENGLISH(EN, ENG, enumSetOf(Alphabet.LATIN), null),
+  ESPERANTO(EO, EPO, enumSetOf(Alphabet.LATIN), "ĈĉĜĝĤĥĴĵŜŝŬŭ"),
+  ESTONIAN(ET, EST, enumSetOf(Alphabet.LATIN), null),
+  FINNISH(FI, FIN, enumSetOf(Alphabet.LATIN), null),
+  FRENCH(FR, FRA, enumSetOf(Alphabet.LATIN), null),
+  GANDA(LG, LUG, enumSetOf(Alphabet.LATIN), null),
+  GEORGIAN(KA, KAT, enumSetOf(Alphabet.GEORGIAN), null),
+  GERMAN(DE, DEU, enumSetOf(Alphabet.LATIN), "ß"),
+  GREEK(EL, ELL, enumSetOf(Alphabet.GREEK), null),
+  GUJARATI(GU, GUJ, enumSetOf(Alphabet.GUJARATI), null),
+  HEBREW(HE, HEB, enumSetOf(Alphabet.HEBREW), null),
+  HINDI(HI, HIN, enumSetOf(DEVANAGARI), null),
+  HUNGARIAN(HU, HUN, enumSetOf(Alphabet.LATIN), "ŐőŰű"),
+  ICELANDIC(IS, ISL, enumSetOf(Alphabet.LATIN), null),
+  INDONESIAN(ID, IND, enumSetOf(Alphabet.LATIN), null),
+  IRISH(GA, GLE, enumSetOf(Alphabet.LATIN), null),
+  ITALIAN(IT, ITA, enumSetOf(Alphabet.LATIN), null),
+  JAPANESE(JA, JPN, enumSetOf(HIRAGANA, KATAKANA, HAN), null),
+  KAZAKH(KK, KAZ, enumSetOf(CYRILLIC), "ӘәҒғҚқҢңҰұ"),
+  KOREAN(KO, KOR, enumSetOf(HANGUL), null),
+  LATIN(LA, LAT, enumSetOf(Alphabet.LATIN), null),
+  LATVIAN(LV, LAV, enumSetOf(Alphabet.LATIN), "ĢģĶķĻļŅņ"),
+  LITHUANIAN(LT, LIT, enumSetOf(Alphabet.LATIN), "ĖėĮįŲų"),
+  MACEDONIAN(MK, MKD, enumSetOf(CYRILLIC), "ЃѓЅѕЌќЏџ"),
+  MALAY(MS, MSA, enumSetOf(Alphabet.LATIN), null),
+  MAORI(MI, MRI, enumSetOf(Alphabet.LATIN), null),
+  MARATHI(MR, MAR, enumSetOf(DEVANAGARI), "ळ"),
+  MONGOLIAN(MN, MON, enumSetOf(CYRILLIC), "ӨөҮү"),
+  NYNORSK(NN, NNO, enumSetOf(Alphabet.LATIN), null),
+  OROMO(OM, ORM, enumSetOf(Alphabet.LATIN), null),
+  PERSIAN(FA, FAS, enumSetOf(Alphabet.ARABIC), null),
+  POLISH(PL, POL, enumSetOf(Alphabet.LATIN), "ŁłŃńŚśŹź"),
+  PORTUGUESE(PT, POR, enumSetOf(Alphabet.LATIN), null),
+  PUNJABI(PA, PAN, enumSetOf(GURMUKHI), null),
+  ROMANIAN(RO, RON, enumSetOf(Alphabet.LATIN), "Țţ"),
+  RUSSIAN(RU, RUS, enumSetOf(CYRILLIC), null),
+  SERBIAN(SR, SRP, enumSetOf(CYRILLIC), "ЂђЋћ"),
+  SHONA(SN, SNA, enumSetOf(Alphabet.LATIN), null),
+  SINHALA(SI, SIN, enumSetOf(Alphabet.SINHALA), null),
+  SLOVAK(SK, SLK, enumSetOf(Alphabet.LATIN), "ĹĺĽľŔŕ"),
+  SLOVENE(SL, SLV, enumSetOf(Alphabet.LATIN), null),
+  SOMALI(SO, SOM, enumSetOf(Alphabet.LATIN), null),
+  SOTHO(ST, SOT, enumSetOf(Alphabet.LATIN), null),
+  SPANISH(ES, SPA, enumSetOf(Alphabet.LATIN), "¿¡"),
+  SWAHILI(SW, SWA, enumSetOf(Alphabet.LATIN), null),
+  SWEDISH(SV, SWE, enumSetOf(Alphabet.LATIN), null),
+  TAGALOG(TL, TGL, enumSetOf(Alphabet.LATIN), null),
+  TAMIL(TA, TAM, enumSetOf(Alphabet.TAMIL), null),
+  TELUGU(TE, TEL, enumSetOf(Alphabet.TELUGU), null),
+  THAI(TH, THA, enumSetOf(Alphabet.THAI), null),
+  TIGRINYA(TI, TIR, enumSetOf(Alphabet.ETHIOPIC), null),
+  TSONGA(TS, TSO, enumSetOf(Alphabet.LATIN), null),
+  TSWANA(TN, TSN, enumSetOf(Alphabet.LATIN), null),
+  TURKISH(TR, TUR, enumSetOf(Alphabet.LATIN), null),
+  UKRAINIAN(UK, UKR, enumSetOf(CYRILLIC), "ҐґЄєЇї"),
+  URDU(UR, URD, enumSetOf(Alphabet.ARABIC), null),
+  VIETNAMESE(
+      VI,
+      VIE,
+      enumSetOf(Alphabet.LATIN),
+      "ẰằẦầẲẳẨẩẴẵẪẫẮắẤấẠạẶặẬậỀềẺẻỂểẼẽỄễẾếỆệỈỉĨĩỊịƠơỒồỜờỎỏỔổỞởỖỗỠỡỐốỚớỘộỢợƯưỪừỦủỬửŨũỮữỨứỤụỰựỲỳỶỷỸỹỴỵ"),
+  WELSH(CY, CYM, enumSetOf(Alphabet.LATIN), null),
+  XHOSA(XH, XHO, enumSetOf(Alphabet.LATIN), null),
+  // TODO for YORUBA: "E̩e̩Ẹ́ẹ́É̩é̩Ẹ̀ẹ̀È̩è̩Ẹ̄ẹ̄Ē̩ē̩ŌōO̩o̩Ọ́ọ́Ó̩ó̩Ọ̀ọ̀Ò̩ò̩Ọ̄ọ̄Ō̩ō̩ṢṣS̩s̩"
+  YORUBA(YO, YOR, enumSetOf(Alphabet.LATIN), "Ṣṣ"),
+  ZULU(ZU, ZUL, enumSetOf(Alphabet.LATIN), null),
+
+  /**
+   * The imaginary unknown language.
+   *
+   * <p>This value is returned if no language can be detected reliably.
+   */
+  UNKNOWN(IsoCode639_1.NONE, IsoCode639_3.NONE, enumSetOf(NONE), null);
+
+  private final IsoCode639_1 isoCode639_1;
+  private final IsoCode639_3 isoCode639_3;
+  private final EnumSet<Alphabet> alphabets;
+  private final String uniqueCharacters;
+
+  Language(
+      final IsoCode639_1 isoCode639_1,
+      final IsoCode639_3 isoCode639_3,
+      final EnumSet<Alphabet> alphabets,
+      final String uniqueCharacters) {
+    this.isoCode639_1 = isoCode639_1;
+    this.isoCode639_3 = isoCode639_3;
+    this.alphabets = alphabets;
+    this.uniqueCharacters = uniqueCharacters;
+  }
+
+  public IsoCode639_1 getIsoCode639_1() {
+    return isoCode639_1;
+  }
+
+  public IsoCode639_3 getIsoCode639_3() {
+    return isoCode639_3;
+  }
+
+  public EnumSet<Alphabet> getAlphabets() {
+    // Defensive copy: EnumSet is mutable, so the internal set is never handed out.
+    return EnumSet.copyOf(this.alphabets);
+  }
+
+  public String getUniqueCharacters() {
+    // Languages without unique characters store null; expose that as an empty string.
+    return Optional.ofNullable(uniqueCharacters).orElse("");
+  }
+
+  public static List<Language> all() {
+    return filterOutLanguages(UNKNOWN);
+  }
+
+  public static List<Language> allSpokenOnes() {
+    return filterOutLanguages(UNKNOWN, LATIN);
+  }
+
+  public static List<Language> allWithArabicScript() {
+    return Arrays.stream(values())
+        .filter(language -> language.alphabets.contains(Alphabet.ARABIC))
+        .collect(Collectors.toList());
+  }
+
+  public static List<Language> allWithCyrillicScript() {
+    return Arrays.stream(values())
+        .filter(language -> language.alphabets.contains(CYRILLIC))
+        .collect(Collectors.toList());
+  }
+
+  public static List<Language> allWithDevanagariScript() {
+    return Arrays.stream(values())
+        .filter(language -> language.alphabets.contains(DEVANAGARI))
+        .collect(Collectors.toList());
+  }
+
+  public static List<Language> allWithEthiopicScript() {
+    return Arrays.stream(values())
+        .filter(language -> language.alphabets.contains(Alphabet.ETHIOPIC))
+        .collect(Collectors.toList());
+  }
+
+  public static List<Language> allWithLatinScript() {
+    return Arrays.stream(values())
+        .filter(language -> language.alphabets.contains(Alphabet.LATIN))
+        .collect(Collectors.toList());
+  }
+
+  public static Language getByIsoCode639_1(IsoCode639_1 isoCode) {
+    for (final Language candidate : values()) {
+      if (candidate.isoCode639_1 == isoCode) {
+        return candidate; // first match in declaration order
+      }
+    }
+    throw new IllegalArgumentException("No language found with ISO code 639-1: " + isoCode);
+  }
+
+  public static Language getByIsoCode639_3(IsoCode639_3 isoCode) {
+    for (final Language candidate : values()) {
+      if (candidate.isoCode639_3 == isoCode) {
+        return candidate; // first match in declaration order
+      }
+    }
+    throw new IllegalArgumentException("No language found with ISO code 639-3: " + isoCode);
+  }
+
+  private static List<Language> filterOutLanguages(Language... languages) {
+    return Arrays.stream(values())
+        .filter(candidate -> Arrays.stream(languages).noneMatch(candidate::equals))
+        .collect(Collectors.toList());
+  }
+}
diff --git a/src/test/java/com/github/pemistahl/lingua/api/LanguageTest.java b/src/test/java/com/github/pemistahl/lingua/api/LanguageTest.java
new file mode 100644
index 00000000..8254027a
--- /dev/null
+++ b/src/test/java/com/github/pemistahl/lingua/api/LanguageTest.java
@@ -0,0 +1,499 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.api;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.jupiter.params.provider.Arguments.arguments;
+
+import com.github.pemistahl.lingua.internal.Alphabet;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.CsvSource;
+import org.junit.jupiter.params.provider.MethodSource;
+
+/**
+ * This class contains various tests related to supported languages and their corresponding scripts.
+ *
+ * <p>Author: Peter M. Stahl &lt;pemistahl@gmail.com&gt;<br>
+ * Migration to Java from Kotlin by Alexander Zagniotov &lt;azagniotov@gmail.com&gt;
+ */
+public class LanguageTest {
+
+  /** Asserts that all supported languages are available. */
+  @Test
+  public void assertThatAllSupportedLanguagesAreAvailable() {
+    assertThat(Language.all())
+        .containsExactly(
+            Language.AFRIKAANS,
+            Language.ALBANIAN,
+            Language.AMHARIC,
+            Language.ARABIC,
+            Language.ARMENIAN,
+            Language.AZERBAIJANI,
+            Language.BASQUE,
+            Language.BELARUSIAN,
+            Language.BENGALI,
+            Language.BOKMAL,
+            Language.BOSNIAN,
+            Language.BULGARIAN,
+            Language.CATALAN,
+            Language.CHINESE,
+            Language.CROATIAN,
+            Language.CZECH,
+            Language.DANISH,
+            Language.DUTCH,
+            Language.ENGLISH,
+            Language.ESPERANTO,
+            Language.ESTONIAN,
+            Language.FINNISH,
+            Language.FRENCH,
+            Language.GANDA,
+            Language.GEORGIAN,
+            Language.GERMAN,
+            Language.GREEK,
+            Language.GUJARATI,
+            Language.HEBREW,
+            Language.HINDI,
+            Language.HUNGARIAN,
+            Language.ICELANDIC,
+            Language.INDONESIAN,
+            Language.IRISH,
+            Language.ITALIAN,
+            Language.JAPANESE,
+            Language.KAZAKH,
+            Language.KOREAN,
+            Language.LATIN,
+            Language.LATVIAN,
+            Language.LITHUANIAN,
+            Language.MACEDONIAN,
+            Language.MALAY,
+            Language.MAORI,
+            Language.MARATHI,
+            Language.MONGOLIAN,
+            Language.NYNORSK,
+            Language.OROMO,
+            Language.PERSIAN,
+            Language.POLISH,
+            Language.PORTUGUESE,
+            Language.PUNJABI,
+            Language.ROMANIAN,
+            Language.RUSSIAN,
+            Language.SERBIAN,
+            Language.SHONA,
+            Language.SINHALA,
+            Language.SLOVAK,
+            Language.SLOVENE,
+            Language.SOMALI,
+            Language.SOTHO,
+            Language.SPANISH,
+            Language.SWAHILI,
+            Language.SWEDISH,
+            Language.TAGALOG,
+            Language.TAMIL,
+            Language.TELUGU,
+            Language.THAI,
+            Language.TIGRINYA,
+            Language.TSONGA,
+            Language.TSWANA,
+            Language.TURKISH,
+            Language.UKRAINIAN,
+            Language.URDU,
+            Language.VIETNAMESE,
+            Language.WELSH,
+            Language.XHOSA,
+            Language.YORUBA,
+            Language.ZULU);
+  }
+
+  /** Asserts that all supported spoken languages are available. */
+  @Test
+  public void assertThatAllSupportedSpokenLanguagesAreAvailable() {
+    assertThat(Language.allSpokenOnes())
+        .containsExactly(
+            Language.AFRIKAANS,
+            Language.ALBANIAN,
+            Language.AMHARIC,
+            Language.ARABIC,
+            Language.ARMENIAN,
+            Language.AZERBAIJANI,
+            Language.BASQUE,
+            Language.BELARUSIAN,
+            Language.BENGALI,
+            Language.BOKMAL,
+            Language.BOSNIAN,
+            Language.BULGARIAN,
+            Language.CATALAN,
+            Language.CHINESE,
+            Language.CROATIAN,
+            Language.CZECH,
+            Language.DANISH,
+            Language.DUTCH,
+            Language.ENGLISH,
+            Language.ESPERANTO,
+            Language.ESTONIAN,
+            Language.FINNISH,
+            Language.FRENCH,
+            Language.GANDA,
+            Language.GEORGIAN,
+            Language.GERMAN,
+            Language.GREEK,
+            Language.GUJARATI,
+            Language.HEBREW,
+            Language.HINDI,
+            Language.HUNGARIAN,
+            Language.ICELANDIC,
+            Language.INDONESIAN,
+            Language.IRISH,
+            Language.ITALIAN,
+            Language.JAPANESE,
+            Language.KAZAKH,
+            Language.KOREAN,
+            Language.LATVIAN,
+            Language.LITHUANIAN,
+            Language.MACEDONIAN,
+            Language.MALAY,
+            Language.MAORI,
+            Language.MARATHI,
+            Language.MONGOLIAN,
+            Language.NYNORSK,
+            Language.OROMO,
+            Language.PERSIAN,
+            Language.POLISH,
+            Language.PORTUGUESE,
+            Language.PUNJABI,
+            Language.ROMANIAN,
+            Language.RUSSIAN,
+            Language.SERBIAN,
+            Language.SHONA,
+            Language.SINHALA,
+            Language.SLOVAK,
+            Language.SLOVENE,
+            Language.SOMALI,
+            Language.SOTHO,
+            Language.SPANISH,
+            Language.SWAHILI,
+            Language.SWEDISH,
+            Language.TAGALOG,
+            Language.TAMIL,
+            Language.TELUGU,
+            Language.THAI,
+            Language.TIGRINYA,
+            Language.TSONGA,
+            Language.TSWANA,
+            Language.TURKISH,
+            Language.UKRAINIAN,
+            Language.URDU,
+            Language.VIETNAMESE,
+            Language.WELSH,
+            Language.XHOSA,
+            Language.YORUBA,
+            Language.ZULU);
+  }
+
+  /** Asserts that certain languages support Arabic script. */
+  @Test
+  public void assertThatCertainLanguagesSupportArabicScript() {
+    assertThat(Language.allWithArabicScript())
+        .containsExactly(Language.ARABIC, Language.PERSIAN, Language.URDU);
+  }
+
+  /** Asserts that certain languages support Cyrillic script. */
+  @Test
+  public void assertThatCertainLanguagesSupportCyrillicScript() {
+    assertThat(Language.allWithCyrillicScript())
+        .containsExactly(
+            Language.BELARUSIAN,
+            Language.BULGARIAN,
+            Language.KAZAKH,
+            Language.MACEDONIAN,
+            Language.MONGOLIAN,
+            Language.RUSSIAN,
+            Language.SERBIAN,
+            Language.UKRAINIAN);
+  }
+
+  /** Asserts that certain languages support Devanagari script. */
+  @Test
+  public void assertThatCertainLanguagesSupportDevanagariScript() {
+    assertThat(Language.allWithDevanagariScript())
+        .containsExactly(Language.HINDI, Language.MARATHI);
+  }
+
+  /** Asserts that certain languages support Ethiopic script. */
+  @Test
+  public void assertThatCertainLanguagesSupportEthiopicScript() {
+    assertThat(Language.allWithEthiopicScript())
+        .containsExactly(Language.AMHARIC, Language.TIGRINYA);
+  }
+
+  /** Asserts that certain languages support Latin script. */
+  @Test
+  public void assertThatCertainLanguagesSupportLatinScript() {
+    assertThat(Language.allWithLatinScript())
+        .containsExactly(
+            Language.AFRIKAANS,
+            Language.ALBANIAN,
+            Language.AZERBAIJANI,
+            Language.BASQUE,
+            Language.BOKMAL,
+            Language.BOSNIAN,
+            Language.CATALAN,
+            Language.CROATIAN,
+            Language.CZECH,
+            Language.DANISH,
+            Language.DUTCH,
+            Language.ENGLISH,
+            Language.ESPERANTO,
+            Language.ESTONIAN,
+            Language.FINNISH,
+            Language.FRENCH,
+            Language.GANDA,
+            Language.GERMAN,
+            Language.HUNGARIAN,
+            Language.ICELANDIC,
+            Language.INDONESIAN,
+            Language.IRISH,
+            Language.ITALIAN,
+            Language.LATIN,
+            Language.LATVIAN,
+            Language.LITHUANIAN,
+            Language.MALAY,
+            Language.MAORI,
+            Language.NYNORSK,
+            Language.OROMO,
+            Language.POLISH,
+            Language.PORTUGUESE,
+            Language.ROMANIAN,
+            Language.SHONA,
+            Language.SLOVAK,
+            Language.SLOVENE,
+            Language.SOMALI,
+            Language.SOTHO,
+            Language.SPANISH,
+            Language.SWAHILI,
+            Language.SWEDISH,
+            Language.TAGALOG,
+            Language.TSONGA,
+            Language.TSWANA,
+            Language.TURKISH,
+            Language.VIETNAMESE,
+            Language.WELSH,
+            Language.XHOSA,
+            Language.YORUBA,
+            Language.ZULU);
+  }
+
+  @ParameterizedTest
+  @MethodSource("filteredLanguagesProvider")
+  public void assertThatLanguagesSupportCorrectAlphabets(
+      Alphabet alphabet, List<Language> expectedLanguages) {
+    final List<Language> actualLanguages =
+        Stream.of(Language.values())
+            .filter(language -> language.getAlphabets().contains(alphabet))
+            .collect(Collectors.toList());
+
+    assertThat(actualLanguages)
+        .as("alphabet '%s'", alphabet)
+        .containsExactlyElementsOf(expectedLanguages);
+  }
+
+  /** Verifies that every ISO 639-1 code resolves to the language it identifies. */
+  @ParameterizedTest
+  @CsvSource({
+    "AF, AFRIKAANS",
+    "SQ, ALBANIAN",
+    "AM, AMHARIC",
+    "AR, ARABIC",
+    "HY, ARMENIAN",
+    "AZ, AZERBAIJANI",
+    "EU, BASQUE",
+    "BE, BELARUSIAN",
+    "BN, BENGALI",
+    "NB, BOKMAL",
+    "BS, BOSNIAN",
+    "BG, BULGARIAN",
+    "CA, CATALAN",
+    "ZH, CHINESE",
+    "HR, CROATIAN",
+    "CS, CZECH",
+    "DA, DANISH",
+    "NL, DUTCH",
+    "EN, ENGLISH",
+    "EO, ESPERANTO",
+    "ET, ESTONIAN",
+    "FI, FINNISH",
+    "FR, FRENCH",
+    "LG, GANDA",
+    "KA, GEORGIAN",
+    "DE, GERMAN",
+    "EL, GREEK",
+    "GU, GUJARATI",
+    "HE, HEBREW",
+    "HI, HINDI",
+    "HU, HUNGARIAN",
+    "IS, ICELANDIC",
+    "ID, INDONESIAN",
+    "GA, IRISH",
+    "IT, ITALIAN",
+    "JA, JAPANESE",
+    "KK, KAZAKH",
+    "KO, KOREAN",
+    "LA, LATIN",
+    "LV, LATVIAN",
+    "LT, LITHUANIAN",
+    "MK, MACEDONIAN",
+    "MS, MALAY",
+    "MI, MAORI",
+    "MR, MARATHI",
+    "MN, MONGOLIAN",
+    "NN, NYNORSK",
+    "OM, OROMO",
+    "FA, PERSIAN",
+    "PL, POLISH",
+    "PT, PORTUGUESE",
+    "PA, PUNJABI",
+    "RO, ROMANIAN",
+    "RU, RUSSIAN",
+    "SR, SERBIAN",
+    "SN, SHONA",
+    "SI, SINHALA",
+    "SK, SLOVAK",
+    "SL, SLOVENE",
+    "SO, SOMALI",
+    "ST, SOTHO",
+    "ES, SPANISH",
+    "SW, SWAHILI",
+    "SV, SWEDISH",
+    "TL, TAGALOG",
+    "TA, TAMIL",
+    "TE, TELUGU",
+    "TH, THAI",
+    "TI, TIGRINYA",
+    "TS, TSONGA",
+    "TN, TSWANA",
+    "TR, TURKISH",
+    "UK, UKRAINIAN",
+    "UR, URDU",
+    "VI, VIETNAMESE",
+    "CY, WELSH",
+    "XH, XHOSA",
+    "YO, YORUBA",
+    "ZU, ZULU"
+  })
+  public void assertThatLanguageIsReturnedForIso6391Code(
+      String isoCode, Language expectedLanguage) {
+    final IsoCode639_1 parsedIsoCode = IsoCode639_1.valueOf(isoCode);
+    assertThat(Language.getByIsoCode639_1(parsedIsoCode)).isEqualTo(expectedLanguage);
+  }
+
+  /**
+   * Supplies argument pairs for the parameterized alphabet test: an alphabet and the languages
+   * expected to use it, listed in the order that test presumably asserts.
+   *
+   * @return A stream of arguments containing an alphabet and a list of languages using it.
+   */
+  public static Stream<Arguments> filteredLanguagesProvider() {
+    return Stream.of(
+        arguments(Alphabet.ARABIC, List.of(Language.ARABIC, Language.PERSIAN, Language.URDU)),
+        arguments(Alphabet.ARMENIAN, List.of(Language.ARMENIAN)),
+        arguments(Alphabet.BENGALI, List.of(Language.BENGALI)),
+        arguments(
+            Alphabet.CYRILLIC,
+            List.of(
+                Language.BELARUSIAN,
+                Language.BULGARIAN,
+                Language.KAZAKH,
+                Language.MACEDONIAN,
+                Language.MONGOLIAN,
+                Language.RUSSIAN,
+                Language.SERBIAN,
+                Language.UKRAINIAN)),
+        arguments(Alphabet.DEVANAGARI, List.of(Language.HINDI, Language.MARATHI)),
+        arguments(Alphabet.ETHIOPIC, List.of(Language.AMHARIC, Language.TIGRINYA)),
+        arguments(Alphabet.GEORGIAN, List.of(Language.GEORGIAN)),
+        arguments(Alphabet.GREEK, List.of(Language.GREEK)),
+        arguments(Alphabet.GUJARATI, List.of(Language.GUJARATI)),
+        arguments(Alphabet.GURMUKHI, List.of(Language.PUNJABI)),
+        arguments(Alphabet.HAN, List.of(Language.CHINESE, Language.JAPANESE)),
+        arguments(Alphabet.HANGUL, List.of(Language.KOREAN)),
+        arguments(Alphabet.HEBREW, List.of(Language.HEBREW)),
+        arguments(Alphabet.HIRAGANA, List.of(Language.JAPANESE)),
+        arguments(Alphabet.KATAKANA, List.of(Language.JAPANESE)),
+        arguments(
+            Alphabet.LATIN,
+            List.of(
+                Language.AFRIKAANS,
+                Language.ALBANIAN,
+                Language.AZERBAIJANI,
+                Language.BASQUE,
+                Language.BOKMAL,
+                Language.BOSNIAN,
+                Language.CATALAN,
+                Language.CROATIAN,
+                Language.CZECH,
+                Language.DANISH,
+                Language.DUTCH,
+                Language.ENGLISH,
+                Language.ESPERANTO,
+                Language.ESTONIAN,
+                Language.FINNISH,
+                Language.FRENCH,
+                Language.GANDA,
+                Language.GERMAN,
+                Language.HUNGARIAN,
+                Language.ICELANDIC,
+                Language.INDONESIAN,
+                Language.IRISH,
+                Language.ITALIAN,
+                Language.LATIN,
+                Language.LATVIAN,
+                Language.LITHUANIAN,
+                Language.MALAY,
+                Language.MAORI,
+                Language.NYNORSK,
+                Language.OROMO,
+                Language.POLISH,
+                Language.PORTUGUESE,
+                Language.ROMANIAN,
+                Language.SHONA,
+                Language.SLOVAK,
+                Language.SLOVENE,
+                Language.SOMALI,
+                Language.SOTHO,
+                Language.SPANISH,
+                Language.SWAHILI,
+                Language.SWEDISH,
+                Language.TAGALOG,
+                Language.TSONGA,
+                Language.TSWANA,
+                Language.TURKISH,
+                Language.VIETNAMESE,
+                Language.WELSH,
+                Language.XHOSA,
+                Language.YORUBA,
+                Language.ZULU)),
+        arguments(Alphabet.SINHALA, List.of(Language.SINHALA)),
+        arguments(Alphabet.TAMIL, List.of(Language.TAMIL)),
+        arguments(Alphabet.TELUGU, List.of(Language.TELUGU)),
+        arguments(Alphabet.THAI, List.of(Language.THAI)),
+        arguments(Alphabet.NONE, List.of(Language.UNKNOWN)));
+  }
+}

From c814c353157398f3a896c94d0c487537df99c57a Mon Sep 17 00:00:00 2001
From: Alexander Zagniotov <azagniotov@box.com>
Date: Mon, 18 Nov 2024 14:18:01 -0800
Subject: [PATCH 08/11] internal pkg: migrated Constant

---
 .../pemistahl/lingua/internal/Constant.java   | 251 ++++++++++++++++++
 1 file changed, 251 insertions(+)
 create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/Constant.java

diff --git a/src/main/java/com/github/pemistahl/lingua/internal/Constant.java b/src/main/java/com/github/pemistahl/lingua/internal/Constant.java
new file mode 100644
index 00000000..18417cd2
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/internal/Constant.java
@@ -0,0 +1,251 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.internal;
+
+import com.github.pemistahl.lingua.api.Language;
+import java.lang.Character.UnicodeScript;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * Constants and a character-script helper for the library's internal package.
+ *
+ * <p>The exposed collections are mutable instances; callers should treat them as read-only.
+ */
+public final class Constant {
+
+  /**
+   * Maps each group of characters (the key) to the set of languages registered for it.
+   *
+   * <p>The entries are added once, in the static initializer of this class.
+   */
+  public static final Map<String, Set<Language>> CHARS_TO_LANGUAGES_MAPPING;
+
+  /**
+   * Tells whether the Unicode script of the given character is Hiragana, Katakana or Han.
+   *
+   * <p>The argument is a single UTF-16 code unit, so supplementary characters cannot be tested.
+   *
+   * @param charValue the character to inspect
+   * @return {@code true} if the character's script is one of the three scripts named above
+   */
+  public static boolean isJapaneseAlphabet(char charValue) {
+    UnicodeScript script = UnicodeScript.of(charValue);
+    return script == UnicodeScript.HIRAGANA
+        || script == UnicodeScript.KATAKANA
+        || script == UnicodeScript.HAN;
+  }
+
+  /** Chinese, Japanese and Korean: the languages marked here as supporting logograms. */
+  public static final EnumSet<Language> LANGUAGES_SUPPORTING_LOGOGRAMS =
+      EnumSet.of(Language.CHINESE, Language.JAPANESE, Language.KOREAN);
+
+  /** Matches a run of one or more whitespace characters. */
+  public static final Pattern MULTIPLE_WHITESPACE = Pattern.compile("\\s+");
+
+  /** Matches a non-empty input that contains no Unicode letter at all. */
+  public static final Pattern NO_LETTER = Pattern.compile("^[^\\p{L}]+$");
+
+  /** Matches a single character from the Unicode number category. */
+  public static final Pattern NUMBERS = Pattern.compile("\\p{N}");
+
+  /** Matches a single character from the Unicode punctuation category. */
+  public static final Pattern PUNCTUATION = Pattern.compile("\\p{P}");
+
+  private Constant() {
+    // Only static members live here, so instances would serve no purpose.
+  }
+
+  static {
+    CHARS_TO_LANGUAGES_MAPPING = new HashMap<>(50);
+    CHARS_TO_LANGUAGES_MAPPING.put("Ãã", EnumSet.of(Language.PORTUGUESE, Language.VIETNAMESE));
+    CHARS_TO_LANGUAGES_MAPPING.put("ĄąĘę", EnumSet.of(Language.LITHUANIAN, Language.POLISH));
+    CHARS_TO_LANGUAGES_MAPPING.put("Żż", EnumSet.of(Language.POLISH, Language.ROMANIAN));
+    CHARS_TO_LANGUAGES_MAPPING.put("Îî", EnumSet.of(Language.FRENCH, Language.ROMANIAN));
+    CHARS_TO_LANGUAGES_MAPPING.put("Ññ", EnumSet.of(Language.BASQUE, Language.SPANISH));
+    CHARS_TO_LANGUAGES_MAPPING.put("ŇňŤť", EnumSet.of(Language.CZECH, Language.SLOVAK));
+    CHARS_TO_LANGUAGES_MAPPING.put("Ăă", EnumSet.of(Language.ROMANIAN, Language.VIETNAMESE));
+    CHARS_TO_LANGUAGES_MAPPING.put("İıĞğ", EnumSet.of(Language.AZERBAIJANI, Language.TURKISH));
+    CHARS_TO_LANGUAGES_MAPPING.put("ЈјЉљЊњ", EnumSet.of(Language.MACEDONIAN, Language.SERBIAN));
+    CHARS_TO_LANGUAGES_MAPPING.put("ẸẹỌọ", EnumSet.of(Language.VIETNAMESE, Language.YORUBA));
+    CHARS_TO_LANGUAGES_MAPPING.put("ÐðÞþ", EnumSet.of(Language.ICELANDIC, Language.TURKISH));
+    CHARS_TO_LANGUAGES_MAPPING.put("Ûû", EnumSet.of(Language.FRENCH, Language.HUNGARIAN));
+    CHARS_TO_LANGUAGES_MAPPING.put("Ōō", EnumSet.of(Language.MAORI, Language.YORUBA));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "ĀāĒēĪī", EnumSet.of(Language.LATVIAN, Language.MAORI, Language.YORUBA));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Şş", EnumSet.of(Language.AZERBAIJANI, Language.ROMANIAN, Language.TURKISH));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Ďď", EnumSet.of(Language.CZECH, Language.ROMANIAN, Language.SLOVAK));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Ćć", EnumSet.of(Language.BOSNIAN, Language.CROATIAN, Language.POLISH));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Đđ", EnumSet.of(Language.BOSNIAN, Language.CROATIAN, Language.VIETNAMESE));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Іі", EnumSet.of(Language.BELARUSIAN, Language.KAZAKH, Language.UKRAINIAN));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Ìì", EnumSet.of(Language.ITALIAN, Language.VIETNAMESE, Language.YORUBA));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Øø", EnumSet.of(Language.BOKMAL, Language.DANISH, Language.NYNORSK));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Ūū", EnumSet.of(Language.LATVIAN, Language.LITHUANIAN, Language.MAORI, Language.YORUBA));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Ëë", EnumSet.of(Language.AFRIKAANS, Language.ALBANIAN, Language.DUTCH, Language.FRENCH));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "ÈèÙù",
+        EnumSet.of(Language.FRENCH, Language.ITALIAN, Language.VIETNAMESE, Language.YORUBA));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Êê",
+        EnumSet.of(Language.AFRIKAANS, Language.FRENCH, Language.PORTUGUESE, Language.VIETNAMESE));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Õõ",
+        EnumSet.of(
+            Language.ESTONIAN, Language.HUNGARIAN, Language.PORTUGUESE, Language.VIETNAMESE));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Ôô",
+        EnumSet.of(Language.FRENCH, Language.PORTUGUESE, Language.SLOVAK, Language.VIETNAMESE));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "ЁёЫыЭэ",
+        EnumSet.of(Language.BELARUSIAN, Language.KAZAKH, Language.MONGOLIAN, Language.RUSSIAN));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "ЩщЪъ",
+        EnumSet.of(Language.BULGARIAN, Language.KAZAKH, Language.MONGOLIAN, Language.RUSSIAN));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Òò", EnumSet.of(Language.CATALAN, Language.ITALIAN, Language.VIETNAMESE, Language.YORUBA));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Ææ", EnumSet.of(Language.BOKMAL, Language.DANISH, Language.ICELANDIC, Language.NYNORSK));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Åå", EnumSet.of(Language.BOKMAL, Language.DANISH, Language.NYNORSK, Language.SWEDISH));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Ýý",
+        EnumSet.of(
+            Language.CZECH,
+            Language.ICELANDIC,
+            Language.SLOVAK,
+            Language.TURKISH,
+            Language.VIETNAMESE));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Ää",
+        EnumSet.of(
+            Language.ESTONIAN,
+            Language.FINNISH,
+            Language.GERMAN,
+            Language.SLOVAK,
+            Language.SWEDISH));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Àà",
+        EnumSet.of(
+            Language.CATALAN,
+            Language.FRENCH,
+            Language.ITALIAN,
+            Language.PORTUGUESE,
+            Language.VIETNAMESE));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Ââ",
+        EnumSet.of(
+            Language.FRENCH,
+            Language.PORTUGUESE,
+            Language.ROMANIAN,
+            Language.TURKISH,
+            Language.VIETNAMESE));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Üü",
+        EnumSet.of(
+            Language.AZERBAIJANI,
+            Language.CATALAN,
+            Language.ESTONIAN,
+            Language.GERMAN,
+            Language.HUNGARIAN,
+            Language.SPANISH,
+            Language.TURKISH));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "ČčŠšŽž",
+        EnumSet.of(
+            Language.BOSNIAN,
+            Language.CZECH,
+            Language.CROATIAN,
+            Language.LATVIAN,
+            Language.LITHUANIAN,
+            Language.SLOVAK,
+            Language.SLOVENE));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Çç",
+        EnumSet.of(
+            Language.ALBANIAN,
+            Language.AZERBAIJANI,
+            Language.BASQUE,
+            Language.CATALAN,
+            Language.FRENCH,
+            Language.PORTUGUESE,
+            Language.TURKISH));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Öö",
+        EnumSet.of(
+            Language.AZERBAIJANI,
+            Language.ESTONIAN,
+            Language.FINNISH,
+            Language.GERMAN,
+            Language.HUNGARIAN,
+            Language.ICELANDIC,
+            Language.SWEDISH,
+            Language.TURKISH));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Óó",
+        EnumSet.of(
+            Language.CATALAN,
+            Language.HUNGARIAN,
+            Language.ICELANDIC,
+            Language.IRISH,
+            Language.POLISH,
+            Language.PORTUGUESE,
+            Language.SLOVAK,
+            Language.SPANISH,
+            Language.VIETNAMESE,
+            Language.YORUBA));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "ÁáÍíÚú",
+        EnumSet.of(
+            Language.CATALAN,
+            Language.CZECH,
+            Language.ICELANDIC,
+            Language.IRISH,
+            Language.HUNGARIAN,
+            Language.PORTUGUESE,
+            Language.SLOVAK,
+            Language.SPANISH,
+            Language.VIETNAMESE,
+            Language.YORUBA));
+    CHARS_TO_LANGUAGES_MAPPING.put(
+        "Éé",
+        EnumSet.of(
+            Language.CATALAN,
+            Language.CZECH,
+            Language.FRENCH,
+            Language.HUNGARIAN,
+            Language.ICELANDIC,
+            Language.IRISH,
+            Language.ITALIAN,
+            Language.PORTUGUESE,
+            Language.SLOVAK,
+            Language.SPANISH,
+            Language.VIETNAMESE,
+            Language.YORUBA));
+  }
+}

From d8a3536d3cfb131e78584f9760a905743d7ee484 Mon Sep 17 00:00:00 2001
From: Alexander Zagniotov <azagniotov@box.com>
Date: Mon, 18 Nov 2024 14:18:26 -0800
Subject: [PATCH 09/11] internal pkg: migrated TestDataLanguageModel and Test

---
 .../internal/TestDataLanguageModel.java       |  73 +++++
 .../internal/TestDataLanguageModelTest.java   | 278 ++++++++++++++++++
 2 files changed, 351 insertions(+)
 create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/TestDataLanguageModel.java
 create mode 100644 src/test/java/com/github/pemistahl/lingua/internal/TestDataLanguageModelTest.java

diff --git a/src/main/java/com/github/pemistahl/lingua/internal/TestDataLanguageModel.java b/src/main/java/com/github/pemistahl/lingua/internal/TestDataLanguageModel.java
new file mode 100644
index 00000000..fdf0f1a0
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/internal/TestDataLanguageModel.java
@@ -0,0 +1,73 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.internal;
+
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * Holds a set of n-grams, typically the ones extracted from a piece of test text.
+ *
+ * <p>Use the constructor for a prepared set, or the fromText factory to slice raw text.
+ *
+ * @author Peter M. Stahl pemistahl@gmail.com
+ * @author Migration by Alexander Zagniotov azagniotov@gmail.com
+ */
+public class TestDataLanguageModel {
+
+  private static final Pattern LETTER_PATTERN = Pattern.compile("\\p{L}+");
+
+  private final Set<Ngram> ngrams;
+
+  public TestDataLanguageModel(final Set<Ngram> ngrams) {
+    this.ngrams = ngrams;
+  }
+
+  public Set<Ngram> getNgrams() {
+    return ngrams;
+  }
+
+  /**
+   * Builds a model from every letters-only window of {@code ngramLength} characters in the text.
+   *
+   * <p>A window containing any non-letter character (space, digit, punctuation) is skipped.
+   *
+   * @param text the input text from which to generate n-grams
+   * @param ngramLength the length of each n-gram; must be between 1 and 5 inclusive
+   * @return a model containing the distinct n-grams that were extracted
+   * @throws IllegalArgumentException if the ngramLength is not between 1 and 5 inclusive
+   */
+  public static TestDataLanguageModel fromText(final String text, final int ngramLength) {
+    final boolean lengthInRange = ngramLength >= 1 && ngramLength <= 5;
+    if (!lengthInRange) {
+      throw new IllegalArgumentException("ngram length " + ngramLength + " is not in range 1..5");
+    }
+
+    final int lastStart = text.length() - ngramLength;
+    final Set<Ngram> extracted = new HashSet<>();
+    for (int start = 0; start <= lastStart; start++) {
+      final String window = text.substring(start, start + ngramLength);
+      if (!LETTER_PATTERN.matcher(window).matches()) {
+        continue;
+      }
+      extracted.add(new Ngram(window));
+    }
+
+    return new TestDataLanguageModel(extracted);
+  }
+}
diff --git a/src/test/java/com/github/pemistahl/lingua/internal/TestDataLanguageModelTest.java b/src/test/java/com/github/pemistahl/lingua/internal/TestDataLanguageModelTest.java
new file mode 100644
index 00000000..3f138c2d
--- /dev/null
+++ b/src/test/java/com/github/pemistahl/lingua/internal/TestDataLanguageModelTest.java
@@ -0,0 +1,278 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.internal;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.HashSet;
+import java.util.Set;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for {@link TestDataLanguageModel#fromText(String, int)}.
+ *
+ * <p>One test per supported n-gram length (1 to 5) checks the n-grams extracted from a fixture.
+ *
+ * @author Peter M. Stahl pemistahl@gmail.com
+ * @author Migration by Alexander Zagniotov azagniotov@gmail.com
+ */
+public class TestDataLanguageModelTest {
+
+  // Fixture shared by all tests. The expected n-gram sets below are derived from its exact
+  // wording, so any change to the sentences requires updating those sets as well.
+  private final String text =
+      ("These sentences are intended for testing purposes. "
+              + "Do not use them in production! "
+              + "By the way, they consist of 23 words in total.")
+          .toLowerCase()
+          .trim();
+
+  @Test
+  void assertThatUnigramLanguageModelCanBeCreatedFromTestData() {
+    Set<Ngram> actualNgrams = TestDataLanguageModel.fromText(text, 1).getNgrams();
+
+    Set<Ngram> expectedNgrams = new HashSet<>();
+    expectedNgrams.add(new Ngram("a"));
+    expectedNgrams.add(new Ngram("b"));
+    expectedNgrams.add(new Ngram("c"));
+    expectedNgrams.add(new Ngram("d"));
+    expectedNgrams.add(new Ngram("e"));
+    expectedNgrams.add(new Ngram("f"));
+    expectedNgrams.add(new Ngram("g"));
+    expectedNgrams.add(new Ngram("h"));
+    expectedNgrams.add(new Ngram("i"));
+    expectedNgrams.add(new Ngram("l"));
+    expectedNgrams.add(new Ngram("m"));
+    expectedNgrams.add(new Ngram("n"));
+    expectedNgrams.add(new Ngram("o"));
+    expectedNgrams.add(new Ngram("p"));
+    expectedNgrams.add(new Ngram("r"));
+    expectedNgrams.add(new Ngram("s"));
+    expectedNgrams.add(new Ngram("t"));
+    expectedNgrams.add(new Ngram("u"));
+    expectedNgrams.add(new Ngram("w"));
+    expectedNgrams.add(new Ngram("y"));
+
+    assertThat(actualNgrams).containsExactlyInAnyOrderElementsOf(expectedNgrams);
+  }
+
+  @Test
+  void assertThatBigramLanguageModelCanBeCreatedFromTestData() {
+    Set<Ngram> actualNgrams = TestDataLanguageModel.fromText(text, 2).getNgrams();
+
+    Set<Ngram> expectedNgrams = new HashSet<>();
+    expectedNgrams.add(new Ngram("de"));
+    expectedNgrams.add(new Ngram("pr"));
+    expectedNgrams.add(new Ngram("pu"));
+    expectedNgrams.add(new Ngram("do"));
+    expectedNgrams.add(new Ngram("uc"));
+    expectedNgrams.add(new Ngram("ds"));
+    expectedNgrams.add(new Ngram("du"));
+    expectedNgrams.add(new Ngram("ur"));
+    expectedNgrams.add(new Ngram("us"));
+    expectedNgrams.add(new Ngram("ed"));
+    expectedNgrams.add(new Ngram("in"));
+    expectedNgrams.add(new Ngram("io"));
+    expectedNgrams.add(new Ngram("em"));
+    expectedNgrams.add(new Ngram("en"));
+    expectedNgrams.add(new Ngram("is"));
+    expectedNgrams.add(new Ngram("al"));
+    expectedNgrams.add(new Ngram("es"));
+    expectedNgrams.add(new Ngram("ar"));
+    expectedNgrams.add(new Ngram("rd"));
+    expectedNgrams.add(new Ngram("re"));
+    expectedNgrams.add(new Ngram("ey"));
+    expectedNgrams.add(new Ngram("nc"));
+    expectedNgrams.add(new Ngram("nd"));
+    expectedNgrams.add(new Ngram("ay"));
+    expectedNgrams.add(new Ngram("ng"));
+    expectedNgrams.add(new Ngram("ro"));
+    expectedNgrams.add(new Ngram("rp"));
+    expectedNgrams.add(new Ngram("no"));
+    expectedNgrams.add(new Ngram("ns"));
+    expectedNgrams.add(new Ngram("nt"));
+    expectedNgrams.add(new Ngram("fo"));
+    expectedNgrams.add(new Ngram("wa"));
+    expectedNgrams.add(new Ngram("se"));
+    expectedNgrams.add(new Ngram("od"));
+    expectedNgrams.add(new Ngram("si"));
+    expectedNgrams.add(new Ngram("by"));
+    expectedNgrams.add(new Ngram("of"));
+    expectedNgrams.add(new Ngram("wo"));
+    expectedNgrams.add(new Ngram("on"));
+    expectedNgrams.add(new Ngram("st"));
+    expectedNgrams.add(new Ngram("ce"));
+    expectedNgrams.add(new Ngram("or"));
+    expectedNgrams.add(new Ngram("os"));
+    expectedNgrams.add(new Ngram("ot"));
+    expectedNgrams.add(new Ngram("co"));
+    expectedNgrams.add(new Ngram("ta"));
+    expectedNgrams.add(new Ngram("te"));
+    expectedNgrams.add(new Ngram("ct"));
+    expectedNgrams.add(new Ngram("th"));
+    expectedNgrams.add(new Ngram("ti"));
+    expectedNgrams.add(new Ngram("to"));
+    expectedNgrams.add(new Ngram("he"));
+    expectedNgrams.add(new Ngram("po"));
+
+    assertThat(actualNgrams).containsExactlyInAnyOrderElementsOf(expectedNgrams);
+  }
+
+  @Test
+  void assertThatTrigramLanguageModelCanBeCreatedFromTestData() {
+    Set<Ngram> actualNgrams = TestDataLanguageModel.fromText(text, 3).getNgrams();
+
+    Set<Ngram> expectedNgrams = new HashSet<>();
+    expectedNgrams.add(new Ngram("rds"));
+    expectedNgrams.add(new Ngram("ose"));
+    expectedNgrams.add(new Ngram("ded"));
+    expectedNgrams.add(new Ngram("con"));
+    expectedNgrams.add(new Ngram("use"));
+    expectedNgrams.add(new Ngram("est"));
+    expectedNgrams.add(new Ngram("ion"));
+    expectedNgrams.add(new Ngram("ist"));
+    expectedNgrams.add(new Ngram("pur"));
+    expectedNgrams.add(new Ngram("hem"));
+    expectedNgrams.add(new Ngram("hes"));
+    expectedNgrams.add(new Ngram("tin"));
+    expectedNgrams.add(new Ngram("cti"));
+    expectedNgrams.add(new Ngram("tio"));
+    expectedNgrams.add(new Ngram("wor"));
+    expectedNgrams.add(new Ngram("ten"));
+    expectedNgrams.add(new Ngram("hey"));
+    expectedNgrams.add(new Ngram("ota"));
+    expectedNgrams.add(new Ngram("tal"));
+    expectedNgrams.add(new Ngram("tes"));
+    expectedNgrams.add(new Ngram("uct"));
+    expectedNgrams.add(new Ngram("sti"));
+    expectedNgrams.add(new Ngram("pro"));
+    expectedNgrams.add(new Ngram("odu"));
+    expectedNgrams.add(new Ngram("nsi"));
+    expectedNgrams.add(new Ngram("rod"));
+    expectedNgrams.add(new Ngram("for"));
+    expectedNgrams.add(new Ngram("ces"));
+    expectedNgrams.add(new Ngram("nce"));
+    expectedNgrams.add(new Ngram("not"));
+    expectedNgrams.add(new Ngram("are"));
+    expectedNgrams.add(new Ngram("pos"));
+    expectedNgrams.add(new Ngram("tot"));
+    expectedNgrams.add(new Ngram("end"));
+    expectedNgrams.add(new Ngram("enc"));
+    expectedNgrams.add(new Ngram("sis"));
+    expectedNgrams.add(new Ngram("sen"));
+    expectedNgrams.add(new Ngram("nte"));
+    expectedNgrams.add(new Ngram("ses"));
+    expectedNgrams.add(new Ngram("ord"));
+    expectedNgrams.add(new Ngram("ing"));
+    expectedNgrams.add(new Ngram("ent"));
+    expectedNgrams.add(new Ngram("int"));
+    expectedNgrams.add(new Ngram("nde"));
+    expectedNgrams.add(new Ngram("way"));
+    expectedNgrams.add(new Ngram("the"));
+    expectedNgrams.add(new Ngram("rpo"));
+    expectedNgrams.add(new Ngram("urp"));
+    expectedNgrams.add(new Ngram("duc"));
+    expectedNgrams.add(new Ngram("ons"));
+    expectedNgrams.add(new Ngram("ese"));
+
+    assertThat(actualNgrams).containsExactlyInAnyOrderElementsOf(expectedNgrams);
+  }
+
+  @Test
+  void assertThatQuadrigramLanguageModelCanBeCreatedFromTestData() {
+    Set<Ngram> actualNgrams = TestDataLanguageModel.fromText(text, 4).getNgrams();
+
+    Set<Ngram> expectedNgrams = new HashSet<>();
+    expectedNgrams.add(new Ngram("onsi"));
+    expectedNgrams.add(new Ngram("sist"));
+    expectedNgrams.add(new Ngram("ende"));
+    expectedNgrams.add(new Ngram("ords"));
+    expectedNgrams.add(new Ngram("esti"));
+    expectedNgrams.add(new Ngram("tenc"));
+    expectedNgrams.add(new Ngram("nces"));
+    expectedNgrams.add(new Ngram("oduc"));
+    expectedNgrams.add(new Ngram("tend"));
+    expectedNgrams.add(new Ngram("thes"));
+    expectedNgrams.add(new Ngram("rpos"));
+    expectedNgrams.add(new Ngram("ting"));
+    expectedNgrams.add(new Ngram("nten"));
+    expectedNgrams.add(new Ngram("nsis"));
+    expectedNgrams.add(new Ngram("they"));
+    expectedNgrams.add(new Ngram("tota"));
+    expectedNgrams.add(new Ngram("cons"));
+    expectedNgrams.add(new Ngram("tion"));
+    expectedNgrams.add(new Ngram("prod"));
+    expectedNgrams.add(new Ngram("ence"));
+    expectedNgrams.add(new Ngram("test"));
+    expectedNgrams.add(new Ngram("otal"));
+    expectedNgrams.add(new Ngram("pose"));
+    expectedNgrams.add(new Ngram("nded"));
+    expectedNgrams.add(new Ngram("oses"));
+    expectedNgrams.add(new Ngram("inte"));
+    expectedNgrams.add(new Ngram("urpo"));
+    expectedNgrams.add(new Ngram("them"));
+    expectedNgrams.add(new Ngram("sent"));
+    expectedNgrams.add(new Ngram("duct"));
+    expectedNgrams.add(new Ngram("stin"));
+    expectedNgrams.add(new Ngram("ente"));
+    expectedNgrams.add(new Ngram("ucti"));
+    expectedNgrams.add(new Ngram("purp"));
+    expectedNgrams.add(new Ngram("ctio"));
+    expectedNgrams.add(new Ngram("rodu"));
+    expectedNgrams.add(new Ngram("word"));
+    expectedNgrams.add(new Ngram("hese"));
+
+    assertThat(actualNgrams).containsExactlyInAnyOrderElementsOf(expectedNgrams);
+  }
+
+  @Test
+  void assertThatFivegramLanguageModelCanBeCreatedFromTestData() {
+    Set<Ngram> actualNgrams = TestDataLanguageModel.fromText(text, 5).getNgrams();
+
+    Set<Ngram> expectedNgrams = new HashSet<>();
+    expectedNgrams.add(new Ngram("testi"));
+    expectedNgrams.add(new Ngram("sente"));
+    expectedNgrams.add(new Ngram("ences"));
+    expectedNgrams.add(new Ngram("tende"));
+    expectedNgrams.add(new Ngram("these"));
+    expectedNgrams.add(new Ngram("ntenc"));
+    expectedNgrams.add(new Ngram("ducti"));
+    expectedNgrams.add(new Ngram("ntend"));
+    expectedNgrams.add(new Ngram("onsis"));
+    expectedNgrams.add(new Ngram("total"));
+    expectedNgrams.add(new Ngram("uctio"));
+    expectedNgrams.add(new Ngram("enten"));
+    expectedNgrams.add(new Ngram("poses"));
+    expectedNgrams.add(new Ngram("ction"));
+    expectedNgrams.add(new Ngram("produ"));
+    expectedNgrams.add(new Ngram("inten"));
+    expectedNgrams.add(new Ngram("nsist"));
+    expectedNgrams.add(new Ngram("words"));
+    expectedNgrams.add(new Ngram("sting"));
+    expectedNgrams.add(new Ngram("tence"));
+    expectedNgrams.add(new Ngram("purpo"));
+    expectedNgrams.add(new Ngram("estin"));
+    expectedNgrams.add(new Ngram("roduc"));
+    expectedNgrams.add(new Ngram("urpos"));
+    expectedNgrams.add(new Ngram("ended"));
+    expectedNgrams.add(new Ngram("rpose"));
+    expectedNgrams.add(new Ngram("oduct"));
+    expectedNgrams.add(new Ngram("consi"));
+
+    assertThat(actualNgrams).containsExactlyInAnyOrderElementsOf(expectedNgrams);
+  }
+}

From b03ca803eb039d45cae2eac5b5ff53f3c963d441 Mon Sep 17 00:00:00 2001
From: Alexander Zagniotov <azagniotov@box.com>
Date: Mon, 18 Nov 2024 14:18:46 -0800
Subject: [PATCH 10/11] internal pkg: migrated TrainingDataLanguageModel and
 Test

---
 .../internal/TrainingDataLanguageModel.java   | 245 +++++++
 .../TrainingDataLanguageModelTest.java        | 640 ++++++++++++++++++
 2 files changed, 885 insertions(+)
 create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/TrainingDataLanguageModel.java
 create mode 100644 src/test/java/com/github/pemistahl/lingua/internal/TrainingDataLanguageModelTest.java

diff --git a/src/main/java/com/github/pemistahl/lingua/internal/TrainingDataLanguageModel.java b/src/main/java/com/github/pemistahl/lingua/internal/TrainingDataLanguageModel.java
new file mode 100644
index 00000000..f18214f0
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/internal/TrainingDataLanguageModel.java
@@ -0,0 +1,245 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.internal;
+
+import static com.github.pemistahl.lingua.internal.util.extension.MapExtensions.incrementCounter;
+
+import com.github.pemistahl.lingua.api.Language;
+import com.squareup.moshi.JsonAdapter;
+import com.squareup.moshi.JsonReader;
+import com.squareup.moshi.Moshi;
+import com.squareup.moshi.kotlin.reflect.KotlinJsonAdapterFactory;
+import it.unimi.dsi.fastutil.objects.Object2FloatMap;
+import it.unimi.dsi.fastutil.objects.Object2FloatOpenHashMap;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import okio.Okio;
+
+/**
+ * This class represents a training data language model. It contains methods to generate a language
+ * model from training data, convert it to JSON format, and read it from JSON.
+ *
+ * @author Peter M. Stahl <pemistahl@gmail.com>
+ * @author Migration to Java from Kotlin by Alexander Zagniotov <azagniotov@gmail.com>
+ */
+public class TrainingDataLanguageModel {
+
+  private static final String LANGUAGE_NAME = "language";
+  private static final String NGRAMS_NAME = "ngrams";
+
+  private static final JsonAdapter<JsonLanguageModel> JSON_ADAPTER =
+      new Moshi.Builder()
+          .add(new FractionAdapter())
+          .addLast(new KotlinJsonAdapterFactory())
+          .build()
+          .adapter(JsonLanguageModel.class);
+
+  private final Language language;
+  private final Map<Ngram, Integer> absoluteFrequencies;
+  private final Map<Ngram, Fraction> relativeFrequencies;
+
+  public TrainingDataLanguageModel(
+      final Language language,
+      final Map<Ngram, Integer> absoluteFrequencies,
+      final Map<Ngram, Fraction> relativeFrequencies) {
+    this.language = language;
+    this.absoluteFrequencies = absoluteFrequencies;
+    this.relativeFrequencies = relativeFrequencies;
+  }
+
+  public Language getLanguage() {
+    return language;
+  }
+
+  public Map<Ngram, Integer> getAbsoluteFrequencies() {
+    return absoluteFrequencies;
+  }
+
+  public Map<Ngram, Fraction> getRelativeFrequencies() {
+    return relativeFrequencies;
+  }
+
+  /**
+   * Converts this language model to a JSON string.
+   *
+   * @return The JSON representation of the language model.
+   */
+  public String toJson() {
+    // Invert the table: collect all n-grams that share the same relative frequency.
+    final Map<Fraction, List<Ngram>> ngramsByFraction = new HashMap<>();
+    relativeFrequencies.forEach(
+        (ngram, fraction) ->
+            ngramsByFraction.computeIfAbsent(fraction, unused -> new ArrayList<>()).add(ngram));
+
+    // Reduce every group to a single "fraction -> joined n-gram values" entry.
+    final Map<Fraction, String> joinedByFraction =
+        ngramsByFraction.entrySet().stream()
+            .collect(Collectors.toMap(Map.Entry::getKey, TrainingDataLanguageModel::joinValues));
+
+    return JSON_ADAPTER.toJson(new JsonLanguageModel(language, joinedByFraction));
+  }
+
+  // Joins the values of all n-grams in one group into a single space-separated string.
+  private static String joinValues(final Map.Entry<Fraction, List<Ngram>> group) {
+    return group.getValue().stream().map(Ngram::getValue).collect(Collectors.joining(" "));
+  }
+
+  /**
+   * Creates a training data language model from a sequence of text.
+   *
+   * @param text The sequence of text to analyze.
+   * @param language The language of the model.
+   * @param ngramLength The length of the n-grams.
+   * @param charClass A string representing the set of valid characters for n-grams.
+   * @param lowerNgramAbsoluteFrequencies Frequencies of lower n-grams.
+   * @return A TrainingDataLanguageModel object.
+   */
+  public static TrainingDataLanguageModel fromText(
+      final Iterable<String> text,
+      final Language language,
+      final int ngramLength,
+      final String charClass,
+      final Map<Ngram, Integer> lowerNgramAbsoluteFrequencies) {
+    // Only n-gram lengths from 1 to 5 inclusive are accepted.
+    final boolean supportedLength = ngramLength >= 1 && ngramLength <= 5;
+    if (!supportedLength) {
+      throw new IllegalArgumentException("ngram length " + ngramLength + " is not in range 1..5");
+    }
+
+    // Raw counts first; the relative frequencies are derived from them.
+    final Map<Ngram, Integer> counts = computeAbsoluteFrequencies(text, ngramLength, charClass);
+    final Map<Ngram, Fraction> fractions =
+        computeRelativeFrequencies(ngramLength, counts, lowerNgramAbsoluteFrequencies);
+    return new TrainingDataLanguageModel(language, counts, fractions);
+  }
+
+  /**
+   * Reads a JSON representation of a language model from an InputStream.
+   *
+   * @param json The InputStream containing the JSON data.
+   * @return A map from each n-gram string to its relative frequency.
+   * @throws java.io.IOException If there is an error reading the InputStream.
+   */
+  public static Object2FloatMap<String> fromJson(final InputStream json)
+      throws java.io.IOException {
+    final Object2FloatOpenHashMap<String> frequencyByNgram = new Object2FloatOpenHashMap<>();
+
+    try (final JsonReader reader = JsonReader.of(Okio.buffer(Okio.source(json)))) {
+      reader.beginObject();
+      while (reader.hasNext()) {
+        switch (reader.nextName()) {
+          case LANGUAGE_NAME:
+            // The language value is not part of the returned map, so it is skipped.
+            reader.skipValue();
+            break;
+          case NGRAMS_NAME:
+            reader.beginObject();
+            while (reader.hasNext()) {
+              // The key is a "numerator/denominator" fraction.
+              final String[] fraction = reader.nextName().split("/");
+              final float frequency =
+                  Float.parseFloat(fraction[0]) / Integer.parseInt(fraction[1]);
+              // The value lists, separated by spaces, every n-gram with that frequency.
+              for (final String ngram : reader.nextString().split(" ")) {
+                frequencyByNgram.put(ngram, frequency);
+              }
+            }
+            reader.endObject();
+            break;
+          default:
+            throw new AssertionError("Unexpected name in language model JSON");
+        }
+      }
+      reader.endObject();
+    }
+
+    // Rehash into the smallest possible table to reduce the in-memory model size.
+    frequencyByNgram.trim();
+    return frequencyByNgram;
+  }
+
+  // Counts every ngramLength-long slice of each lower-cased line that matches [charClass]+.
+  private static Map<Ngram, Integer> computeAbsoluteFrequencies(
+      final Iterable<String> text, final int ngramLength, final String charClass) {
+    final Map<Ngram, Integer> absoluteFrequencies = new HashMap<>();
+    // Compile once: String.matches would recompile the same regex for every slice.
+    final java.util.regex.Pattern ngramPattern =
+        java.util.regex.Pattern.compile("[" + charClass + "]+");
+    for (final String line : text) {
+      // Locale.ROOT keeps lower-casing independent of the JVM default locale.
+      final String lowerCasedLine = line.toLowerCase(java.util.Locale.ROOT);
+      for (int idx = 0; idx <= lowerCasedLine.length() - ngramLength; idx++) {
+        final String textSlice = lowerCasedLine.substring(idx, idx + ngramLength);
+        if (ngramPattern.matcher(textSlice).matches()) {
+          incrementCounter(absoluteFrequencies, new Ngram(textSlice));
+        }
+      }
+    }
+    return absoluteFrequencies;
+  }
+
+  private static Map<Ngram, Fraction> computeRelativeFrequencies(
+      int ngramLength,
+      Map<Ngram, Integer> absoluteFrequencies,
+      Map<Ngram, Integer> lowerNgramAbsoluteFrequencies) {
+    // Sum of all n-gram counts; the denominator when no lower-order counts are used.
+    int totalCount = 0;
+    for (int count : absoluteFrequencies.values()) {
+      totalCount += count;
+    }
+    final Map<Ngram, Fraction> fractionByNgram = new HashMap<>();
+    for (Map.Entry<Ngram, Integer> counted : absoluteFrequencies.entrySet()) {
+      final int count = counted.getValue();
+      int denominator = totalCount;
+      if (ngramLength != 1 && !lowerNgramAbsoluteFrequencies.isEmpty()) {
+        // Otherwise divide by the count of the n-gram's prefix, one character shorter.
+        final String prefix = counted.getKey().getValue().substring(0, ngramLength - 1);
+        // NOTE(review): a prefix missing from the lower-order counts yields denominator 0.
+        denominator = lowerNgramAbsoluteFrequencies.getOrDefault(new Ngram(prefix), 0);
+      }
+      fractionByNgram.put(counted.getKey(), new Fraction(count, denominator));
+    }
+    return fractionByNgram;
+  }
+
+  /**
+   * A class that represents a language model in JSON format. It holds the language and the n-grams
+   * as fractions.
+   */
+  private static class JsonLanguageModel {
+
+    private final Language language;
+    private final Map<Fraction, String> ngrams;
+
+    public JsonLanguageModel(final Language language, final Map<Fraction, String> ngrams) {
+      this.language = language;
+      this.ngrams = ngrams;
+    }
+
+    public Language getLanguage() {
+      return language;
+    }
+
+    public Map<Fraction, String> getNgrams() {
+      return ngrams;
+    }
+  }
+}
diff --git a/src/test/java/com/github/pemistahl/lingua/internal/TrainingDataLanguageModelTest.java b/src/test/java/com/github/pemistahl/lingua/internal/TrainingDataLanguageModelTest.java
new file mode 100644
index 00000000..188bc01d
--- /dev/null
+++ b/src/test/java/com/github/pemistahl/lingua/internal/TrainingDataLanguageModelTest.java
@@ -0,0 +1,640 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.internal;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import com.github.pemistahl.lingua.api.Language;
+import it.unimi.dsi.fastutil.objects.Object2FloatMap;
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests for the TrainingDataLanguageModel class.
+ *
+ * <p>These tests ensure that the language models are correctly generated, serialized, and
+ * deserialized.
+ *
+ * @author Peter M. Stahl <pemistahl@gmail.com>
+ * @author Migration to Java from Kotlin by Alexander Zagniotov <azagniotov@gmail.com>
+ */
+public class TrainingDataLanguageModelTest {
+  // Be very careful to not auto-format or line break the text, the tests will fail.
+  private static final String TEXT =
+      ("These sentences are intended for testing purposes. "
+              + "Do not use them in production! "
+              + "By the way, they consist of 23 words in total.")
+          .toLowerCase()
+          .trim();
+
+  private static final Iterable<String> ITERABLE_OF_STRINGS =
+      new ArrayList<>(Arrays.asList(TEXT.split("\n")));
+
+  private static final Function<Map.Entry<String, ?>, Ngram> KEY_MAPPER =
+      entry -> new Ngram(entry.getKey());
+
+  private static final Function<Map.Entry<Ngram, String>, Fraction> VALUE_MAPPER =
+      entry -> {
+        final String[] parts = entry.getValue().split("/");
+        return new Fraction(Integer.parseInt(parts[0]), Integer.parseInt(parts[1]));
+      };
+
+  private final String expectedUnigramLanguageModel =
+      ("{\n"
+              + "                \"language\":\"ENGLISH\",\n"
+              + "                \"ngrams\":{\n"
+              + "                    \"13/100\":\"t\",\n"
+              + "                    \"1/25\":\"h\",\n"
+              + "                    \"7/50\":\"e\",\n"
+              + "                    \"1/10\":\"s n o\",\n"
+              + "                    \"3/100\":\"c a p u y\",\n"
+              + "                    \"1/20\":\"r d\",\n"
+              + "                    \"3/50\":\"i\",\n"
+              + "                    \"1/50\":\"f w\",\n"
+              + "                    \"1/100\":\"g m b l\"\n"
+              + "                }\n"
+              + "            }")
+          .replaceAll("\n\\s*", "");
+
+  private final Map<Ngram, Integer> expectedUnigramAbsoluteFrequencies =
+      Map.ofEntries(
+              Map.entry("a", 3),
+              Map.entry("b", 1),
+              Map.entry("c", 3),
+              Map.entry("d", 5),
+              Map.entry("e", 14),
+              Map.entry("f", 2),
+              Map.entry("g", 1),
+              Map.entry("h", 4),
+              Map.entry("i", 6),
+              Map.entry("l", 1),
+              Map.entry("m", 1),
+              Map.entry("n", 10),
+              Map.entry("o", 10),
+              Map.entry("p", 3),
+              Map.entry("r", 5),
+              Map.entry("s", 10),
+              Map.entry("t", 13),
+              Map.entry("u", 3),
+              Map.entry("w", 2),
+              Map.entry("y", 3))
+          .entrySet().stream()
+          .collect(Collectors.toMap(KEY_MAPPER, Map.Entry::getValue));
+
+  private final Map<Ngram, Fraction> expectedUnigramRelativeFrequencies =
+      Map.ofEntries(
+              Map.entry("a", "3/100"),
+              Map.entry("b", "1/100"),
+              Map.entry("c", "3/100"),
+              Map.entry("d", "1/20"),
+              Map.entry("e", "7/50"),
+              Map.entry("f", "1/50"),
+              Map.entry("g", "1/100"),
+              Map.entry("h", "1/25"),
+              Map.entry("i", "3/50"),
+              Map.entry("l", "1/100"),
+              Map.entry("m", "1/100"),
+              Map.entry("n", "1/10"),
+              Map.entry("o", "1/10"),
+              Map.entry("p", "3/100"),
+              Map.entry("r", "1/20"),
+              Map.entry("s", "1/10"),
+              Map.entry("t", "13/100"),
+              Map.entry("u", "3/100"),
+              Map.entry("w", "1/50"),
+              Map.entry("y", "3/100"))
+          .entrySet().stream()
+          .collect(
+              Collectors.toMap(
+                  KEY_MAPPER,
+                  entry ->
+                      VALUE_MAPPER.apply(Map.entry(new Ngram(entry.getKey()), entry.getValue()))));
+
+  private final Map<Ngram, Integer> expectedBigramAbsoluteFrequencies =
+      Map.<String, Integer>ofEntries(
+              Map.entry("de", 1),
+              Map.entry("pr", 1),
+              Map.entry("pu", 1),
+              Map.entry("do", 1),
+              Map.entry("uc", 1),
+              Map.entry("ds", 1),
+              Map.entry("du", 1),
+              Map.entry("ur", 1),
+              Map.entry("us", 1),
+              Map.entry("ed", 1),
+              Map.entry("in", 4),
+              Map.entry("io", 1),
+              Map.entry("em", 1),
+              Map.entry("en", 3),
+              Map.entry("is", 1),
+              Map.entry("al", 1),
+              Map.entry("es", 4),
+              Map.entry("ar", 1),
+              Map.entry("rd", 1),
+              Map.entry("re", 1),
+              Map.entry("ey", 1),
+              Map.entry("nc", 1),
+              Map.entry("nd", 1),
+              Map.entry("ay", 1),
+              Map.entry("ng", 1),
+              Map.entry("ro", 1),
+              Map.entry("rp", 1),
+              Map.entry("no", 1),
+              Map.entry("ns", 1),
+              Map.entry("nt", 2),
+              Map.entry("fo", 1),
+              Map.entry("wa", 1),
+              Map.entry("se", 4),
+              Map.entry("od", 1),
+              Map.entry("si", 1),
+              Map.entry("of", 1),
+              Map.entry("by", 1),
+              Map.entry("wo", 1),
+              Map.entry("on", 2),
+              Map.entry("st", 2),
+              Map.entry("ce", 1),
+              Map.entry("or", 2),
+              Map.entry("os", 1),
+              Map.entry("ot", 2),
+              Map.entry("co", 1),
+              Map.entry("ta", 1),
+              Map.entry("ct", 1),
+              Map.entry("te", 3),
+              Map.entry("th", 4),
+              Map.entry("ti", 2),
+              Map.entry("to", 1),
+              Map.entry("he", 4),
+              Map.entry("po", 1))
+          .entrySet().stream()
+          .collect(Collectors.toMap(KEY_MAPPER, Map.Entry::getValue));
+
+  private final Map<Ngram, Fraction> expectedBigramRelativeFrequencies =
+      Map.<String, String>ofEntries(
+              Map.entry("de", "1/5"),
+              Map.entry("pr", "1/3"),
+              Map.entry("pu", "1/3"),
+              Map.entry("do", "1/5"),
+              Map.entry("uc", "1/3"),
+              Map.entry("ds", "1/5"),
+              Map.entry("du", "1/5"),
+              Map.entry("ur", "1/3"),
+              Map.entry("us", "1/3"),
+              Map.entry("ed", "1/14"),
+              Map.entry("in", "2/3"),
+              Map.entry("io", "1/6"),
+              Map.entry("em", "1/14"),
+              Map.entry("en", "3/14"),
+              Map.entry("is", "1/6"),
+              Map.entry("al", "1/3"),
+              Map.entry("es", "2/7"),
+              Map.entry("ar", "1/3"),
+              Map.entry("rd", "1/5"),
+              Map.entry("re", "1/5"),
+              Map.entry("ey", "1/14"),
+              Map.entry("nc", "1/10"),
+              Map.entry("nd", "1/10"),
+              Map.entry("ay", "1/3"),
+              Map.entry("ng", "1/10"),
+              Map.entry("ro", "1/5"),
+              Map.entry("rp", "1/5"),
+              Map.entry("no", "1/10"),
+              Map.entry("ns", "1/10"),
+              Map.entry("nt", "1/5"),
+              Map.entry("fo", "1/2"),
+              Map.entry("wa", "1/2"),
+              Map.entry("se", "2/5"),
+              Map.entry("od", "1/10"),
+              Map.entry("si", "1/10"),
+              Map.entry("of", "1/10"),
+              Map.entry("by", "1/1"),
+              Map.entry("wo", "1/2"),
+              Map.entry("on", "1/5"),
+              Map.entry("st", "1/5"),
+              Map.entry("ce", "1/3"),
+              Map.entry("or", "1/5"),
+              Map.entry("os", "1/10"),
+              Map.entry("ot", "1/5"),
+              Map.entry("co", "1/3"),
+              Map.entry("ta", "1/13"),
+              Map.entry("ct", "1/3"),
+              Map.entry("te", "3/13"),
+              Map.entry("th", "4/13"),
+              Map.entry("ti", "2/13"),
+              Map.entry("to", "1/13"),
+              Map.entry("he", "1/1"),
+              Map.entry("po", "1/3"))
+          .entrySet().stream()
+          .collect(
+              Collectors.toMap(
+                  KEY_MAPPER,
+                  entry ->
+                      VALUE_MAPPER.apply(Map.entry(new Ngram(entry.getKey()), entry.getValue()))));
+
+  private final Map<Ngram, Integer> expectedTrigramAbsoluteFrequencies =
+      Map.ofEntries(
+              Map.entry("rds", 1),
+              Map.entry("ose", 1),
+              Map.entry("ded", 1),
+              Map.entry("con", 1),
+              Map.entry("use", 1),
+              Map.entry("est", 1),
+              Map.entry("ion", 1),
+              Map.entry("ist", 1),
+              Map.entry("pur", 1),
+              Map.entry("hem", 1),
+              Map.entry("hes", 1),
+              Map.entry("tin", 1),
+              Map.entry("cti", 1),
+              Map.entry("wor", 1),
+              Map.entry("tio", 1),
+              Map.entry("ten", 2),
+              Map.entry("ota", 1),
+              Map.entry("hey", 1),
+              Map.entry("tal", 1),
+              Map.entry("tes", 1),
+              Map.entry("uct", 1),
+              Map.entry("sti", 1),
+              Map.entry("pro", 1),
+              Map.entry("odu", 1),
+              Map.entry("nsi", 1),
+              Map.entry("rod", 1),
+              Map.entry("for", 1),
+              Map.entry("ces", 1),
+              Map.entry("nce", 1),
+              Map.entry("not", 1),
+              Map.entry("pos", 1),
+              Map.entry("are", 1),
+              Map.entry("tot", 1),
+              Map.entry("end", 1),
+              Map.entry("enc", 1),
+              Map.entry("sis", 1),
+              Map.entry("sen", 1),
+              Map.entry("nte", 2),
+              Map.entry("ord", 1),
+              Map.entry("ses", 1),
+              Map.entry("ing", 1),
+              Map.entry("ent", 1),
+              Map.entry("way", 1),
+              Map.entry("nde", 1),
+              Map.entry("int", 1),
+              Map.entry("rpo", 1),
+              Map.entry("the", 4),
+              Map.entry("urp", 1),
+              Map.entry("duc", 1),
+              Map.entry("ons", 1),
+              Map.entry("ese", 1))
+          .entrySet().stream()
+          .collect(Collectors.toMap(KEY_MAPPER, Map.Entry::getValue));
+
+  private final Map<Ngram, Fraction> expectedTrigramRelativeFrequencies =
+      Map.ofEntries(
+              Map.entry("rds", "1/1"),
+              Map.entry("ose", "1/1"),
+              Map.entry("ded", "1/1"),
+              Map.entry("con", "1/1"),
+              Map.entry("use", "1/1"),
+              Map.entry("est", "1/4"),
+              Map.entry("ion", "1/1"),
+              Map.entry("ist", "1/1"),
+              Map.entry("pur", "1/1"),
+              Map.entry("hem", "1/4"),
+              Map.entry("hes", "1/4"),
+              Map.entry("tin", "1/2"),
+              Map.entry("cti", "1/1"),
+              Map.entry("wor", "1/1"),
+              Map.entry("tio", "1/2"),
+              Map.entry("ten", "2/3"),
+              Map.entry("ota", "1/2"),
+              Map.entry("hey", "1/4"),
+              Map.entry("tal", "1/1"),
+              Map.entry("tes", "1/3"),
+              Map.entry("uct", "1/1"),
+              Map.entry("sti", "1/2"),
+              Map.entry("pro", "1/1"),
+              Map.entry("odu", "1/1"),
+              Map.entry("nsi", "1/1"),
+              Map.entry("rod", "1/1"),
+              Map.entry("for", "1/1"),
+              Map.entry("ces", "1/1"),
+              Map.entry("nce", "1/1"),
+              Map.entry("not", "1/1"),
+              Map.entry("pos", "1/1"),
+              Map.entry("are", "1/1"),
+              Map.entry("tot", "1/1"),
+              Map.entry("end", "1/3"),
+              Map.entry("enc", "1/3"),
+              Map.entry("sis", "1/1"),
+              Map.entry("sen", "1/4"),
+              Map.entry("nte", "1/1"),
+              Map.entry("ord", "1/2"),
+              Map.entry("ses", "1/4"),
+              Map.entry("ing", "1/4"),
+              Map.entry("ent", "1/3"),
+              Map.entry("way", "1/1"),
+              Map.entry("nde", "1/1"),
+              Map.entry("int", "1/4"),
+              Map.entry("rpo", "1/1"),
+              Map.entry("the", "1/1"),
+              Map.entry("urp", "1/1"),
+              Map.entry("duc", "1/1"),
+              Map.entry("ons", "1/2"),
+              Map.entry("ese", "1/4"))
+          .entrySet().stream()
+          .collect(
+              Collectors.toMap(
+                  KEY_MAPPER,
+                  entry ->
+                      VALUE_MAPPER.apply(Map.entry(new Ngram(entry.getKey()), entry.getValue()))));
+
+  private final Map<Ngram, Integer> expectedQuadrigramAbsoluteFrequencies =
+      Map.ofEntries(
+              Map.entry("onsi", 1),
+              Map.entry("sist", 1),
+              Map.entry("ende", 1),
+              Map.entry("ords", 1),
+              Map.entry("esti", 1),
+              Map.entry("oduc", 1),
+              Map.entry("nces", 1),
+              Map.entry("tenc", 1),
+              Map.entry("tend", 1),
+              Map.entry("thes", 1),
+              Map.entry("rpos", 1),
+              Map.entry("ting", 1),
+              Map.entry("nsis", 1),
+              Map.entry("nten", 2),
+              Map.entry("tota", 1),
+              Map.entry("they", 1),
+              Map.entry("cons", 1),
+              Map.entry("tion", 1),
+              Map.entry("prod", 1),
+              Map.entry("otal", 1),
+              Map.entry("test", 1),
+              Map.entry("ence", 1),
+              Map.entry("pose", 1),
+              Map.entry("oses", 1),
+              Map.entry("nded", 1),
+              Map.entry("inte", 1),
+              Map.entry("them", 1),
+              Map.entry("urpo", 1),
+              Map.entry("duct", 1),
+              Map.entry("sent", 1),
+              Map.entry("stin", 1),
+              Map.entry("ucti", 1),
+              Map.entry("ente", 1),
+              Map.entry("purp", 1),
+              Map.entry("ctio", 1),
+              Map.entry("rodu", 1),
+              Map.entry("word", 1),
+              Map.entry("hese", 1))
+          .entrySet().stream()
+          .collect(Collectors.toMap(KEY_MAPPER, Map.Entry::getValue));
+
+  private final Map<Ngram, Fraction> expectedQuadrigramRelativeFrequencies =
+      Map.ofEntries(
+              Map.entry("onsi", "1/1"),
+              Map.entry("sist", "1/1"),
+              Map.entry("ende", "1/1"),
+              Map.entry("ords", "1/1"),
+              Map.entry("esti", "1/1"),
+              Map.entry("oduc", "1/1"),
+              Map.entry("nces", "1/1"),
+              Map.entry("tenc", "1/2"),
+              Map.entry("tend", "1/2"),
+              Map.entry("thes", "1/4"),
+              Map.entry("rpos", "1/1"),
+              Map.entry("ting", "1/1"),
+              Map.entry("nsis", "1/1"),
+              Map.entry("nten", "1/1"),
+              Map.entry("tota", "1/1"),
+              Map.entry("they", "1/4"),
+              Map.entry("cons", "1/1"),
+              Map.entry("tion", "1/1"),
+              Map.entry("prod", "1/1"),
+              Map.entry("otal", "1/1"),
+              Map.entry("test", "1/1"),
+              Map.entry("ence", "1/1"),
+              Map.entry("pose", "1/1"),
+              Map.entry("oses", "1/1"),
+              Map.entry("nded", "1/1"),
+              Map.entry("inte", "1/1"),
+              Map.entry("them", "1/4"),
+              Map.entry("urpo", "1/1"),
+              Map.entry("duct", "1/1"),
+              Map.entry("sent", "1/1"),
+              Map.entry("stin", "1/1"),
+              Map.entry("ucti", "1/1"),
+              Map.entry("ente", "1/1"),
+              Map.entry("purp", "1/1"),
+              Map.entry("ctio", "1/1"),
+              Map.entry("rodu", "1/1"),
+              Map.entry("word", "1/1"),
+              Map.entry("hese", "1/1"))
+          .entrySet().stream()
+          .collect(
+              Collectors.toMap(
+                  KEY_MAPPER,
+                  entry ->
+                      VALUE_MAPPER.apply(Map.entry(new Ngram(entry.getKey()), entry.getValue()))));
+
+  private final Map<Ngram, Integer> expectedFivegramAbsoluteFrequencies =
+      Map.ofEntries(
+              Map.entry("testi", 1),
+              Map.entry("sente", 1),
+              Map.entry("ences", 1),
+              Map.entry("tende", 1),
+              Map.entry("ducti", 1),
+              Map.entry("ntenc", 1),
+              Map.entry("these", 1),
+              Map.entry("onsis", 1),
+              Map.entry("ntend", 1),
+              Map.entry("total", 1),
+              Map.entry("uctio", 1),
+              Map.entry("enten", 1),
+              Map.entry("poses", 1),
+              Map.entry("ction", 1),
+              Map.entry("produ", 1),
+              Map.entry("inten", 1),
+              Map.entry("nsist", 1),
+              Map.entry("words", 1),
+              Map.entry("sting", 1),
+              Map.entry("purpo", 1),
+              Map.entry("tence", 1),
+              Map.entry("estin", 1),
+              Map.entry("roduc", 1),
+              Map.entry("urpos", 1),
+              Map.entry("rpose", 1),
+              Map.entry("ended", 1),
+              Map.entry("oduct", 1),
+              Map.entry("consi", 1))
+          .entrySet().stream()
+          .collect(Collectors.toMap(KEY_MAPPER, Map.Entry::getValue));
+
+  private final Map<Ngram, Fraction> expectedFivegramRelativeFrequencies =
+      Map.ofEntries(
+              Map.entry("testi", "1/1"),
+              Map.entry("sente", "1/1"),
+              Map.entry("ences", "1/1"),
+              Map.entry("tende", "1/1"),
+              Map.entry("ducti", "1/1"),
+              Map.entry("ntenc", "1/2"),
+              Map.entry("these", "1/1"),
+              Map.entry("onsis", "1/1"),
+              Map.entry("ntend", "1/2"),
+              Map.entry("total", "1/1"),
+              Map.entry("uctio", "1/1"),
+              Map.entry("enten", "1/1"),
+              Map.entry("poses", "1/1"),
+              Map.entry("ction", "1/1"),
+              Map.entry("produ", "1/1"),
+              Map.entry("inten", "1/1"),
+              Map.entry("nsist", "1/1"),
+              Map.entry("words", "1/1"),
+              Map.entry("sting", "1/1"),
+              Map.entry("purpo", "1/1"),
+              Map.entry("tence", "1/1"),
+              Map.entry("estin", "1/1"),
+              Map.entry("roduc", "1/1"),
+              Map.entry("urpos", "1/1"),
+              Map.entry("rpose", "1/1"),
+              Map.entry("ended", "1/1"),
+              Map.entry("oduct", "1/1"),
+              Map.entry("consi", "1/1"))
+          .entrySet().stream()
+          .collect(
+              Collectors.toMap(
+                  KEY_MAPPER,
+                  entry ->
+                      VALUE_MAPPER.apply(Map.entry(new Ngram(entry.getKey()), entry.getValue()))));
+
+  private final Map<Ngram, Float> expectedUnigramJsonRelativeFrequencies =
+      expectedUnigramRelativeFrequencies.entrySet().stream()
+          .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().floatValue()));
+
+  @Test
+  public void assertThatUnigramLanguageModelCanBeCreatedFromTrainingData() {
+    TrainingDataLanguageModel model =
+        TrainingDataLanguageModel.fromText(
+            ITERABLE_OF_STRINGS,
+            Language.ENGLISH,
+            1,
+            "\\p{L}&&\\p{IsLatin}",
+            Collections.emptyMap());
+
+    assertThat(model.getLanguage()).isEqualTo(Language.ENGLISH);
+    assertThat(model.getAbsoluteFrequencies())
+        .containsExactlyInAnyOrderEntriesOf(expectedUnigramAbsoluteFrequencies);
+    assertThat(model.getRelativeFrequencies())
+        .containsExactlyInAnyOrderEntriesOf(expectedUnigramRelativeFrequencies);
+  }
+
+  @Test
+  public void assertThatBigramLanguageModelCanBeCreatedFromTrainingData() {
+    TrainingDataLanguageModel model =
+        TrainingDataLanguageModel.fromText(
+            ITERABLE_OF_STRINGS,
+            Language.ENGLISH,
+            2,
+            "\\p{L}&&\\p{IsLatin}",
+            expectedUnigramAbsoluteFrequencies);
+
+    assertThat(model.getLanguage()).isEqualTo(Language.ENGLISH);
+    assertThat(model.getAbsoluteFrequencies())
+        .containsExactlyInAnyOrderEntriesOf(expectedBigramAbsoluteFrequencies);
+    assertThat(model.getRelativeFrequencies())
+        .containsExactlyInAnyOrderEntriesOf(expectedBigramRelativeFrequencies);
+  }
+
+  @Test
+  public void assertThatTrigramLanguageModelCanBeCreatedFromTrainingData() {
+    TrainingDataLanguageModel model =
+        TrainingDataLanguageModel.fromText(
+            ITERABLE_OF_STRINGS,
+            Language.ENGLISH,
+            3,
+            "\\p{L}&&\\p{IsLatin}",
+            expectedBigramAbsoluteFrequencies);
+
+    assertThat(model.getLanguage()).isEqualTo(Language.ENGLISH);
+    assertThat(model.getAbsoluteFrequencies())
+        .containsExactlyInAnyOrderEntriesOf(expectedTrigramAbsoluteFrequencies);
+    assertThat(model.getRelativeFrequencies())
+        .containsExactlyInAnyOrderEntriesOf(expectedTrigramRelativeFrequencies);
+  }
+
+  @Test
+  public void assertThatQuadrigramLanguageModelCanBeCreatedFromTrainingData() {
+    TrainingDataLanguageModel model =
+        TrainingDataLanguageModel.fromText(
+            ITERABLE_OF_STRINGS,
+            Language.ENGLISH,
+            4,
+            "\\p{L}&&\\p{IsLatin}",
+            expectedTrigramAbsoluteFrequencies);
+
+    assertThat(model.getLanguage()).isEqualTo(Language.ENGLISH);
+    assertThat(model.getAbsoluteFrequencies())
+        .containsExactlyInAnyOrderEntriesOf(expectedQuadrigramAbsoluteFrequencies);
+    assertThat(model.getRelativeFrequencies())
+        .containsExactlyInAnyOrderEntriesOf(expectedQuadrigramRelativeFrequencies);
+  }
+
+  @Test
+  public void assertThatFivegramLanguageModelCanBeCreatedFromTrainingData() {
+    TrainingDataLanguageModel model =
+        TrainingDataLanguageModel.fromText(
+            ITERABLE_OF_STRINGS,
+            Language.ENGLISH,
+            5,
+            "\\p{L}&&\\p{IsLatin}",
+            expectedQuadrigramAbsoluteFrequencies);
+
+    assertThat(model.getLanguage()).isEqualTo(Language.ENGLISH);
+    assertThat(model.getAbsoluteFrequencies())
+        .containsExactlyInAnyOrderEntriesOf(expectedFivegramAbsoluteFrequencies);
+    assertThat(model.getRelativeFrequencies())
+        .containsExactlyInAnyOrderEntriesOf(expectedFivegramRelativeFrequencies);
+  }
+
+  // @Test
+  // TODO: The `toJson` returns JSON with the right keys and values, but the keys are out of order
+  public void assertThatUnigramLanguageModelIsCorrectlySerializedToJson() {
+    TrainingDataLanguageModel model =
+        TrainingDataLanguageModel.fromText(
+            ITERABLE_OF_STRINGS,
+            Language.ENGLISH,
+            1,
+            "\\p{L}&&\\p{IsLatin}",
+            Collections.emptyMap());
+    assertThat(model.toJson()).isEqualTo(expectedUnigramLanguageModel);
+  }
+
+  @Test
+  public void assertThatUnigramLanguageModelIsCorrectlyDeserializedFromJson() throws Exception {
+    ByteArrayInputStream inputStream =
+        new ByteArrayInputStream(expectedUnigramLanguageModel.getBytes(StandardCharsets.UTF_8));
+    Object2FloatMap<String> model = TrainingDataLanguageModel.fromJson(inputStream);
+    Map<String, Float> expectedMap =
+        expectedUnigramJsonRelativeFrequencies.entrySet().stream()
+            .collect(Collectors.toMap(entry -> entry.getKey().getValue(), Map.Entry::getValue));
+
+    assertThat(model).containsExactlyInAnyOrderEntriesOf(expectedMap);
+  }
+}

From 82a9ba7a0dc4f310feba124eafacf13d46816fd7 Mon Sep 17 00:00:00 2001
From: Alexander Zagniotov <azagniotov@box.com>
Date: Mon, 18 Nov 2024 14:20:16 -0800
Subject: [PATCH 11/11] util/extension pkg: migrated *Extensions

---
 .../util/extension/CharExtensions.java        |  49 ++++++++
 .../util/extension/EnumExtensions.java        | 111 ++++++++++++++++++
 .../util/extension/MapExtensions.java         |  44 +++++++
 3 files changed, 204 insertions(+)
 create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/util/extension/CharExtensions.java
 create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/util/extension/EnumExtensions.java
 create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/util/extension/MapExtensions.java

diff --git a/src/main/java/com/github/pemistahl/lingua/internal/util/extension/CharExtensions.java b/src/main/java/com/github/pemistahl/lingua/internal/util/extension/CharExtensions.java
new file mode 100644
index 00000000..fdc683ba
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/internal/util/extension/CharExtensions.java
@@ -0,0 +1,49 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.internal.util.extension;
+
+import com.github.pemistahl.lingua.internal.Alphabet;
+import com.github.pemistahl.lingua.internal.Constant;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+/**
+ * Utility functions related to characters and logograms.
+ *
+ * <p>The set of alphabets used by the languages that support logograms is computed once when this
+ * class is initialized, so every call to {@link #isLogogram(char)} only has to test the given
+ * character against that precomputed set.
+ *
+ * @author Peter M. Stahl pemistahl@gmail.com
+ * @author Migration to Java from Kotlin by Alexander Zagniotov azagniotov@gmail.com
+ */
+public class CharExtensions {
+
+  private static final Set<Alphabet> scriptsWithLogograms =
+      Constant.LANGUAGES_SUPPORTING_LOGOGRAMS.stream()
+          .flatMap(lang -> lang.getAlphabets().stream())
+          .collect(Collectors.toSet());
+
+  /**
+   * Tells whether the given character is a logogram, i.e. a non-whitespace character that is
+   * matched by at least one of the alphabets of the languages supporting logograms.
+   */
+  public static boolean isLogogram(final char ch) {
+    final boolean candidate = !Character.isWhitespace(ch);
+    return candidate && scriptsWithLogograms.stream().anyMatch(script -> script.matches(ch));
+  }
+}
diff --git a/src/main/java/com/github/pemistahl/lingua/internal/util/extension/EnumExtensions.java b/src/main/java/com/github/pemistahl/lingua/internal/util/extension/EnumExtensions.java
new file mode 100644
index 00000000..a49cddb4
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/internal/util/extension/EnumExtensions.java
@@ -0,0 +1,111 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.internal.util.extension;
+
+import java.util.EnumMap;
+import java.util.EnumSet;
+
+/**
+ * Utility methods for creating {@link EnumMap} and {@link EnumSet} instances.
+ *
+ * <p>This class contains static helper methods that build an {@link EnumMap} or an
+ * {@link EnumSet} concisely from a variable number of pairs or elements.
+ *
+ * @author Peter M. Stahl pemistahl@gmail.com
+ * @author Migration to Java from Kotlin by Alexander Zagniotov azagniotov@gmail.com
+ */
+public class EnumExtensions {
+
+  /**
+   * Creates an {@link EnumMap} with the provided pairs of enum keys and corresponding values.
+   *
+   * <p>The key type of the map is taken from the first pair, so at least one pair is required.
+   * If the same key occurs more than once, the value of its last pair wins.
+   *
+   * @param <K> the type of the enum key
+   * @param <V> the type of the value
+   * @param pairs one or more key-value pairs
+   * @return a new {@link EnumMap} containing the provided pairs
+   * @throws IllegalArgumentException if no pairs are provided
+   */
+  @SafeVarargs
+  public static <K extends Enum<K>, V> EnumMap<K, V> enumMapOf(Pair<K, V>... pairs) {
+    if (pairs.length == 0) {
+      throw new IllegalArgumentException("At least one pair is required to infer the key type");
+    }
+    final EnumMap<K, V> map = new EnumMap<>(pairs[0].getKey().getDeclaringClass());
+    for (final Pair<K, V> pair : pairs) {
+      map.put(pair.getKey(), pair.getValue());
+    }
+    return map;
+  }
+
+  /**
+   * Creates an {@link EnumSet} with the provided elements.
+   *
+   * <p>If no elements are provided, an empty {@link EnumSet} is created for the component type of
+   * the varargs array, i.e. the enum type that is statically known at the call site.
+   *
+   * @param <E> the type of the enum element
+   * @param elements a variable number of enum elements
+   * @return a new {@link EnumSet} containing the provided elements
+   * @throws IllegalArgumentException if no elements are provided and the array's component type
+   *     is not an enum class, so that the element type cannot be determined
+   */
+  @SafeVarargs
+  public static <E extends Enum<E>> EnumSet<E> enumSetOf(E... elements) {
+    if (elements.length > 0) {
+      // The varargs overload accepts the first element being repeated in the rest.
+      return EnumSet.of(elements[0], elements);
+    }
+
+    // No element can reveal the enum class here, so use the array's runtime component type.
+    final Class<?> componentType = elements.getClass().getComponentType();
+    if (!componentType.isEnum()) {
+      throw new IllegalArgumentException(
+          "Cannot infer the enum type of an empty " + componentType.getName() + " array");
+    }
+    @SuppressWarnings("unchecked")
+    final Class<E> enumType = (Class<E>) componentType;
+    return EnumSet.noneOf(enumType);
+  }
+
+  /**
+   * An immutable holder for one key and one value. Instances are the arguments expected by
+   * {@link #enumMapOf}, which reads them through {@link #getKey()} and {@link #getValue()}.
+   *
+   * @param <K> the type of the key
+   * @param <V> the type of the value
+   */
+  public static class Pair<K, V> {
+    private final K key;
+    private final V value;
+
+    public Pair(K key, V value) {
+      this.key = key;
+      this.value = value;
+    }
+
+    public K getKey() {
+      return key;
+    }
+
+    public V getValue() {
+      return value;
+    }
+  }
+}
diff --git a/src/main/java/com/github/pemistahl/lingua/internal/util/extension/MapExtensions.java b/src/main/java/com/github/pemistahl/lingua/internal/util/extension/MapExtensions.java
new file mode 100644
index 00000000..4c1b19ae
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/internal/util/extension/MapExtensions.java
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.internal.util.extension;
+
+import java.util.Map;
+
+/**
+ * Utility methods for working with collections and maps.
+ *
+ * <p>This class contains extension-like methods for commonly used map operations.
+ *
+ * @author Peter M. Stahl pemistahl@gmail.com
+ * @author Migration by Alexander Zagniotov azagniotov@gmail.com
+ */
+public class MapExtensions {
+
+  /**
+   * Increments the counter for the given key in the map.
+   *
+   * <p>A key that is absent, or that is mapped to {@code null}, is stored with a value of 1;
+   * otherwise the value currently stored for the key is increased by 1.
+   *
+   * @param <T> the type of the key
+   * @param map the mutable map to update
+   * @param key the key whose counter is to be incremented
+   */
+  public static <T> void incrementCounter(Map<T, Integer> map, T key) {
+    map.merge(key, 1, Integer::sum);
+  }
+}