From 359a95db82f824f5d3a310c6299680addbbe609d Mon Sep 17 00:00:00 2001 From: Alexander Zagniotov Date: Sun, 17 Nov 2024 15:41:48 -0800 Subject: [PATCH 01/11] Renamed Gradle Kotlin default scripts --- build.gradle.kts => build.gradle.kts.retired | 0 settings.gradle.kts => settings.gradle.kts.retired | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename build.gradle.kts => build.gradle.kts.retired (100%) rename settings.gradle.kts => settings.gradle.kts.retired (100%) diff --git a/build.gradle.kts b/build.gradle.kts.retired similarity index 100% rename from build.gradle.kts rename to build.gradle.kts.retired diff --git a/settings.gradle.kts b/settings.gradle.kts.retired similarity index 100% rename from settings.gradle.kts rename to settings.gradle.kts.retired From 786087e15437936d8db0133fac354d18396ed115 Mon Sep 17 00:00:00 2001 From: Alexander Zagniotov Date: Mon, 18 Nov 2024 14:12:38 -0800 Subject: [PATCH 02/11] Added a build.gradle that builds and tests --- build.gradle | 329 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 329 insertions(+) create mode 100644 build.gradle diff --git a/build.gradle b/build.gradle new file mode 100644 index 00000000..5909f9a4 --- /dev/null +++ b/build.gradle @@ -0,0 +1,329 @@ +/* + * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar +//import org.jetbrains.dokka.gradle.DokkaTask +//import org.jetbrains.kotlin.gradle.tasks.KotlinCompile + +plugins { + id 'java' + id 'org.jlleitschuh.gradle.ktlint' version '12.1.1' + id 'org.jetbrains.dokka' version '1.9.20' + id 'com.github.johnrengelman.shadow' version '8.1.1' + id 'io.github.gradle-nexus.publish-plugin' version '2.0.0' + id 'com.github.sherter.google-java-format' version '0.9' // Last versions that are compatible with Java 8 + id 'maven-publish' + id 'signing' + id 'jacoco' + id 'com.gradleup.shadow' version '8.3.0' +} + + +group = project.hasProperty("linguaGroupId") ? project.property("linguaGroupId") : 'default.group.id' +description = project.hasProperty("linguaDescription") ? project.property("linguaDescription") : 'Default description' + +java { + sourceCompatibility = JavaVersion.VERSION_1_8 + targetCompatibility = JavaVersion.VERSION_1_8 +} + +jacoco { + toolVersion = "0.8.8" +} + +sourceSets { + main { + resources { + exclude 'training-data/**' + } + } + create("accuracyReport") { + compileClasspath += sourceSets.main.output + runtimeClasspath += sourceSets.main.output + } +} + +configurations { + accuracyReportImplementation { + extendsFrom(configurations.testImplementation) + } + accuracyReportRuntimeOnly { + extendsFrom(configurations.runtimeOnly) + } +} + +tasks.withType(Test).configureEach { + useJUnitPlatform { + failFast = true + } +} + +tasks.named("jacocoTestReport", JacocoReport).configure { + dependsOn "test" + reports { + xml.required.set(true) + csv.required.set(false) + html.required.set(true) + } + classDirectories.setFrom(files(classDirectories.files.collect { + fileTree(it) { + exclude '**/app/**' + } + })) +} + +tasks.register("accuracyReport", Test) { + group = project.hasProperty("linguaTaskGroup") ? 
project.property("linguaTaskGroup") : 'defaultGroup' + description = "Runs Lingua on provided test data, and writes detection accuracy reports for each language." + testClassesDirs = sourceSets["accuracyReport"].output.classesDirs + classpath = sourceSets["accuracyReport"].runtimeClasspath + + doFirst { + def allowedDetectors = project.hasProperty("linguaSupportedDetectors") ? project.property("linguaSupportedDetectors").split(',') : [] + def detectors = project.hasProperty('detectors') ? project.property('detectors').split(',') : allowedDetectors + + detectors.each { + if (!allowedDetectors.contains(it)) { + throw new GradleException("detector '$it' does not exist, supported detectors: ${allowedDetectors.join(', ')}") + } + } + + def allowedLanguages = project.hasProperty("linguaSupportedLanguages") ? project.property("linguaSupportedLanguages").split(',') : [] + def languages = project.hasProperty('languages') ? project.property('languages').split(',') : allowedLanguages + + languages.each { + if (!allowedLanguages.contains(it)) { + throw new GradleException("language '$it' is not supported") + } + } + + // Validate CPU cores + def availableCpuCores = Runtime.getRuntime().availableProcessors() + def cpuCoresRepr = project.hasProperty('cpuCores') ? project.property('cpuCores').toString() : "1" + def cpuCores = cpuCoresRepr.toInteger() + + if (cpuCores < 1 || cpuCores > availableCpuCores) { + throw new GradleException("$cpuCores cpu cores are not supported. 
Min: 1, Max: $availableCpuCores") + } + + maxHeapSize = '4096m' + maxParallelForks = cpuCores + reports.html.required.set(false) + reports.junitXml.required.set(false) + + filter { + detectors.each { detector -> + languages.each { language -> + includeTestsMatching("${project.property('linguaGroupId')}.${project.property('linguaArtifactId')}.report.${detector.toLowerCase()}.${language}DetectionAccuracyReport") + } + } + } + } +} + +tasks.register("writeAggregatedAccuracyReport") { + group = project.hasProperty("linguaTaskGroup") ? project.property("linguaTaskGroup") : 'defaultGroup' + description = "Creates a table from all accuracy detection reports and writes it to a CSV file." + + doLast { + def accuracyReportsDirectoryName = 'accuracy-reports' + def accuracyReportsDirectory = file(accuracyReportsDirectoryName) + if (!accuracyReportsDirectory.exists()) { + throw new GradleException("directory '$accuracyReportsDirectoryName' does not exist") + } + + def detectors = project.hasProperty("linguaSupportedDetectors") ? project.property("linguaSupportedDetectors").split(',') : [] + def languages = project.hasProperty("linguaSupportedLanguages") ? project.property("linguaSupportedLanguages").split(',') : [] + def csvFile = file("$accuracyReportsDirectoryName/aggregated-accuracy-values.csv") + def stringToSplitAt = ">> Exact values:" + + if (csvFile.exists()) csvFile.delete() + csvFile.createNewFile() + csvFile.append(project.hasProperty("linguaCsvHeader") ? project.property("linguaCsvHeader") : "") + csvFile.append("\n") + + languages.each { language -> + csvFile.append(language) + + detectors.each { detector -> + def languageReportFileName = "$accuracyReportsDirectoryName/${detector.toLowerCase()}/${language}.txt" + def languageReportFile = file(languageReportFileName) + def sliceLength = detector == "Lingua" ? 
(1..8) : (1..4) + + if (languageReportFile.exists()) { + languageReportFile.readLines().each { line -> + if (line.startsWith(stringToSplitAt)) { + def accuracyValues = line.split(stringToSplitAt)[1].split(' ')[sliceLength].join(',') + csvFile.append(",${accuracyValues}") + } + } + } else { + csvFile.append(detector == "Lingua" ? ",NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN" : ",NaN,NaN,NaN,NaN") + } + } + + csvFile.append("\n") + } + + println("file 'aggregated-accuracy-values.csv' written successfully") + } +} + +//tasks.named("compileAccuracyReportKotlin", KotlinCompile).configure { +// kotlinOptions.jvmTarget = "17" +//} + +tasks.named("compileAccuracyReportJava", JavaCompile).configure { + sourceCompatibility = "17" + targetCompatibility = "17" +} + +//tasks.withType(DokkaTask).configureEach { +// dokkaSourceSets.configureEach { +// jdkVersion.set(8) +// reportUndocumented.set(false) +// perPackageOption { +// matchingRegex.set(".*\\.(app|internal).*") +// suppress.set(true) +// } +// } +//} + +tasks.register("dokkaJavadocJar", Jar).configure { + dependsOn "dokkaJavadoc" + group = "Build" + description = "Assembles a jar archive containing Javadoc documentation." + archiveClassifier.set("javadoc") + from(layout.buildDirectory.dir("dokka/javadoc")) +} + +tasks.register("sourcesJar", Jar).configure { + group = "Build" + description = "Assembles a jar archive containing the main source code." + archiveClassifier.set("sources") + from("src/main/kotlin") +} + +tasks.register("jarWithDependencies", ShadowJar).configure { + group = "Build" + description = "Assembles a jar archive containing the main classes and all external dependencies." 
+ archiveClassifier.set("with-dependencies") + from(sourceSets.main.output) + configurations = [project.configurations.runtimeClasspath] + manifest { + attributes "Main-Class": project.property("linguaMainClass") + } +} + +tasks.register("runLinguaOnConsole", JavaExec).configure { + group = project.hasProperty("linguaTaskGroup") ? project.property("linguaTaskGroup") : 'defaultGroup' + description = "Starts a REPL (read-evaluate-print loop) to try Lingua on the command line." + mainClass.set(project.property("linguaMainClass")) + standardInput = System.in + classpath = sourceSets.main.runtimeClasspath +} + +dependencies { + implementation "com.squareup.moshi:moshi:1.15.1" + implementation "com.squareup.moshi:moshi-kotlin:1.15.1" + implementation "it.unimi.dsi:fastutil:8.5.15" + + testImplementation "org.junit.jupiter:junit-jupiter:5.11.3" + testImplementation "org.assertj:assertj-core:3.26.3" + testImplementation "org.mockito:mockito-core:5.2.0" + testImplementation "org.mockito:mockito-junit-jupiter:5.2.0" + + accuracyReportImplementation "com.optimaize.languagedetector:language-detector:0.6" + accuracyReportImplementation "org.apache.opennlp:opennlp-tools:2.4.0" + accuracyReportImplementation "org.apache.tika:tika-core:3.0.0" + accuracyReportImplementation "org.apache.tika:tika-langdetect-optimaize:3.0.0" + accuracyReportImplementation "org.slf4j:slf4j-nop:2.0.16" +} + +publishing { + publications { + mavenJava(MavenPublication) { + groupId = project.findProperty("linguaGroupId").toString() + artifactId = project.findProperty("linguaArtifactId").toString() + version = project.version.toString() + + from components.java + + artifact sourcesJar + artifact jarWithDependencies + artifact dokkaJavadocJar + + pom { + name.set(project.findProperty("linguaName").toString()) + description.set(project.findProperty("linguaDescription").toString()) + url.set(project.findProperty("linguaWebsiteUrl").toString()) + + licenses { + license { + 
name.set(project.findProperty("linguaLicenseName").toString()) + url.set(project.findProperty("linguaLicenseUrl").toString()) + } + } + developers { + developer { + id.set(project.findProperty("linguaDeveloperId").toString()) + name.set(project.findProperty("linguaDeveloperName").toString()) + email.set(project.findProperty("linguaDeveloperEmail").toString()) + url.set(project.findProperty("linguaDeveloperUrl").toString()) + } + } + scm { + connection.set(project.findProperty("linguaScmConnection").toString()) + developerConnection.set(project.findProperty("linguaScmDeveloperConnection").toString()) + url.set(project.findProperty("linguaScmUrl").toString()) + } + } + } + } + + repositories { + maven { + name = "GitHubPackages" + url = uri(project.findProperty("githubPackagesUrl").toString()) + credentials { + username = project.findProperty("linguaDeveloperId").toString() + password = project.findProperty("ghPackagesToken")?.toString() ?: "" + } + } + } +} + +nexusPublishing { + repositories { + sonatype() + } +} + +signing { + //sign(publishing.publications["lingua"]) + sign publishing.publications.mavenJava +} + +repositories { + mavenCentral() +} + +googleJavaFormat { + toolVersion = '1.7' // Last versions that are compatible with Java 8 + exclude '**/wrapper/dists/**' + exclude '**/src/*/resources/**' +} +verifyGoogleJavaFormat.dependsOn(tasks.googleJavaFormat) From 673dc1e19d7b346c23a3370b4c70a31300b41671 Mon Sep 17 00:00:00 2001 From: Alexander Zagniotov Date: Mon, 18 Nov 2024 14:14:13 -0800 Subject: [PATCH 03/11] internal pkg: migrated Fraction and FractionTest --- .../pemistahl/lingua/internal/Fraction.java | 305 ++++++++++++++++++ .../lingua/internal/FractionTest.java | 156 +++++++++ 2 files changed, 461 insertions(+) create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/Fraction.java create mode 100644 src/test/java/com/github/pemistahl/lingua/internal/FractionTest.java diff --git 
a/src/main/java/com/github/pemistahl/lingua/internal/Fraction.java b/src/main/java/com/github/pemistahl/lingua/internal/Fraction.java new file mode 100644 index 00000000..ab491a0c --- /dev/null +++ b/src/main/java/com/github/pemistahl/lingua/internal/Fraction.java @@ -0,0 +1,305 @@ +/* + * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.pemistahl.lingua.internal; + +import com.squareup.moshi.ToJson; +import java.util.Objects; + +/** + * A class representing a fraction with a numerator and denominator. Provides methods for reducing + * fractions to their lowest terms, comparing fractions, and converting them to different numeric + * types. + * + * @author Peter M. 
Stahl + * @author Migration to Java from Kotlin by Alexander Zagniotov + */ +public class Fraction extends Number implements Comparable<Fraction> { + + private final int numerator; + private final int denominator; + + public Fraction(final int numerator, final int denominator) { + final int[] data = reduceToLowestTerms(numerator, denominator); + this.numerator = data[0]; + this.denominator = data[1]; + } + + @Override + public int compareTo(final Fraction other) { + long n0d = (long) numerator * other.denominator; + long d0n = (long) denominator * other.numerator; + if (n0d < d0n) { + return -1; + } else if (n0d > d0n) { + return 1; + } else { + return 0; + } + } + + @Override + public String toString() { + return numerator + "/" + denominator; + } + + @Override + public byte byteValue() { + return (byte) (int) doubleValue(); + } + + @Override + public double doubleValue() { + return (double) numerator / denominator; + } + + @Override + public float floatValue() { + return (float) doubleValue(); + } + + @Override + public int intValue() { + return (int) doubleValue(); + } + + @Override + public long longValue() { + return (long) doubleValue(); + } + + @Override + public short shortValue() { + return (short) (int) doubleValue(); + } + + /** + * Reduces the fraction to its lowest terms. + * + * @param numerator The numerator of the fraction. + * @param denominator The denominator of the fraction. + * @return A two-element array holding the reduced numerator and denominator, in that order. + * @throws ArithmeticException if the denominator is zero or the sign cannot be normalized. 
+ */ + private int[] reduceToLowestTerms(final int numerator, final int denominator) { + int num = numerator; + int den = denominator; + + if (den == 0) { + throw new ArithmeticException("zero denominator in fraction '" + num + "/" + den + "'"); + } + + if (den < 0) { + if (num == Integer.MIN_VALUE || den == Integer.MIN_VALUE) { + throw new ArithmeticException("overflow in fraction " + num + "/" + den + ", cannot negate"); + } + num = -num; + den = -den; + } + + int gcd = greatestCommonDenominator(num, den); + + if (gcd > 1) { + num /= gcd; + den /= gcd; + } + + if (den < 0) { + num = -num; + den = -den; + } + + return new int[] {num, den}; + } + + /** + * Calculates the greatest common divisor (GCD) of two integers. + * + * @param a The first integer. + * @param b The second integer. + * @return The GCD of the two integers. + * @throws ArithmeticException if an overflow occurs. + */ + private int greatestCommonDenominator(final int a, final int b) { + if (a == 0 || b == 0) { + if (a == Integer.MIN_VALUE || b == Integer.MIN_VALUE) { + throw new ArithmeticException( + "overflow: greatestCommonDenominator(" + a + ", " + b + ") is 2^31"); + } + return Math.abs(a + b); + } + + int x = a; + int y = b; + long xl = x; + long yl = y; + boolean useLong = false; + + if (x < 0) { + if (x == Integer.MIN_VALUE) { + useLong = true; + } else { + x = -x; + } + xl = -xl; + } + + if (y < 0) { + if (y == Integer.MIN_VALUE) { + useLong = true; + } else { + y = -y; + } + yl = -yl; + } + + if (useLong) { + if (xl == yl) { + throw new ArithmeticException( + "overflow: greatestCommonDenominator(" + a + ", " + b + ") is 2^31"); + } + long ylyu = yl; + yl = xl; + xl = ylyu % xl; + if (xl == 0L) { + if (yl > Integer.MAX_VALUE) { + throw new ArithmeticException( + "overflow: greatestCommonDenominator(" + a + ", " + b + ") is 2^31"); + } + return (int) yl; + } + ylyu = yl; + + y = (int) xl; + x = (int) (ylyu % xl); + } + + return greatestCommonDivisor(x, y); + } + + /** + * Computes the greatest 
common divisor (GCD) of two non-negative integers using binary GCD + * algorithm. + * + * @param a The first integer. + * @param b The second integer. + * @return The GCD of the two integers. + */ + private int greatestCommonDivisor(final int a, final int b) { + assert a >= 0; + assert b >= 0; + + if (a == 0) return b; + if (b == 0) return a; + + int x = a; + int y = b; + + int xTwos = numberOfTrailingZeros(x); + int yTwos = numberOfTrailingZeros(y); + int shift = Math.min(xTwos, yTwos); + + x = x >> xTwos; + y = y >> yTwos; + + while (x != y) { + int delta = x - y; + y = Math.min(x, y); + x = Math.abs(delta); + x = x >> numberOfTrailingZeros(x); + } + + return x << shift; + } + + /** + * Counts the number of trailing zeros in the binary representation of the given integer. + * + * @param i The integer whose trailing zeros are to be counted. + * @return The number of trailing zeros in the binary representation of i. + */ + private int numberOfTrailingZeros(final int i) { + if (i == 0) return 32; + + int j = i; + int n = 31; + + int y = j << 16; + if (y != 0) { + n -= 16; + j = y; + } + + y = j << 8; + if (y != 0) { + n -= 8; + j = y; + } + + y = j << 4; + if (y != 0) { + n -= 4; + j = y; + } + + y = j << 2; + if (y != 0) { + n -= 2; + j = y; + } + + return n - ((j << 1) >>> 31); + } + + /** + * Computes the absolute value of an integer without branching. + * + * @param x The integer whose absolute value is to be computed. + * @return The absolute value of x. + */ + private int abs(final int x) { + int i = x >>> 31; + return (x ^ (~i + 1)) + i; + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Fraction fraction = (Fraction) o; + return numerator == fraction.numerator && denominator == fraction.denominator; + } + + @Override + public int hashCode() { + return Objects.hash(numerator, denominator); + } +} + +/** A class to handle conversion of Fraction objects to JSON using Moshi. 
*/ +class FractionAdapter { + + /** + * Converts a Fraction object to its string representation. + * + * @param fraction The Fraction object to be converted. + * @return The string representation of the fraction. + */ + @ToJson + public String toJson(final Fraction fraction) { + return fraction.toString(); + } +} diff --git a/src/test/java/com/github/pemistahl/lingua/internal/FractionTest.java b/src/test/java/com/github/pemistahl/lingua/internal/FractionTest.java new file mode 100644 index 00000000..116a6f1b --- /dev/null +++ b/src/test/java/com/github/pemistahl/lingua/internal/FractionTest.java @@ -0,0 +1,156 @@ +/* + * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.pemistahl.lingua.internal; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; + +import org.junit.jupiter.api.Test; + +/** + * Unit tests for the Fraction class. + * + * @author Peter M. 
Stahl + * @author Migration to Java from Kotlin by Alexander Zagniotov + */ +class FractionTest { + + private final Fraction fraction1 = new Fraction(12, 144); + private final Fraction fraction2 = new Fraction(63, 27); + private final Fraction fraction3 = new Fraction(0, 1234); + private final Fraction fraction4 = new Fraction(-42, 210); + private final Fraction fraction5 = new Fraction(169, -65); + + /** Test that Fraction is correctly reduced to lowest terms. */ + @Test + void assertThatFractionIsCorrectlyReducedToLowestTerms() { + assertThat(fraction1).isEqualTo(new Fraction(1, 12)); + assertThat(fraction2).isEqualTo(new Fraction(7, 3)); + assertThat(fraction3).isEqualTo(new Fraction(0, 1)); + assertThat(fraction4).isEqualTo(new Fraction(-1, 5)); + assertThat(fraction5).isEqualTo(new Fraction(-13, 5)); + } + + /** Test that a Fraction with a zero denominator cannot be created. */ + @Test + void assertThatFractionWithDenominatorZeroCannotBeCreated() { + assertThatExceptionOfType(ArithmeticException.class) + .isThrownBy(() -> new Fraction(1234, 0)) + .withMessage("zero denominator in fraction '1234/0'"); + } + + /** Test the toString() implementation of Fraction. */ + @Test + void assertThatToStringImplementationOfFractionIsCorrect() { + assertThat(fraction1.toString()).isEqualTo("1/12"); + assertThat(fraction2.toString()).isEqualTo("7/3"); + assertThat(fraction3.toString()).isEqualTo("0/1"); + assertThat(fraction4.toString()).isEqualTo("-1/5"); + assertThat(fraction5.toString()).isEqualTo("-13/5"); + } + + /** Test the doubleValue() implementation of Fraction. 
*/ + @Test + void assertThatToDoubleImplementationOfFractionIsCorrect() { + assertThat(fraction1.doubleValue()).isEqualTo(1.0 / 12); + assertThat(fraction2.doubleValue()).isEqualTo(7.0 / 3); + assertThat(fraction3.doubleValue()).isEqualTo(0.0); + assertThat(fraction4.doubleValue()).isEqualTo(-0.2); + assertThat(fraction5.doubleValue()).isEqualTo(-2.6); + } + + /** Test the floatValue() implementation of Fraction. */ + @Test + void assertThatToFloatImplementationOfFractionIsCorrect() { + assertThat(fraction1.floatValue()).isEqualTo(1.0f / 12); + assertThat(fraction2.floatValue()).isEqualTo(7.0f / 3); + assertThat(fraction3.floatValue()).isEqualTo(0.0f); + assertThat(fraction4.floatValue()).isEqualTo(-0.2f); + assertThat(fraction5.floatValue()).isEqualTo(-2.6f); + } + + /** Test the intValue() implementation of Fraction. */ + @Test + void assertThatToIntImplementationOfFractionIsCorrect() { + assertThat(fraction1.intValue()).isEqualTo(0); + assertThat(fraction2.intValue()).isEqualTo(2); + assertThat(fraction3.intValue()).isEqualTo(0); + assertThat(fraction4.intValue()).isEqualTo(0); + assertThat(fraction5.intValue()).isEqualTo(-2); + } + + /** Test the longValue() implementation of Fraction. */ + @Test + void assertThatToLongImplementationOfFractionIsCorrect() { + assertThat(fraction1.longValue()).isEqualTo(0); + assertThat(fraction2.longValue()).isEqualTo(2); + assertThat(fraction3.longValue()).isEqualTo(0); + assertThat(fraction4.longValue()).isEqualTo(0); + assertThat(fraction5.longValue()).isEqualTo(-2); + } + + /** Test the shortValue() implementation of Fraction. 
*/ + @Test + void assertThatToShortImplementationOfFractionIsCorrect() { + assertThat(fraction1.shortValue()).isEqualTo((short) 0); + assertThat(fraction2.shortValue()).isEqualTo((short) 2); + assertThat(fraction3.shortValue()).isEqualTo((short) 0); + assertThat(fraction4.shortValue()).isEqualTo((short) 0); + assertThat(fraction5.shortValue()).isEqualTo((short) -2); + } + + /** Test the byteValue() implementation of Fraction. */ + @Test + void assertThatToByteImplementationOfFractionIsCorrect() { + assertThat(fraction1.byteValue()).isEqualTo((byte) 0); + assertThat(fraction2.byteValue()).isEqualTo((byte) 2); + assertThat(fraction3.byteValue()).isEqualTo((byte) 0); + assertThat(fraction4.byteValue()).isEqualTo((byte) 0); + assertThat(fraction5.byteValue()).isEqualTo((byte) -2); + } + + /** Test that Fraction comparisons work correctly. */ + @Test + void assertThatFractionComparisonsWorkCorrectly() { + boolean[] comparisons = { + fraction1.compareTo(fraction3) > 0, + fraction1.compareTo(fraction4) > 0, + fraction1.compareTo(fraction5) > 0, + fraction2.compareTo(fraction1) > 0, + fraction2.compareTo(fraction3) > 0, + fraction2.compareTo(fraction4) > 0, + fraction2.compareTo(fraction5) > 0, + fraction3.compareTo(fraction4) > 0, + fraction3.compareTo(fraction5) > 0, + fraction4.compareTo(fraction5) > 0, + fraction1.compareTo(fraction2) < 0, + fraction3.compareTo(fraction1) < 0, + fraction3.compareTo(fraction2) < 0, + fraction4.compareTo(fraction1) < 0, + fraction4.compareTo(fraction2) < 0, + fraction4.compareTo(fraction3) < 0, + fraction5.compareTo(fraction1) < 0, + fraction5.compareTo(fraction2) < 0, + fraction5.compareTo(fraction3) < 0, + fraction5.compareTo(fraction4) < 0 + }; + + for (boolean comparison : comparisons) { + assertThat(comparison).isTrue(); + } + } +} From 71b77ff171f0a8ab81b4a96300d87a8ecca64dec Mon Sep 17 00:00:00 2001 From: Alexander Zagniotov Date: Mon, 18 Nov 2024 14:15:03 -0800 Subject: [PATCH 04/11] internal pkg: migrated Ngram and NgramTest --- 
.../pemistahl/lingua/internal/Ngram.java | 147 ++++++++++++++++ .../lingua/internal/NgramIterator.java | 64 +++++++ .../pemistahl/lingua/internal/NgramRange.java | 78 +++++++++ .../pemistahl/lingua/internal/NgramTest.java | 157 ++++++++++++++++++ 4 files changed, 446 insertions(+) create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/Ngram.java create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/NgramIterator.java create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/NgramRange.java create mode 100644 src/test/java/com/github/pemistahl/lingua/internal/NgramTest.java diff --git a/src/main/java/com/github/pemistahl/lingua/internal/Ngram.java b/src/main/java/com/github/pemistahl/lingua/internal/Ngram.java new file mode 100644 index 00000000..7954d157 --- /dev/null +++ b/src/main/java/com/github/pemistahl/lingua/internal/Ngram.java @@ -0,0 +1,147 @@ +/* + * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.pemistahl.lingua.internal; + +import java.util.Objects; + +/** + * This class represents an Ngram value, a string-based object with specific constraints on its + * length. The Ngram is comparable to other Ngrams based on its length. + * + * @author Peter M. 
Stahl + * @author Migration to Java from Kotlin by Alexander Zagniotov + */ +public final class Ngram implements Comparable<Ngram> { + + private final String value; + + /** + * Constructs an Ngram with a given string value. + * + * @param value the string value of the Ngram + * @throws IllegalArgumentException if the value is null or its length is not in the range 0..5 + */ + public Ngram(final String value) { + if (value == null || value.length() > 5) { + throw new IllegalArgumentException("Length of ngram '" + value + "' is not in range 0..5"); + } + this.value = value; + } + + public String getValue() { + return value; + } + + /** + * Returns the string representation of the Ngram. + * + * @return the string value of the Ngram + */ + @Override + public String toString() { + return value; + } + + /** + * Compares the Ngram to another Ngram based on the length of their values. + * + * @param other the other Ngram to compare to + * @return a negative integer, zero, or a positive integer as this Ngram is less than, equal to, + * or greater than the specified Ngram + */ + @Override + public int compareTo(final Ngram other) { + return Integer.compare(this.value.length(), other.value.length()); + } + + /** + * Returns the range of lower order Ngrams that this Ngram can generate. + * + * @return the range of lower order Ngrams + */ + public NgramRange rangeOfLowerOrderNgrams() { + return new NgramRange(this, new Ngram(String.valueOf(this.value.charAt(0)))); + } + + /** + * Decrements the Ngram by removing the last character, unless it is a zerogram. 
+ * + * @return the decremented Ngram + * @throws IllegalStateException if the Ngram is a zerogram and cannot be decremented + */ + public Ngram dec() { + if (value.isEmpty()) { + throw new IllegalStateException( + "Zerogram is ngram type of lowest order and can not be decremented"); + } else if (value.length() == 1) { + return new Ngram(""); + } else { + return new Ngram(value.substring(0, value.length() - 1)); + } + } + + /** + * Returns the name of the Ngram type based on the given length. + * + * @param ngramLength the length of the Ngram + * @return the name of the Ngram type (unigram, bigram, trigram, quadrigram, or fivegram) + * @throws IllegalArgumentException if the length is not between 1 and 5 + */ + public static String getNgramNameByLength(int ngramLength) { + switch (ngramLength) { + case 1: + return "unigram"; + case 2: + return "bigram"; + case 3: + return "trigram"; + case 4: + return "quadrigram"; + case 5: + return "fivegram"; + default: + throw new IllegalArgumentException("Ngram length " + ngramLength + " is not in range 1..5"); + } + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + Ngram ngram = (Ngram) o; + return Objects.equals(value, ngram.value); + } + + @Override + public int hashCode() { + return Objects.hashCode(value); + } +} + +/** + * A simple closed range interface representing a range of values. 
+ * + * @param the type of the range elements + */ +interface ClosedRange { + + boolean contains(final Ngram value); + + T getStart(); + + T getEndInclusive(); +} diff --git a/src/main/java/com/github/pemistahl/lingua/internal/NgramIterator.java b/src/main/java/com/github/pemistahl/lingua/internal/NgramIterator.java new file mode 100644 index 00000000..3d97bc0d --- /dev/null +++ b/src/main/java/com/github/pemistahl/lingua/internal/NgramIterator.java @@ -0,0 +1,64 @@ +package com.github.pemistahl.lingua.internal; + +import java.util.Iterator; +import java.util.NoSuchElementException; +import java.util.Objects; + +/** + * Iterator for iterating over Ngrams starting from a specific Ngram. + * + * @author Peter M. Stahl + * @author Migration to Java from Kotlin by Alexander Zagniotov + */ +public final class NgramIterator implements Iterator { + + private Ngram current; + + /** + * Constructs an NgramIterator starting at a specific Ngram. + * + * @param start the starting Ngram + */ + public NgramIterator(final Ngram start) { + this.current = start; + } + + /** + * Checks if there are more Ngrams to iterate over. + * + * @return true if there are more Ngrams, false otherwise + */ + @Override + public boolean hasNext() { + return !current.toString().isEmpty(); + } + + /** + * Returns the next Ngram in the iteration. 
+ * + * @return the next Ngram + * @throws NoSuchElementException if there are no more Ngrams to iterate over + */ + @Override + public Ngram next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + Ngram result = current; + current = current.dec(); + return result; + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + NgramIterator that = (NgramIterator) o; + return Objects.equals(current, that.current); + } + + @Override + public int hashCode() { + return Objects.hashCode(current); + } +} diff --git a/src/main/java/com/github/pemistahl/lingua/internal/NgramRange.java b/src/main/java/com/github/pemistahl/lingua/internal/NgramRange.java new file mode 100644 index 00000000..39dc10bc --- /dev/null +++ b/src/main/java/com/github/pemistahl/lingua/internal/NgramRange.java @@ -0,0 +1,78 @@ +package com.github.pemistahl.lingua.internal; + +import java.util.Iterator; +import java.util.Objects; + +/** + * Represents a closed range of Ngrams from a start Ngram to an end Ngram. The range includes all + * Ngrams from the start Ngram to the end Ngram. + * + * @author Peter M. Stahl + * @author Migration to Java from Kotlin by Alexander Zagniotov + */ +public final class NgramRange implements Iterable, ClosedRange { + + private final Ngram start; + private final Ngram endInclusive; + + /** + * Constructs an NgramRange with a start and an end Ngram. 
+ * + * @param start the start Ngram + * @param endInclusive the end Ngram (inclusive) + * @throws IllegalArgumentException if the start Ngram compares lower than the end Ngram (equal Ngrams are accepted) + */ + public NgramRange(final Ngram start, final Ngram endInclusive) { + if (start.compareTo(endInclusive) < 0) { + throw new IllegalArgumentException( + "'" + start + "' must be of higher order than '" + endInclusive + "'"); + } + this.start = start; + this.endInclusive = endInclusive; + } + + /** + * Checks if a given Ngram is within this range. + * + * @param value the Ngram to check + * @return true if the Ngram is within the range, false otherwise + */ + @Override + public boolean contains(final Ngram value) { + return value.compareTo(endInclusive) >= 0 && value.compareTo(start) <= 0; + } + + /** + * Returns a new NgramIterator positioned at the start Ngram; endInclusive is not passed to it. + * + * @return an iterator over the Ngrams + */ + @Override + public Iterator iterator() { + return new NgramIterator(start); + } + + @Override + public Ngram getStart() { + return start; + } + + @Override + public Ngram getEndInclusive() { + return endInclusive; + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + NgramRange ngrams = (NgramRange) o; + return Objects.equals(getStart(), ngrams.getStart()) + && Objects.equals(getEndInclusive(), ngrams.getEndInclusive()); + } + + @Override + public int hashCode() { + return Objects.hash(getStart(), getEndInclusive()); + } +} diff --git a/src/test/java/com/github/pemistahl/lingua/internal/NgramTest.java b/src/test/java/com/github/pemistahl/lingua/internal/NgramTest.java new file mode 100644 index 00000000..d97bb248 --- /dev/null +++ b/src/test/java/com/github/pemistahl/lingua/internal/NgramTest.java @@ -0,0 +1,157 @@ +/* + * Copyright © 2018-today Peter M. 
Stahl pemistahl@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.pemistahl.lingua.internal; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import java.util.NoSuchElementException; +import org.assertj.core.api.Assertions; +import org.junit.jupiter.api.Test; + +/** + * Tests for the Ngram class and related components like NgramRange and NgramIterator. These tests + * verify the expected behavior and correctness of the Ngram functionality. + * + * @author Peter M. 
Stahl pemistahl@gmail.com + * @author Alexander Zagniotov azagniotov@gmail.com + */ +public class NgramTest { + + private final Ngram zerogram = new Ngram(""); + private final Ngram unigram = new Ngram("q"); + private final Ngram bigram = new Ngram("qw"); + private final Ngram trigram = new Ngram("qwe"); + private final Ngram quadrigram = new Ngram("qwer"); + private final Ngram fivegram = new Ngram("qwert"); + private final List<Ngram> ngrams = List.of(unigram, bigram, trigram, quadrigram, fivegram); + + @Test + public void testToString() { + // Assert that the toString() implementation of Ngram is its value + assertThat(fivegram.toString()).isEqualTo("qwert"); + assertThat(quadrigram.toString()).isEqualTo("qwer"); + assertThat(trigram.toString()).isEqualTo("qwe"); + assertThat(bigram.toString()).isEqualTo("qw"); + assertThat(unigram.toString()).isEqualTo("q"); + assertThat(zerogram.toString()).isEqualTo(""); + } + + @Test + public void testNgramComparisons() { + // Assert that Ngram comparisons work correctly + List<Boolean> comparisons = + List.of( + fivegram.compareTo(quadrigram) > 0, + fivegram.compareTo(trigram) > 0, + fivegram.compareTo(bigram) > 0, + fivegram.compareTo(unigram) > 0, + fivegram.compareTo(zerogram) > 0, + quadrigram.compareTo(trigram) > 0, + quadrigram.compareTo(bigram) > 0, + quadrigram.compareTo(unigram) > 0, + quadrigram.compareTo(zerogram) > 0, + trigram.compareTo(bigram) > 0, + trigram.compareTo(unigram) > 0, + trigram.compareTo(zerogram) > 0, + bigram.compareTo(unigram) > 0, + bigram.compareTo(zerogram) > 0, + unigram.compareTo(zerogram) > 0, + quadrigram.compareTo(fivegram) < 0, + trigram.compareTo(fivegram) < 0, + bigram.compareTo(fivegram) < 0, + unigram.compareTo(fivegram) < 0, + zerogram.compareTo(fivegram) < 0, + trigram.compareTo(quadrigram) < 0, + bigram.compareTo(quadrigram) < 0, + unigram.compareTo(quadrigram) < 0, + zerogram.compareTo(quadrigram) < 0, + bigram.compareTo(trigram) < 0, + unigram.compareTo(trigram) < 0, + 
zerogram.compareTo(trigram) < 0, + unigram.compareTo(bigram) < 0, + zerogram.compareTo(bigram) < 0, + zerogram.compareTo(unigram) < 0); + + assertThat(comparisons).doesNotContain(false); // forEach(assertThat) verified nothing + } + + @Test + public void testDecrement() { + // Assert that Fivegrams can be decremented correctly + Ngram quadrigram = fivegram.dec(); + assertThat(quadrigram).isEqualTo(this.quadrigram); + + Ngram trigram = quadrigram.dec(); + assertThat(trigram).isEqualTo(this.trigram); + + Ngram bigram = trigram.dec(); + assertThat(bigram).isEqualTo(this.bigram); + + Ngram unigram = bigram.dec(); + assertThat(unigram).isEqualTo(this.unigram); + + Ngram zerogram = unigram.dec(); + assertThat(zerogram).isEqualTo(this.zerogram); + + Assertions.assertThatIllegalStateException() + .isThrownBy(zerogram::dec) + .withMessage("Zerogram is ngram type of lowest order and can not be decremented"); + } + + @Test + public void testNgramRange() { + // Assert that NgramRange works correctly + NgramRange ngramRange = new NgramRange(fivegram, bigram); + + assertThat(ngramRange.contains(fivegram)).isTrue(); + assertThat(ngramRange.contains(quadrigram)).isTrue(); + assertThat(ngramRange.contains(trigram)).isTrue(); + assertThat(ngramRange.contains(bigram)).isTrue(); + + assertThat(ngramRange.contains(unigram)).isFalse(); + assertThat(ngramRange.contains(zerogram)).isFalse(); + + assertThat(ngramRange.iterator()).isEqualTo(new NgramIterator(fivegram)); + + Assertions.assertThatThrownBy(() -> new NgramRange(bigram, fivegram)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("'qw' must be of higher order than 'qwert'"); + } + + @Test + public void testLowerOrderNgrams() { + // Assert that range of lower order ngrams can be generated correctly + for (Ngram ngram : ngrams) { + assertThat(ngram.rangeOfLowerOrderNgrams()).isEqualTo(new NgramRange(ngram, unigram)); + } + } + + @Test + public void testNgramIterator() { + // Assert that NgramIterator works correctly + NgramIterator iterator = new 
NgramIterator(fivegram); + + assertThat(iterator.next()).isEqualTo(fivegram); + assertThat(iterator.next()).isEqualTo(quadrigram); + assertThat(iterator.next()).isEqualTo(trigram); + assertThat(iterator.next()).isEqualTo(bigram); + assertThat(iterator.next()).isEqualTo(unigram); + + Assertions.assertThatThrownBy(iterator::next).isInstanceOf(NoSuchElementException.class); + } +} From f1549b4bf0346f764ccaf027e857ee4affeab73d Mon Sep 17 00:00:00 2001 From: Alexander Zagniotov Date: Mon, 18 Nov 2024 14:15:59 -0800 Subject: [PATCH 05/11] api pkg: migrated IsoCode639_1 and IsoCode639_3 --- .../pemistahl/lingua/api/IsoCode639_1.java | 270 ++++++++++++++++++ .../pemistahl/lingua/api/IsoCode639_3.java | 270 ++++++++++++++++++ 2 files changed, 540 insertions(+) create mode 100644 src/main/java/com/github/pemistahl/lingua/api/IsoCode639_1.java create mode 100644 src/main/java/com/github/pemistahl/lingua/api/IsoCode639_3.java diff --git a/src/main/java/com/github/pemistahl/lingua/api/IsoCode639_1.java b/src/main/java/com/github/pemistahl/lingua/api/IsoCode639_1.java new file mode 100644 index 00000000..df04d1ec --- /dev/null +++ b/src/main/java/com/github/pemistahl/lingua/api/IsoCode639_1.java @@ -0,0 +1,270 @@ +/* + * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.pemistahl.lingua.api; + +/** + * The ISO 639-1 code representations for the supported languages. + * + *

ISO 639 is a standardized nomenclature used to classify languages. + */ +public enum IsoCode639_1 { + + /** The ISO 639-1 code for [Afrikaans][Language.AFRIKAANS]. */ + AF, + + /** The ISO 639-1 code for [Amharic][Language.AMHARIC]. */ + AM, + + /** The ISO 639-1 code for [Arabic][Language.ARABIC]. */ + AR, + + /** The ISO 639-1 code for [Azerbaijani][Language.AZERBAIJANI]. */ + AZ, + + /** The ISO 639-1 code for [Belarusian][Language.BELARUSIAN]. */ + BE, + + /** The ISO 639-1 code for [Bulgarian][Language.BULGARIAN]. */ + BG, + + /** The ISO 639-1 code for [Bengali][Language.BENGALI]. */ + BN, + + /** The ISO 639-1 code for [Bosnian][Language.BOSNIAN]. */ + BS, + + /** The ISO 639-1 code for [Catalan][Language.CATALAN]. */ + CA, + + /** The ISO 639-1 code for [Czech][Language.CZECH]. */ + CS, + + /** The ISO 639-1 code for [Welsh][Language.WELSH]. */ + CY, + + /** The ISO 639-1 code for [Danish][Language.DANISH]. */ + DA, + + /** The ISO 639-1 code for [German][Language.GERMAN]. */ + DE, + + /** The ISO 639-1 code for [Greek][Language.GREEK]. */ + EL, + + /** The ISO 639-1 code for [English][Language.ENGLISH]. */ + EN, + + /** The ISO 639-1 code for [Esperanto][Language.ESPERANTO]. */ + EO, + + /** The ISO 639-1 code for [Spanish][Language.SPANISH]. */ + ES, + + /** The ISO 639-1 code for [Estonian][Language.ESTONIAN]. */ + ET, + + /** The ISO 639-1 code for [Basque][Language.BASQUE]. */ + EU, + + /** The ISO 639-1 code for [Persian][Language.PERSIAN]. */ + FA, + + /** The ISO 639-1 code for [Finnish][Language.FINNISH]. */ + FI, + + /** The ISO 639-1 code for [French][Language.FRENCH]. */ + FR, + + /** The ISO 639-1 code for [Irish][Language.IRISH]. */ + GA, + + /** The ISO 639-1 code for [Gujarati][Language.GUJARATI]. */ + GU, + + /** The ISO 639-1 code for [Hebrew][Language.HEBREW]. */ + HE, + + /** The ISO 639-1 code for [Hindi][Language.HINDI]. */ + HI, + + /** The ISO 639-1 code for [Croatian][Language.CROATIAN]. 
*/ + HR, + + /** The ISO 639-1 code for [Hungarian][Language.HUNGARIAN]. */ + HU, + + /** The ISO 639-1 code for [Armenian][Language.ARMENIAN]. */ + HY, + + /** The ISO 639-1 code for [Indonesian][Language.INDONESIAN]. */ + ID, + + /** The ISO 639-1 code for [Icelandic][Language.ICELANDIC]. */ + IS, + + /** The ISO 639-1 code for [Italian][Language.ITALIAN]. */ + IT, + + /** The ISO 639-1 code for [Japanese][Language.JAPANESE]. */ + JA, + + /** The ISO 639-1 code for [Georgian][Language.GEORGIAN]. */ + KA, + + /** The ISO 639-1 code for [Kazakh][Language.KAZAKH]. */ + KK, + + /** The ISO 639-1 code for [Korean][Language.KOREAN]. */ + KO, + + /** The ISO 639-1 code for [Latin][Language.LATIN]. */ + LA, + + /** The ISO 639-1 code for [Ganda][Language.GANDA]. */ + LG, + + /** The ISO 639-1 code for [Lithuanian][Language.LITHUANIAN]. */ + LT, + + /** The ISO 639-1 code for [Latvian][Language.LATVIAN]. */ + LV, + + /** The ISO 639-1 code for [Maori][Language.MAORI]. */ + MI, + + /** The ISO 639-1 code for [Macedonian][Language.MACEDONIAN]. */ + MK, + + /** The ISO 639-1 code for [Mongolian][Language.MONGOLIAN]. */ + MN, + + /** The ISO 639-1 code for [Marathi][Language.MARATHI]. */ + MR, + + /** The ISO 639-1 code for [Malay][Language.MALAY]. */ + MS, + + /** The ISO 639-1 code for [Norwegian Bokmal][Language.BOKMAL]. */ + NB, + + /** The ISO 639-1 code for [Dutch][Language.DUTCH]. */ + NL, + + /** The ISO 639-1 code for [Norwegian Nynorsk][Language.NYNORSK]. */ + NN, + + /** The ISO 639-1 code for [Oromo][Language.OROMO]. */ + OM, + + /** The ISO 639-1 code for [Punjabi][Language.PUNJABI]. */ + PA, + + /** The ISO 639-1 code for [Polish][Language.POLISH]. */ + PL, + + /** The ISO 639-1 code for [Portuguese][Language.PORTUGUESE]. */ + PT, + + /** The ISO 639-1 code for [Romanian][Language.ROMANIAN]. */ + RO, + + /** The ISO 639-1 code for [Russian][Language.RUSSIAN]. */ + RU, + + /** The ISO 639-1 code for [Sinhala][Language.SINHALA]. 
*/ + SI, + + /** The ISO 639-1 code for [Slovak][Language.SLOVAK]. */ + SK, + + /** The ISO 639-1 code for [Slovene][Language.SLOVENE]. */ + SL, + + /** The ISO 639-1 code for [Shona][Language.SHONA]. */ + SN, + + /** The ISO 639-1 code for [Somali][Language.SOMALI]. */ + SO, + + /** The ISO 639-1 code for [Albanian][Language.ALBANIAN]. */ + SQ, + + /** The ISO 639-1 code for [Serbian][Language.SERBIAN]. */ + SR, + + /** The ISO 639-1 code for [Southern Sotho][Language.SOTHO]. */ + ST, + + /** The ISO 639-1 code for [Swedish][Language.SWEDISH]. */ + SV, + + /** The ISO 639-1 code for [Swahili][Language.SWAHILI]. */ + SW, + + /** The ISO 639-1 code for [Tamil][Language.TAMIL]. */ + TA, + + /** The ISO 639-1 code for [Telugu][Language.TELUGU]. */ + TE, + + /** The ISO 639-1 code for [Thai][Language.THAI]. */ + TH, + + /** The ISO 639-1 code for [Tigrinya][Language.TIGRINYA]. */ + TI, + + /** The ISO 639-1 code for [Tagalog][Language.TAGALOG]. */ + TL, + + /** The ISO 639-1 code for [Tswana][Language.TSWANA]. */ + TN, + + /** The ISO 639-1 code for [Turkish][Language.TURKISH]. */ + TR, + + /** The ISO 639-1 code for [Tsonga][Language.TSONGA]. */ + TS, + + /** The ISO 639-1 code for [Ukrainian][Language.UKRAINIAN]. */ + UK, + + /** The ISO 639-1 code for [Urdu][Language.URDU]. */ + UR, + + /** The ISO 639-1 code for [Vietnamese][Language.VIETNAMESE]. */ + VI, + + /** The ISO 639-1 code for [Xhosa][Language.XHOSA]. */ + XH, + + /** The ISO 639-1 code for [Yoruba][Language.YORUBA]. */ + YO, + + /** The ISO 639-1 code for [Chinese][Language.CHINESE]. */ + ZH, + + /** The ISO 639-1 code for [Zulu][Language.ZULU]. */ + ZU, + + /** The ISO 639-1 code for [the imaginary unknown language][Language.UNKNOWN]. 
*/ + NONE; + + @Override + public String toString() { + return name().toLowerCase(java.util.Locale.ROOT); // avoid Turkish dotless-i in codes + } +} diff --git a/src/main/java/com/github/pemistahl/lingua/api/IsoCode639_3.java b/src/main/java/com/github/pemistahl/lingua/api/IsoCode639_3.java new file mode 100644 index 00000000..8bae3e78 --- /dev/null +++ b/src/main/java/com/github/pemistahl/lingua/api/IsoCode639_3.java @@ -0,0 +1,270 @@ +/* + * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.pemistahl.lingua.api; + +/** + * The ISO 639-3 code representations for the supported languages. + * + *

ISO 639 is a standardized nomenclature used to classify languages. + */ +public enum IsoCode639_3 { + + /** The ISO 639-3 code for [Afrikaans][Language.AFRIKAANS]. */ + AFR, + + /** The ISO 639-3 code for [Amharic][Language.AMHARIC]. */ + AMH, + + /** The ISO 639-3 code for [Arabic][Language.ARABIC]. */ + ARA, + + /** The ISO 639-3 code for [Azerbaijani][Language.AZERBAIJANI]. */ + AZE, + + /** The ISO 639-3 code for [Belarusian][Language.BELARUSIAN]. */ + BEL, + + /** The ISO 639-3 code for [Bengali][Language.BENGALI]. */ + BEN, + + /** The ISO 639-3 code for [Bosnian][Language.BOSNIAN]. */ + BOS, + + /** The ISO 639-3 code for [Bulgarian][Language.BULGARIAN]. */ + BUL, + + /** The ISO 639-3 code for [Catalan][Language.CATALAN]. */ + CAT, + + /** The ISO 639-3 code for [Czech][Language.CZECH]. */ + CES, + + /** The ISO 639-3 code for [Welsh][Language.WELSH]. */ + CYM, + + /** The ISO 639-3 code for [Danish][Language.DANISH]. */ + DAN, + + /** The ISO 639-3 code for [German][Language.GERMAN]. */ + DEU, + + /** The ISO 639-3 code for [Greek][Language.GREEK]. */ + ELL, + + /** The ISO 639-3 code for [English][Language.ENGLISH]. */ + ENG, + + /** The ISO 639-3 code for [Esperanto][Language.ESPERANTO]. */ + EPO, + + /** The ISO 639-3 code for [Estonian][Language.ESTONIAN]. */ + EST, + + /** The ISO 639-3 code for [Basque][Language.BASQUE]. */ + EUS, + + /** The ISO 639-3 code for [Persian][Language.PERSIAN]. */ + FAS, + + /** The ISO 639-3 code for [Finnish][Language.FINNISH]. */ + FIN, + + /** The ISO 639-3 code for [French][Language.FRENCH]. */ + FRA, + + /** The ISO 639-3 code for [Irish][Language.IRISH]. */ + GLE, + + /** The ISO 639-3 code for [Gujarati][Language.GUJARATI]. */ + GUJ, + + /** The ISO 639-3 code for [Hebrew][Language.HEBREW]. */ + HEB, + + /** The ISO 639-3 code for [Hindi][Language.HINDI]. */ + HIN, + + /** The ISO 639-3 code for [Croatian][Language.CROATIAN]. */ + HRV, + + /** The ISO 639-3 code for [Hungarian][Language.HUNGARIAN]. 
*/ + HUN, + + /** The ISO 639-3 code for [Armenian][Language.ARMENIAN]. */ + HYE, + + /** The ISO 639-3 code for [Indonesian][Language.INDONESIAN]. */ + IND, + + /** The ISO 639-3 code for [Icelandic][Language.ICELANDIC]. */ + ISL, + + /** The ISO 639-3 code for [Italian][Language.ITALIAN]. */ + ITA, + + /** The ISO 639-3 code for [Japanese][Language.JAPANESE]. */ + JPN, + + /** The ISO 639-3 code for [Georgian][Language.GEORGIAN]. */ + KAT, + + /** The ISO 639-3 code for [Kazakh][Language.KAZAKH]. */ + KAZ, + + /** The ISO 639-3 code for [Korean][Language.KOREAN]. */ + KOR, + + /** The ISO 639-3 code for [Latin][Language.LATIN]. */ + LAT, + + /** The ISO 639-3 code for [Latvian][Language.LATVIAN]. */ + LAV, + + /** The ISO 639-3 code for [Lithuanian][Language.LITHUANIAN]. */ + LIT, + + /** The ISO 639-3 code for [Ganda][Language.GANDA]. */ + LUG, + + /** The ISO 639-3 code for [Marathi][Language.MARATHI]. */ + MAR, + + /** The ISO 639-3 code for [Macedonian][Language.MACEDONIAN]. */ + MKD, + + /** The ISO 639-3 code for [Mongolian][Language.MONGOLIAN]. */ + MON, + + /** The ISO 639-3 code for [Maori][Language.MAORI]. */ + MRI, + + /** The ISO 639-3 code for [Malay][Language.MALAY]. */ + MSA, + + /** The ISO 639-3 code for [Dutch][Language.DUTCH]. */ + NLD, + + /** The ISO 639-3 code for [Norwegian Nynorsk][Language.NYNORSK]. */ + NNO, + + /** The ISO 639-3 code for [Norwegian Bokmal][Language.BOKMAL]. */ + NOB, + + /** The ISO 639-3 code for [Oromo][Language.OROMO]. */ + ORM, + + /** The ISO 639-3 code for [Punjabi][Language.PUNJABI]. */ + PAN, + + /** The ISO 639-3 code for [Polish][Language.POLISH]. */ + POL, + + /** The ISO 639-3 code for [Portuguese][Language.PORTUGUESE]. */ + POR, + + /** The ISO 639-3 code for [Romanian][Language.ROMANIAN]. */ + RON, + + /** The ISO 639-3 code for [Russian][Language.RUSSIAN]. */ + RUS, + + /** The ISO 639-3 code for [Sinhala][Language.SINHALA]. */ + SIN, + + /** The ISO 639-3 code for [Slovak][Language.SLOVAK]. 
*/ + SLK, + + /** The ISO 639-3 code for [Slovene][Language.SLOVENE]. */ + SLV, + + /** The ISO 639-3 code for [Shona][Language.SHONA]. */ + SNA, + + /** The ISO 639-3 code for [Somali][Language.SOMALI]. */ + SOM, + + /** The ISO 639-3 code for [Southern Sotho][Language.SOTHO]. */ + SOT, + + /** The ISO 639-3 code for [Spanish][Language.SPANISH]. */ + SPA, + + /** The ISO 639-3 code for [Albanian][Language.ALBANIAN]. */ + SQI, + + /** The ISO 639-3 code for [Serbian][Language.SERBIAN]. */ + SRP, + + /** The ISO 639-3 code for [Swahili][Language.SWAHILI]. */ + SWA, + + /** The ISO 639-3 code for [Swedish][Language.SWEDISH]. */ + SWE, + + /** The ISO 639-3 code for [Tamil][Language.TAMIL]. */ + TAM, + + /** The ISO 639-3 code for [Telugu][Language.TELUGU]. */ + TEL, + + /** The ISO 639-3 code for [Tagalog][Language.TAGALOG]. */ + TGL, + + /** The ISO 639-3 code for [Thai][Language.THAI]. */ + THA, + + /** The ISO 639-3 code for [Tigrinya][Language.TIGRINYA]. */ + TIR, + + /** The ISO 639-3 code for [Tswana][Language.TSWANA]. */ + TSN, + + /** The ISO 639-3 code for [Tsonga][Language.TSONGA]. */ + TSO, + + /** The ISO 639-3 code for [Turkish][Language.TURKISH]. */ + TUR, + + /** The ISO 639-3 code for [Ukrainian][Language.UKRAINIAN]. */ + UKR, + + /** The ISO 639-3 code for [Urdu][Language.URDU]. */ + URD, + + /** The ISO 639-3 code for [Vietnamese][Language.VIETNAMESE]. */ + VIE, + + /** The ISO 639-3 code for [Xhosa][Language.XHOSA]. */ + XHO, + + /** The ISO 639-3 code for [Yoruba][Language.YORUBA]. */ + YOR, + + /** The ISO 639-3 code for [Chinese][Language.CHINESE]. */ + ZHO, + + /** The ISO 639-3 code for [Zulu][Language.ZULU]. */ + ZUL, + + /** The ISO 639-3 code for [the imaginary unknown language][Language.UNKNOWN]. 
*/ + NONE; + + @Override + public String toString() { + return name().toLowerCase(java.util.Locale.ROOT); // avoid Turkish dotless-i in codes + } +} From d2a22a9ec22582a0aa8df3743aa93bf7752e6e30 Mon Sep 17 00:00:00 2001 From: Alexander Zagniotov Date: Mon, 18 Nov 2024 14:16:22 -0800 Subject: [PATCH 06/11] api pkg: migrated Alphabet --- .../pemistahl/lingua/internal/Alphabet.java | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/Alphabet.java diff --git a/src/main/java/com/github/pemistahl/lingua/internal/Alphabet.java b/src/main/java/com/github/pemistahl/lingua/internal/Alphabet.java new file mode 100644 index 00000000..d257ba88 --- /dev/null +++ b/src/main/java/com/github/pemistahl/lingua/internal/Alphabet.java @@ -0,0 +1,86 @@ +/* + * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.github.pemistahl.lingua.internal; + +import com.github.pemistahl.lingua.api.Language; +import java.lang.Character.UnicodeScript; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +public enum Alphabet { + ARABIC, + ARMENIAN, + BENGALI, + CYRILLIC, + DEVANAGARI, + ETHIOPIC, + GEORGIAN, + GREEK, + GUJARATI, + GURMUKHI, + HAN, + HANGUL, + HEBREW, + HIRAGANA, + KATAKANA, + LATIN, + SINHALA, + TAMIL, + TELUGU, + THAI, + NONE; + + private UnicodeScript script; + + Alphabet() { + try { + this.script = UnicodeScript.forName(this.name()); + } catch (IllegalArgumentException e) { + this.script = null; + } + } + + public boolean matches(char chr) { + return UnicodeScript.of(chr) == this.script; + } + + public boolean matches(CharSequence input) { + return input.codePoints().allMatch(codePoint -> UnicodeScript.of(codePoint) == this.script); + } + + private Set<Language> supportedLanguages() { + return EnumSet.allOf(Language.class).stream() + .filter(language -> language.getAlphabets().contains(this)) + .collect(Collectors.toSet()); + } + + public static Map<Alphabet, Language> allSupportingExactlyOneLanguage() { + Map<Alphabet, Language> alphabets = new HashMap<>(); + for (Alphabet alphabet : values()) { + if (alphabet != NONE) { + Set<Language> supportedLanguages = alphabet.supportedLanguages(); + if (supportedLanguages.size() == 1) { + alphabets.put(alphabet, supportedLanguages.iterator().next()); + } + } + } + return alphabets; + } +} From 62867b379f9e5b855b3867a2af2cf61b81179737 Mon Sep 17 00:00:00 2001 From: Alexander Zagniotov Date: Mon, 18 Nov 2024 14:17:27 -0800 Subject: [PATCH 07/11] api pkg: migrated Language and LanguageTest --- .../github/pemistahl/lingua/api/Language.java | 383 ++++++++++++++ .../pemistahl/lingua/api/LanguageTest.java | 499 ++++++++++++++++++ 2 files changed, 882 insertions(+) create mode 100644 src/main/java/com/github/pemistahl/lingua/api/Language.java create mode 100644 
src/test/java/com/github/pemistahl/lingua/api/LanguageTest.java diff --git a/src/main/java/com/github/pemistahl/lingua/api/Language.java b/src/main/java/com/github/pemistahl/lingua/api/Language.java new file mode 100644 index 00000000..29c0a061 --- /dev/null +++ b/src/main/java/com/github/pemistahl/lingua/api/Language.java @@ -0,0 +1,383 @@ +/* + * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.pemistahl.lingua.api; + +import static com.github.pemistahl.lingua.api.IsoCode639_1.AF; +import static com.github.pemistahl.lingua.api.IsoCode639_1.AM; +import static com.github.pemistahl.lingua.api.IsoCode639_1.AR; +import static com.github.pemistahl.lingua.api.IsoCode639_1.AZ; +import static com.github.pemistahl.lingua.api.IsoCode639_1.BE; +import static com.github.pemistahl.lingua.api.IsoCode639_1.BG; +import static com.github.pemistahl.lingua.api.IsoCode639_1.BN; +import static com.github.pemistahl.lingua.api.IsoCode639_1.BS; +import static com.github.pemistahl.lingua.api.IsoCode639_1.CA; +import static com.github.pemistahl.lingua.api.IsoCode639_1.CS; +import static com.github.pemistahl.lingua.api.IsoCode639_1.CY; +import static com.github.pemistahl.lingua.api.IsoCode639_1.DA; +import static com.github.pemistahl.lingua.api.IsoCode639_1.DE; +import static com.github.pemistahl.lingua.api.IsoCode639_1.EL; +import static com.github.pemistahl.lingua.api.IsoCode639_1.EN; +import 
static com.github.pemistahl.lingua.api.IsoCode639_1.EO; +import static com.github.pemistahl.lingua.api.IsoCode639_1.ES; +import static com.github.pemistahl.lingua.api.IsoCode639_1.ET; +import static com.github.pemistahl.lingua.api.IsoCode639_1.EU; +import static com.github.pemistahl.lingua.api.IsoCode639_1.FA; +import static com.github.pemistahl.lingua.api.IsoCode639_1.FI; +import static com.github.pemistahl.lingua.api.IsoCode639_1.FR; +import static com.github.pemistahl.lingua.api.IsoCode639_1.GA; +import static com.github.pemistahl.lingua.api.IsoCode639_1.GU; +import static com.github.pemistahl.lingua.api.IsoCode639_1.HE; +import static com.github.pemistahl.lingua.api.IsoCode639_1.HI; +import static com.github.pemistahl.lingua.api.IsoCode639_1.HR; +import static com.github.pemistahl.lingua.api.IsoCode639_1.HU; +import static com.github.pemistahl.lingua.api.IsoCode639_1.HY; +import static com.github.pemistahl.lingua.api.IsoCode639_1.ID; +import static com.github.pemistahl.lingua.api.IsoCode639_1.IS; +import static com.github.pemistahl.lingua.api.IsoCode639_1.IT; +import static com.github.pemistahl.lingua.api.IsoCode639_1.JA; +import static com.github.pemistahl.lingua.api.IsoCode639_1.KA; +import static com.github.pemistahl.lingua.api.IsoCode639_1.KK; +import static com.github.pemistahl.lingua.api.IsoCode639_1.KO; +import static com.github.pemistahl.lingua.api.IsoCode639_1.LA; +import static com.github.pemistahl.lingua.api.IsoCode639_1.LG; +import static com.github.pemistahl.lingua.api.IsoCode639_1.LT; +import static com.github.pemistahl.lingua.api.IsoCode639_1.LV; +import static com.github.pemistahl.lingua.api.IsoCode639_1.MI; +import static com.github.pemistahl.lingua.api.IsoCode639_1.MK; +import static com.github.pemistahl.lingua.api.IsoCode639_1.MN; +import static com.github.pemistahl.lingua.api.IsoCode639_1.MR; +import static com.github.pemistahl.lingua.api.IsoCode639_1.MS; +import static com.github.pemistahl.lingua.api.IsoCode639_1.NB; +import static 
com.github.pemistahl.lingua.api.IsoCode639_1.NL; +import static com.github.pemistahl.lingua.api.IsoCode639_1.NN; +import static com.github.pemistahl.lingua.api.IsoCode639_1.OM; +import static com.github.pemistahl.lingua.api.IsoCode639_1.PA; +import static com.github.pemistahl.lingua.api.IsoCode639_1.PL; +import static com.github.pemistahl.lingua.api.IsoCode639_1.PT; +import static com.github.pemistahl.lingua.api.IsoCode639_1.RO; +import static com.github.pemistahl.lingua.api.IsoCode639_1.RU; +import static com.github.pemistahl.lingua.api.IsoCode639_1.SI; +import static com.github.pemistahl.lingua.api.IsoCode639_1.SK; +import static com.github.pemistahl.lingua.api.IsoCode639_1.SL; +import static com.github.pemistahl.lingua.api.IsoCode639_1.SN; +import static com.github.pemistahl.lingua.api.IsoCode639_1.SO; +import static com.github.pemistahl.lingua.api.IsoCode639_1.SQ; +import static com.github.pemistahl.lingua.api.IsoCode639_1.SR; +import static com.github.pemistahl.lingua.api.IsoCode639_1.ST; +import static com.github.pemistahl.lingua.api.IsoCode639_1.SV; +import static com.github.pemistahl.lingua.api.IsoCode639_1.SW; +import static com.github.pemistahl.lingua.api.IsoCode639_1.TA; +import static com.github.pemistahl.lingua.api.IsoCode639_1.TE; +import static com.github.pemistahl.lingua.api.IsoCode639_1.TH; +import static com.github.pemistahl.lingua.api.IsoCode639_1.TI; +import static com.github.pemistahl.lingua.api.IsoCode639_1.TL; +import static com.github.pemistahl.lingua.api.IsoCode639_1.TN; +import static com.github.pemistahl.lingua.api.IsoCode639_1.TR; +import static com.github.pemistahl.lingua.api.IsoCode639_1.TS; +import static com.github.pemistahl.lingua.api.IsoCode639_1.UK; +import static com.github.pemistahl.lingua.api.IsoCode639_1.UR; +import static com.github.pemistahl.lingua.api.IsoCode639_1.VI; +import static com.github.pemistahl.lingua.api.IsoCode639_1.XH; +import static com.github.pemistahl.lingua.api.IsoCode639_1.YO; +import static 
com.github.pemistahl.lingua.api.IsoCode639_1.ZH; +import static com.github.pemistahl.lingua.api.IsoCode639_1.ZU; +import static com.github.pemistahl.lingua.api.IsoCode639_3.AFR; +import static com.github.pemistahl.lingua.api.IsoCode639_3.AMH; +import static com.github.pemistahl.lingua.api.IsoCode639_3.ARA; +import static com.github.pemistahl.lingua.api.IsoCode639_3.AZE; +import static com.github.pemistahl.lingua.api.IsoCode639_3.BEL; +import static com.github.pemistahl.lingua.api.IsoCode639_3.BEN; +import static com.github.pemistahl.lingua.api.IsoCode639_3.BOS; +import static com.github.pemistahl.lingua.api.IsoCode639_3.BUL; +import static com.github.pemistahl.lingua.api.IsoCode639_3.CAT; +import static com.github.pemistahl.lingua.api.IsoCode639_3.CES; +import static com.github.pemistahl.lingua.api.IsoCode639_3.CYM; +import static com.github.pemistahl.lingua.api.IsoCode639_3.DAN; +import static com.github.pemistahl.lingua.api.IsoCode639_3.DEU; +import static com.github.pemistahl.lingua.api.IsoCode639_3.ELL; +import static com.github.pemistahl.lingua.api.IsoCode639_3.ENG; +import static com.github.pemistahl.lingua.api.IsoCode639_3.EPO; +import static com.github.pemistahl.lingua.api.IsoCode639_3.EST; +import static com.github.pemistahl.lingua.api.IsoCode639_3.EUS; +import static com.github.pemistahl.lingua.api.IsoCode639_3.FAS; +import static com.github.pemistahl.lingua.api.IsoCode639_3.FIN; +import static com.github.pemistahl.lingua.api.IsoCode639_3.FRA; +import static com.github.pemistahl.lingua.api.IsoCode639_3.GLE; +import static com.github.pemistahl.lingua.api.IsoCode639_3.GUJ; +import static com.github.pemistahl.lingua.api.IsoCode639_3.HEB; +import static com.github.pemistahl.lingua.api.IsoCode639_3.HIN; +import static com.github.pemistahl.lingua.api.IsoCode639_3.HRV; +import static com.github.pemistahl.lingua.api.IsoCode639_3.HUN; +import static com.github.pemistahl.lingua.api.IsoCode639_3.HYE; +import static com.github.pemistahl.lingua.api.IsoCode639_3.IND; 
+import static com.github.pemistahl.lingua.api.IsoCode639_3.ISL; +import static com.github.pemistahl.lingua.api.IsoCode639_3.ITA; +import static com.github.pemistahl.lingua.api.IsoCode639_3.JPN; +import static com.github.pemistahl.lingua.api.IsoCode639_3.KAT; +import static com.github.pemistahl.lingua.api.IsoCode639_3.KAZ; +import static com.github.pemistahl.lingua.api.IsoCode639_3.KOR; +import static com.github.pemistahl.lingua.api.IsoCode639_3.LAT; +import static com.github.pemistahl.lingua.api.IsoCode639_3.LAV; +import static com.github.pemistahl.lingua.api.IsoCode639_3.LIT; +import static com.github.pemistahl.lingua.api.IsoCode639_3.LUG; +import static com.github.pemistahl.lingua.api.IsoCode639_3.MAR; +import static com.github.pemistahl.lingua.api.IsoCode639_3.MKD; +import static com.github.pemistahl.lingua.api.IsoCode639_3.MON; +import static com.github.pemistahl.lingua.api.IsoCode639_3.MRI; +import static com.github.pemistahl.lingua.api.IsoCode639_3.MSA; +import static com.github.pemistahl.lingua.api.IsoCode639_3.NLD; +import static com.github.pemistahl.lingua.api.IsoCode639_3.NNO; +import static com.github.pemistahl.lingua.api.IsoCode639_3.NOB; +import static com.github.pemistahl.lingua.api.IsoCode639_3.ORM; +import static com.github.pemistahl.lingua.api.IsoCode639_3.PAN; +import static com.github.pemistahl.lingua.api.IsoCode639_3.POL; +import static com.github.pemistahl.lingua.api.IsoCode639_3.POR; +import static com.github.pemistahl.lingua.api.IsoCode639_3.RON; +import static com.github.pemistahl.lingua.api.IsoCode639_3.RUS; +import static com.github.pemistahl.lingua.api.IsoCode639_3.SIN; +import static com.github.pemistahl.lingua.api.IsoCode639_3.SLK; +import static com.github.pemistahl.lingua.api.IsoCode639_3.SLV; +import static com.github.pemistahl.lingua.api.IsoCode639_3.SNA; +import static com.github.pemistahl.lingua.api.IsoCode639_3.SOM; +import static com.github.pemistahl.lingua.api.IsoCode639_3.SOT; +import static 
com.github.pemistahl.lingua.api.IsoCode639_3.SPA; +import static com.github.pemistahl.lingua.api.IsoCode639_3.SQI; +import static com.github.pemistahl.lingua.api.IsoCode639_3.SRP; +import static com.github.pemistahl.lingua.api.IsoCode639_3.SWA; +import static com.github.pemistahl.lingua.api.IsoCode639_3.SWE; +import static com.github.pemistahl.lingua.api.IsoCode639_3.TAM; +import static com.github.pemistahl.lingua.api.IsoCode639_3.TEL; +import static com.github.pemistahl.lingua.api.IsoCode639_3.TGL; +import static com.github.pemistahl.lingua.api.IsoCode639_3.THA; +import static com.github.pemistahl.lingua.api.IsoCode639_3.TIR; +import static com.github.pemistahl.lingua.api.IsoCode639_3.TSN; +import static com.github.pemistahl.lingua.api.IsoCode639_3.TSO; +import static com.github.pemistahl.lingua.api.IsoCode639_3.TUR; +import static com.github.pemistahl.lingua.api.IsoCode639_3.UKR; +import static com.github.pemistahl.lingua.api.IsoCode639_3.URD; +import static com.github.pemistahl.lingua.api.IsoCode639_3.VIE; +import static com.github.pemistahl.lingua.api.IsoCode639_3.XHO; +import static com.github.pemistahl.lingua.api.IsoCode639_3.YOR; +import static com.github.pemistahl.lingua.api.IsoCode639_3.ZHO; +import static com.github.pemistahl.lingua.api.IsoCode639_3.ZUL; +import static com.github.pemistahl.lingua.internal.Alphabet.CYRILLIC; +import static com.github.pemistahl.lingua.internal.Alphabet.DEVANAGARI; +import static com.github.pemistahl.lingua.internal.Alphabet.GURMUKHI; +import static com.github.pemistahl.lingua.internal.Alphabet.HAN; +import static com.github.pemistahl.lingua.internal.Alphabet.HANGUL; +import static com.github.pemistahl.lingua.internal.Alphabet.HIRAGANA; +import static com.github.pemistahl.lingua.internal.Alphabet.KATAKANA; +import static com.github.pemistahl.lingua.internal.Alphabet.NONE; +import static com.github.pemistahl.lingua.internal.util.extension.EnumExtensions.enumSetOf; + +import com.github.pemistahl.lingua.internal.Alphabet; 
+import java.util.Arrays;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.Optional;
+import java.util.stream.Collectors;
+
+/**
+ * The supported detectable languages.
+ *
+ * <p>Each constant carries its ISO 639-1 and ISO 639-3 codes, the alphabets it is written in and
+ * an optional string of characters associated only with that language.
+ */
+public enum Language {
+  AFRIKAANS(AF, AFR, enumSetOf(Alphabet.LATIN), null),
+  ALBANIAN(SQ, SQI, enumSetOf(Alphabet.LATIN), null),
+  AMHARIC(AM, AMH, enumSetOf(Alphabet.ETHIOPIC), null),
+  ARABIC(AR, ARA, enumSetOf(Alphabet.ARABIC), null),
+  ARMENIAN(HY, HYE, enumSetOf(Alphabet.ARMENIAN), null),
+  AZERBAIJANI(AZ, AZE, enumSetOf(Alphabet.LATIN), "Əə"),
+  BASQUE(EU, EUS, enumSetOf(Alphabet.LATIN), null),
+  BELARUSIAN(BE, BEL, enumSetOf(CYRILLIC), null),
+  BENGALI(BN, BEN, enumSetOf(Alphabet.BENGALI), null),
+  BOKMAL(NB, NOB, enumSetOf(Alphabet.LATIN), null),
+  BOSNIAN(BS, BOS, enumSetOf(Alphabet.LATIN), null),
+  BULGARIAN(BG, BUL, enumSetOf(CYRILLIC), null),
+  CATALAN(CA, CAT, enumSetOf(Alphabet.LATIN), "Ïï"),
+  CHINESE(ZH, ZHO, enumSetOf(HAN), null),
+  CROATIAN(HR, HRV, enumSetOf(Alphabet.LATIN), null),
+  CZECH(CS, CES, enumSetOf(Alphabet.LATIN), "ĚěŘřŮů"),
+  DANISH(DA, DAN, enumSetOf(Alphabet.LATIN), null),
+  DUTCH(NL, NLD, enumSetOf(Alphabet.LATIN), null),
+  ENGLISH(EN, ENG, enumSetOf(Alphabet.LATIN), null),
+  ESPERANTO(EO, EPO, enumSetOf(Alphabet.LATIN), "ĈĉĜĝĤĥĴĵŜŝŬŭ"),
+  ESTONIAN(ET, EST, enumSetOf(Alphabet.LATIN), null),
+  FINNISH(FI, FIN, enumSetOf(Alphabet.LATIN), null),
+  FRENCH(FR, FRA, enumSetOf(Alphabet.LATIN), null),
+  GANDA(LG, LUG, enumSetOf(Alphabet.LATIN), null),
+  GEORGIAN(KA, KAT, enumSetOf(Alphabet.GEORGIAN), null),
+  GERMAN(DE, DEU, enumSetOf(Alphabet.LATIN), "ß"),
+  GREEK(EL, ELL, enumSetOf(Alphabet.GREEK), null),
+  GUJARATI(GU, GUJ, enumSetOf(Alphabet.GUJARATI), null),
+  HEBREW(HE, HEB, enumSetOf(Alphabet.HEBREW), null),
+  HINDI(HI, HIN, enumSetOf(DEVANAGARI), null),
+  HUNGARIAN(HU, HUN, enumSetOf(Alphabet.LATIN), "ŐőŰű"),
+  ICELANDIC(IS, ISL, enumSetOf(Alphabet.LATIN), null),
+  INDONESIAN(ID, IND, enumSetOf(Alphabet.LATIN), null),
+  IRISH(GA, GLE, enumSetOf(Alphabet.LATIN), null),
+  ITALIAN(IT, ITA, enumSetOf(Alphabet.LATIN), null),
+  JAPANESE(JA, JPN, enumSetOf(HIRAGANA, KATAKANA, HAN), null),
+  KAZAKH(KK, KAZ, enumSetOf(CYRILLIC), "ӘәҒғҚқҢңҰұ"),
+  KOREAN(KO, KOR, enumSetOf(HANGUL), null),
+  LATIN(LA, LAT, enumSetOf(Alphabet.LATIN), null),
+  LATVIAN(LV, LAV, enumSetOf(Alphabet.LATIN), "ĢģĶķĻļŅņ"),
+  LITHUANIAN(LT, LIT, enumSetOf(Alphabet.LATIN), "ĖėĮįŲų"),
+  MACEDONIAN(MK, MKD, enumSetOf(CYRILLIC), "ЃѓЅѕЌќЏџ"),
+  MALAY(MS, MSA, enumSetOf(Alphabet.LATIN), null),
+  MAORI(MI, MRI, enumSetOf(Alphabet.LATIN), null),
+  MARATHI(MR, MAR, enumSetOf(DEVANAGARI), "ळ"),
+  MONGOLIAN(MN, MON, enumSetOf(CYRILLIC), "ӨөҮү"),
+  NYNORSK(NN, NNO, enumSetOf(Alphabet.LATIN), null),
+  OROMO(OM, ORM, enumSetOf(Alphabet.LATIN), null),
+  PERSIAN(FA, FAS, enumSetOf(Alphabet.ARABIC), null),
+  POLISH(PL, POL, enumSetOf(Alphabet.LATIN), "ŁłŃńŚśŹź"),
+  PORTUGUESE(PT, POR, enumSetOf(Alphabet.LATIN), null),
+  PUNJABI(PA, PAN, enumSetOf(GURMUKHI), null),
+  ROMANIAN(RO, RON, enumSetOf(Alphabet.LATIN), "Țţ"),
+  RUSSIAN(RU, RUS, enumSetOf(CYRILLIC), null),
+  SERBIAN(SR, SRP, enumSetOf(CYRILLIC), "ЂђЋћ"),
+  SHONA(SN, SNA, enumSetOf(Alphabet.LATIN), null),
+  SINHALA(SI, SIN, enumSetOf(Alphabet.SINHALA), null),
+  SLOVAK(SK, SLK, enumSetOf(Alphabet.LATIN), "Ĺ弾Ŕŕ"),
+  SLOVENE(SL, SLV, enumSetOf(Alphabet.LATIN), null),
+  SOMALI(SO, SOM, enumSetOf(Alphabet.LATIN), null),
+  SOTHO(ST, SOT, enumSetOf(Alphabet.LATIN), null),
+  SPANISH(ES, SPA, enumSetOf(Alphabet.LATIN), "¿¡"),
+  SWAHILI(SW, SWA, enumSetOf(Alphabet.LATIN), null),
+  SWEDISH(SV, SWE, enumSetOf(Alphabet.LATIN), null),
+  TAGALOG(TL, TGL, enumSetOf(Alphabet.LATIN), null),
+  TAMIL(TA, TAM, enumSetOf(Alphabet.TAMIL), null),
+  TELUGU(TE, TEL, enumSetOf(Alphabet.TELUGU), null),
+  THAI(TH, THA, enumSetOf(Alphabet.THAI), null),
+  TIGRINYA(TI, TIR, enumSetOf(Alphabet.ETHIOPIC), null),
+  TSONGA(TS, TSO, enumSetOf(Alphabet.LATIN), null),
+  TSWANA(TN, TSN, enumSetOf(Alphabet.LATIN), null),
+  TURKISH(TR, TUR, enumSetOf(Alphabet.LATIN), null),
+  UKRAINIAN(UK, UKR, enumSetOf(CYRILLIC), "ҐґЄєЇї"),
+  URDU(UR, URD, enumSetOf(Alphabet.ARABIC), null),
+  VIETNAMESE(
+      VI,
+      VIE,
+      enumSetOf(Alphabet.LATIN),
+      "ẰằẦầẲẳẨẩẴẵẪẫẮắẤấẠạẶặẬậỀềẺẻỂểẼẽỄễẾếỆệỈỉĨĩỊịƠơỒồỜờỎỏỔổỞởỖỗỠỡỐốỚớỘộỢợƯưỪừỦủỬửŨũỮữỨứỤụỰựỲỳỶỷỸỹỴỵ"),
+  WELSH(CY, CYM, enumSetOf(Alphabet.LATIN), null),
+  XHOSA(XH, XHO, enumSetOf(Alphabet.LATIN), null),
+  // TODO for YORUBA: "E̩e̩Ẹ́ẹ́É̩é̩Ẹ̀ẹ̀È̩è̩Ẹ̄ẹ̄Ē̩ē̩ŌōO̩o̩Ọ́ọ́Ó̩ó̩Ọ̀ọ̀Ò̩ò̩Ọ̄ọ̄Ō̩ō̩ṢṣS̩s̩"
+  YORUBA(YO, YOR, enumSetOf(Alphabet.LATIN), "Ṣṣ"),
+  ZULU(ZU, ZUL, enumSetOf(Alphabet.LATIN), null),
+
+  /**
+   * The imaginary unknown language.
+   *
+   * <p>This value is returned if no language can be detected reliably.
+   */
+  UNKNOWN(IsoCode639_1.NONE, IsoCode639_3.NONE, enumSetOf(NONE), null);
+
+  private final IsoCode639_1 isoCode639_1;
+  private final IsoCode639_3 isoCode639_3;
+  private final EnumSet<Alphabet> alphabets;
+  /** Characters tied to this language only, or {@code null} when none are declared. */
+  private final String uniqueCharacters;
+
+  Language(
+      final IsoCode639_1 isoCode639_1,
+      final IsoCode639_3 isoCode639_3,
+      final EnumSet<Alphabet> alphabets,
+      final String uniqueCharacters) {
+    this.isoCode639_1 = isoCode639_1;
+    this.isoCode639_3 = isoCode639_3;
+    this.alphabets = alphabets;
+    this.uniqueCharacters = uniqueCharacters;
+  }
+
+  /** Returns the ISO 639-1 code of this language. */
+  public IsoCode639_1 getIsoCode639_1() {
+    return isoCode639_1;
+  }
+
+  /** Returns the ISO 639-3 code of this language. */
+  public IsoCode639_3 getIsoCode639_3() {
+    return isoCode639_3;
+  }
+
+  /**
+   * Returns the alphabets this language is written in.
+   *
+   * @return a fresh copy, so callers cannot modify the set held by the enum constant
+   */
+  public EnumSet<Alphabet> getAlphabets() {
+    // EnumSet is mutable; hand out a copy to keep the constant's own set intact.
+    return EnumSet.copyOf(this.alphabets);
+  }
+
+  /**
+   * Returns the characters declared as unique to this language.
+   *
+   * @return the declared characters, or an empty string if none are declared; never {@code null}
+   */
+  public String getUniqueCharacters() {
+    // Strings are immutable, so no copy is needed; only the absent (null) case is normalized.
+    return Optional.ofNullable(uniqueCharacters).orElse("");
+  }
+
+  /** Returns all languages in declaration order, excluding {@link #UNKNOWN}. */
+  public static List<Language> all() {
+    return filterOutLanguages(UNKNOWN);
+  }
+
+  /** Returns all languages in declaration order, excluding {@link #UNKNOWN} and {@link #LATIN}. */
+  public static List<Language> allSpokenOnes() {
+    return filterOutLanguages(UNKNOWN, LATIN);
+  }
+
+  /** Returns all languages whose alphabets contain {@link Alphabet#ARABIC}. */
+  public static List<Language> allWithArabicScript() {
+    return Arrays.stream(values())
+        .filter(language -> language.alphabets.contains(Alphabet.ARABIC))
+        .collect(Collectors.toList());
+  }
+
+  /** Returns all languages whose alphabets contain {@link Alphabet#CYRILLIC}. */
+  public static List<Language> allWithCyrillicScript() {
+    return Arrays.stream(values())
+        .filter(language -> language.alphabets.contains(CYRILLIC))
+        .collect(Collectors.toList());
+  }
+
+  /** Returns all languages whose alphabets contain {@link Alphabet#DEVANAGARI}. */
+  public static List<Language> allWithDevanagariScript() {
+    return Arrays.stream(values())
+        .filter(language -> language.alphabets.contains(DEVANAGARI))
+        .collect(Collectors.toList());
+  }
+
+  /** Returns all languages whose alphabets contain {@link Alphabet#ETHIOPIC}. */
+  public static List<Language> allWithEthiopicScript() {
+    return Arrays.stream(values())
+        .filter(language -> language.alphabets.contains(Alphabet.ETHIOPIC))
+        .collect(Collectors.toList());
+  }
+
+  /** Returns all languages whose alphabets contain {@link Alphabet#LATIN}. */
+  public static List<Language> allWithLatinScript() {
+    return Arrays.stream(values())
+        .filter(language -> language.alphabets.contains(Alphabet.LATIN))
+        .collect(Collectors.toList());
+  }
+
+  /**
+   * Looks up the language carrying the given ISO 639-1 code.
+   *
+   * @param isoCode the code to look up
+   * @return the first language declared with that code
+   * @throws IllegalArgumentException if no language carries the code
+   */
+  public static Language getByIsoCode639_1(IsoCode639_1 isoCode) {
+    return Arrays.stream(values())
+        .filter(language -> language.isoCode639_1 == isoCode)
+        .findFirst()
+        .orElseThrow(
+            () ->
+                new IllegalArgumentException("No language found with ISO code 639-1: " + isoCode));
+  }
+
+  /**
+   * Looks up the language carrying the given ISO 639-3 code.
+   *
+   * @param isoCode the code to look up
+   * @return the first language declared with that code
+   * @throws IllegalArgumentException if no language carries the code
+   */
+  public static Language getByIsoCode639_3(IsoCode639_3 isoCode) {
+    return Arrays.stream(values())
+        .filter(language -> language.isoCode639_3 == isoCode)
+        .findFirst()
+        .orElseThrow(
+            () ->
+                new IllegalArgumentException("No language found with ISO code 639-3: " + isoCode));
+  }
+
+  /** Returns all constants in declaration order except the given ones. */
+  private static List<Language> filterOutLanguages(Language... languages) {
+    // Build the exclusion list once instead of once per stream element.
+    final List<Language> excluded = Arrays.asList(languages);
+    return Arrays.stream(values())
+        .filter(language -> !excluded.contains(language))
+        .collect(Collectors.toList());
+  }
+}
diff --git a/src/test/java/com/github/pemistahl/lingua/api/LanguageTest.java b/src/test/java/com/github/pemistahl/lingua/api/LanguageTest.java
new file mode 100644
index 00000000..8254027a
--- /dev/null
+++ b/src/test/java/com/github/pemistahl/lingua/api/LanguageTest.java
@@ -0,0 +1,499 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.api;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.jupiter.params.provider.Arguments.arguments;
+
+import com.github.pemistahl.lingua.internal.Alphabet;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.CsvSource;
+import org.junit.jupiter.params.provider.MethodSource;
+
+/**
+ * This class contains various tests related to supported languages and their corresponding scripts.
+ *
+ * <p>Author: Peter M. Stahl<br>
+ * Migration to Java from Kotlin by Alexander Zagniotov
+ */
+public class LanguageTest {
+
+  /** Asserts that all supported languages are available. */
+  @Test
+  public void assertThatAllSupportedLanguagesAreAvailable() {
+    assertThat(Language.all())
+        .containsExactly(
+            Language.AFRIKAANS,
+            Language.ALBANIAN,
+            Language.AMHARIC,
+            Language.ARABIC,
+            Language.ARMENIAN,
+            Language.AZERBAIJANI,
+            Language.BASQUE,
+            Language.BELARUSIAN,
+            Language.BENGALI,
+            Language.BOKMAL,
+            Language.BOSNIAN,
+            Language.BULGARIAN,
+            Language.CATALAN,
+            Language.CHINESE,
+            Language.CROATIAN,
+            Language.CZECH,
+            Language.DANISH,
+            Language.DUTCH,
+            Language.ENGLISH,
+            Language.ESPERANTO,
+            Language.ESTONIAN,
+            Language.FINNISH,
+            Language.FRENCH,
+            Language.GANDA,
+            Language.GEORGIAN,
+            Language.GERMAN,
+            Language.GREEK,
+            Language.GUJARATI,
+            Language.HEBREW,
+            Language.HINDI,
+            Language.HUNGARIAN,
+            Language.ICELANDIC,
+            Language.INDONESIAN,
+            Language.IRISH,
+            Language.ITALIAN,
+            Language.JAPANESE,
+            Language.KAZAKH,
+            Language.KOREAN,
+            Language.LATIN,
+            Language.LATVIAN,
+            Language.LITHUANIAN,
+            Language.MACEDONIAN,
+            Language.MALAY,
+            Language.MAORI,
+            Language.MARATHI,
+            Language.MONGOLIAN,
+            Language.NYNORSK,
+            Language.OROMO,
+            Language.PERSIAN,
+            Language.POLISH,
+            Language.PORTUGUESE,
+            Language.PUNJABI,
+            Language.ROMANIAN,
+            Language.RUSSIAN,
+            Language.SERBIAN,
+            Language.SHONA,
+            Language.SINHALA,
+            Language.SLOVAK,
+            Language.SLOVENE,
+            Language.SOMALI,
+            Language.SOTHO,
+            Language.SPANISH,
+            Language.SWAHILI,
+            Language.SWEDISH,
+            Language.TAGALOG,
+            Language.TAMIL,
+            Language.TELUGU,
+            Language.THAI,
+            Language.TIGRINYA,
+            Language.TSONGA,
+            Language.TSWANA,
+            Language.TURKISH,
+            Language.UKRAINIAN,
+            Language.URDU,
+            Language.VIETNAMESE,
+            Language.WELSH,
+            Language.XHOSA,
+            Language.YORUBA,
+            Language.ZULU);
+  }
+
+  /** Asserts that all supported spoken languages are available. */
+  @Test
+  public void assertThatAllSupportedSpokenLanguagesAreAvailable() {
+    assertThat(Language.allSpokenOnes())
+        .containsExactly(
+            Language.AFRIKAANS,
+            Language.ALBANIAN,
+            Language.AMHARIC,
+            Language.ARABIC,
+            Language.ARMENIAN,
+            Language.AZERBAIJANI,
+            Language.BASQUE,
+            Language.BELARUSIAN,
+            Language.BENGALI,
+            Language.BOKMAL,
+            Language.BOSNIAN,
+            Language.BULGARIAN,
+            Language.CATALAN,
+            Language.CHINESE,
+            Language.CROATIAN,
+            Language.CZECH,
+            Language.DANISH,
+            Language.DUTCH,
+            Language.ENGLISH,
+            Language.ESPERANTO,
+            Language.ESTONIAN,
+            Language.FINNISH,
+            Language.FRENCH,
+            Language.GANDA,
+            Language.GEORGIAN,
+            Language.GERMAN,
+            Language.GREEK,
+            Language.GUJARATI,
+            Language.HEBREW,
+            Language.HINDI,
+            Language.HUNGARIAN,
+            Language.ICELANDIC,
+            Language.INDONESIAN,
+            Language.IRISH,
+            Language.ITALIAN,
+            Language.JAPANESE,
+            Language.KAZAKH,
+            Language.KOREAN,
+            Language.LATVIAN,
+            Language.LITHUANIAN,
+            Language.MACEDONIAN,
+            Language.MALAY,
+            Language.MAORI,
+            Language.MARATHI,
+            Language.MONGOLIAN,
+            Language.NYNORSK,
+            Language.OROMO,
+            Language.PERSIAN,
+            Language.POLISH,
+            Language.PORTUGUESE,
+            Language.PUNJABI,
+            Language.ROMANIAN,
+            Language.RUSSIAN,
+            Language.SERBIAN,
+            Language.SHONA,
+            Language.SINHALA,
+            Language.SLOVAK,
+            Language.SLOVENE,
+            Language.SOMALI,
+            Language.SOTHO,
+            Language.SPANISH,
+            Language.SWAHILI,
+            Language.SWEDISH,
+            Language.TAGALOG,
+            Language.TAMIL,
+            Language.TELUGU,
+            Language.THAI,
+            Language.TIGRINYA,
+            Language.TSONGA,
+            Language.TSWANA,
+            Language.TURKISH,
+            Language.UKRAINIAN,
+            Language.URDU,
+            Language.VIETNAMESE,
+            Language.WELSH,
+            Language.XHOSA,
+            Language.YORUBA,
+            Language.ZULU);
+  }
+
+  /** Asserts that certain languages support Arabic script. */
+  @Test
+  public void assertThatCertainLanguagesSupportArabicScript() {
+    assertThat(Language.allWithArabicScript())
+        .containsExactly(Language.ARABIC, Language.PERSIAN, Language.URDU);
+  }
+
+  /** Asserts that certain languages support Cyrillic script. */
+  @Test
+  public void assertThatCertainLanguagesSupportCyrillicScript() {
+    assertThat(Language.allWithCyrillicScript())
+        .containsExactly(
+            Language.BELARUSIAN,
+            Language.BULGARIAN,
+            Language.KAZAKH,
+            Language.MACEDONIAN,
+            Language.MONGOLIAN,
+            Language.RUSSIAN,
+            Language.SERBIAN,
+            Language.UKRAINIAN);
+  }
+
+  /** Asserts that certain languages support Devanagari script. */
+  @Test
+  public void assertThatCertainLanguagesSupportDevanagariScript() {
+    assertThat(Language.allWithDevanagariScript())
+        .containsExactly(Language.HINDI, Language.MARATHI);
+  }
+
+  /** Asserts that certain languages support Ethiopic script. */
+  @Test
+  public void assertThatCertainLanguagesSupportEthiopicScript() {
+    assertThat(Language.allWithEthiopicScript())
+        .containsExactly(Language.AMHARIC, Language.TIGRINYA);
+  }
+
+  /** Asserts that certain languages support Latin script. */
+  @Test
+  public void assertThatCertainLanguagesSupportLatinScript() {
+    assertThat(Language.allWithLatinScript())
+        .containsExactly(
+            Language.AFRIKAANS,
+            Language.ALBANIAN,
+            Language.AZERBAIJANI,
+            Language.BASQUE,
+            Language.BOKMAL,
+            Language.BOSNIAN,
+            Language.CATALAN,
+            Language.CROATIAN,
+            Language.CZECH,
+            Language.DANISH,
+            Language.DUTCH,
+            Language.ENGLISH,
+            Language.ESPERANTO,
+            Language.ESTONIAN,
+            Language.FINNISH,
+            Language.FRENCH,
+            Language.GANDA,
+            Language.GERMAN,
+            Language.HUNGARIAN,
+            Language.ICELANDIC,
+            Language.INDONESIAN,
+            Language.IRISH,
+            Language.ITALIAN,
+            Language.LATIN,
+            Language.LATVIAN,
+            Language.LITHUANIAN,
+            Language.MALAY,
+            Language.MAORI,
+            Language.NYNORSK,
+            Language.OROMO,
+            Language.POLISH,
+            Language.PORTUGUESE,
+            Language.ROMANIAN,
+            Language.SHONA,
+            Language.SLOVAK,
+            Language.SLOVENE,
+            Language.SOMALI,
+            Language.SOTHO,
+            Language.SPANISH,
+            Language.SWAHILI,
+            Language.SWEDISH,
+            Language.TAGALOG,
+            Language.TSONGA,
+            Language.TSWANA,
+            Language.TURKISH,
+            Language.VIETNAMESE,
+            Language.WELSH,
+            Language.XHOSA,
+            Language.YORUBA,
+            Language.ZULU);
+  }
+
+  /**
+   * Asserts that, for each alphabet, exactly the expected languages declare it, in declaration
+   * order. Cases are supplied by {@link #filteredLanguagesProvider()}.
+   */
+  @ParameterizedTest
+  @MethodSource("filteredLanguagesProvider")
+  public void assertThatLanguagesSupportCorrectAlphabets(
+      Alphabet alphabet, List<Language> expectedLanguages) {
+    final List<Language> actualLanguages =
+        Stream.of(Language.values())
+            .filter(language -> language.getAlphabets().contains(alphabet))
+            .collect(Collectors.toList());
+
+    assertThat(actualLanguages)
+        .as("alphabet '%s'", alphabet)
+        .containsExactlyElementsOf(expectedLanguages);
+  }
+
+  /** Asserts that the correct language is returned for the given ISO 639-1 code. */
+  @ParameterizedTest
+  @CsvSource({
+    "AF, AFRIKAANS",
+    "SQ, ALBANIAN",
+    "AM, AMHARIC",
+    "AR, ARABIC",
+    "HY, ARMENIAN",
+    "AZ, AZERBAIJANI",
+    "EU, BASQUE",
+    "BE, BELARUSIAN",
+    "BN, BENGALI",
+    "NB, BOKMAL",
+    "BS, BOSNIAN",
+    "BG, BULGARIAN",
+    "CA, CATALAN",
+    "ZH, CHINESE",
+    "HR, CROATIAN",
+    "CS, CZECH",
+    "DA, DANISH",
+    "NL, DUTCH",
+    "EN, ENGLISH",
+    "EO, ESPERANTO",
+    "ET, ESTONIAN",
+    "FI, FINNISH",
+    "FR, FRENCH",
+    "LG, GANDA",
+    "KA, GEORGIAN",
+    "DE, GERMAN",
+    "EL, GREEK",
+    "GU, GUJARATI",
+    "HE, HEBREW",
+    "HI, HINDI",
+    "HU, HUNGARIAN",
+    "IS, ICELANDIC",
+    "ID, INDONESIAN",
+    "GA, IRISH",
+    "IT, ITALIAN",
+    "JA, JAPANESE",
+    "KK, KAZAKH",
+    "KO, KOREAN",
+    "LA, LATIN",
+    "LV, LATVIAN",
+    "LT, LITHUANIAN",
+    "MK, MACEDONIAN",
+    "MS, MALAY",
+    "MI, MAORI",
+    "MR, MARATHI",
+    "MN, MONGOLIAN",
+    "NN, NYNORSK",
+    "OM, OROMO",
+    "FA, PERSIAN",
+    "PL, POLISH",
+    "PT, PORTUGUESE",
+    "PA, PUNJABI",
+    "RO, ROMANIAN",
+    "RU, RUSSIAN",
+    "SR, SERBIAN",
+    "SN, SHONA",
+    "SI, SINHALA",
+    "SK, SLOVAK",
+    "SL, SLOVENE",
+    "SO, SOMALI",
+    "ST, SOTHO",
+    "ES, SPANISH",
+    "SW, SWAHILI",
+    "SV, SWEDISH",
+    "TL, TAGALOG",
+    "TA, TAMIL",
+    "TE, TELUGU",
+    "TH, THAI",
+    "TI, TIGRINYA",
+    "TS, TSONGA",
+    "TN, TSWANA",
+    "TR, TURKISH",
+    "UK, UKRAINIAN",
+    "UR, URDU",
+    "VI, VIETNAMESE",
+    "CY, WELSH",
+    "XH, XHOSA",
+    "YO, YORUBA",
+    "ZU, ZULU"
+  })
+  public void assertThatLanguageIsReturnedForIso6391Code(
+      String isoCode, Language expectedLanguage) {
+    assertThat(Language.getByIsoCode639_1(IsoCode639_1.valueOf(isoCode)))
+        .isEqualTo(expectedLanguage);
+  }
+
+  /**
+   * Provides a filtered list of languages categorized by their alphabet script. This method is used
+   * for parameterized tests where the alphabet and corresponding languages are passed as arguments.
+   *
+   * <p>Lists are built with {@code Arrays.asList} rather than {@code List.of} because the build
+   * targets Java 8, where {@code List.of} does not exist.
+   *
+   * @return A stream of arguments containing an alphabet and a list of languages using it.
+   */
+  public static Stream<Arguments> filteredLanguagesProvider() {
+    return Stream.of(
+        arguments(Alphabet.ARABIC, Arrays.asList(Language.ARABIC, Language.PERSIAN, Language.URDU)),
+        arguments(Alphabet.ARMENIAN, Arrays.asList(Language.ARMENIAN)),
+        arguments(Alphabet.BENGALI, Arrays.asList(Language.BENGALI)),
+        arguments(
+            Alphabet.CYRILLIC,
+            Arrays.asList(
+                Language.BELARUSIAN,
+                Language.BULGARIAN,
+                Language.KAZAKH,
+                Language.MACEDONIAN,
+                Language.MONGOLIAN,
+                Language.RUSSIAN,
+                Language.SERBIAN,
+                Language.UKRAINIAN)),
+        arguments(Alphabet.DEVANAGARI, Arrays.asList(Language.HINDI, Language.MARATHI)),
+        arguments(Alphabet.ETHIOPIC, Arrays.asList(Language.AMHARIC, Language.TIGRINYA)),
+        arguments(Alphabet.GEORGIAN, Arrays.asList(Language.GEORGIAN)),
+        arguments(Alphabet.GREEK, Arrays.asList(Language.GREEK)),
+        arguments(Alphabet.GUJARATI, Arrays.asList(Language.GUJARATI)),
+        arguments(Alphabet.GURMUKHI, Arrays.asList(Language.PUNJABI)),
+        arguments(Alphabet.HAN, Arrays.asList(Language.CHINESE, Language.JAPANESE)),
+        arguments(Alphabet.HANGUL, Arrays.asList(Language.KOREAN)),
+        arguments(Alphabet.HEBREW, Arrays.asList(Language.HEBREW)),
+        arguments(Alphabet.HIRAGANA, Arrays.asList(Language.JAPANESE)),
+        arguments(Alphabet.KATAKANA, Arrays.asList(Language.JAPANESE)),
+        arguments(
+            Alphabet.LATIN,
+            Arrays.asList(
+                Language.AFRIKAANS,
+                Language.ALBANIAN,
+                Language.AZERBAIJANI,
+                Language.BASQUE,
+                Language.BOKMAL,
+                Language.BOSNIAN,
+                Language.CATALAN,
+                Language.CROATIAN,
+                Language.CZECH,
+                Language.DANISH,
+                Language.DUTCH,
+                Language.ENGLISH,
+                Language.ESPERANTO,
+                Language.ESTONIAN,
+                Language.FINNISH,
+                Language.FRENCH,
+                Language.GANDA,
+                Language.GERMAN,
+                Language.HUNGARIAN,
+                Language.ICELANDIC,
+                Language.INDONESIAN,
+                Language.IRISH,
+                Language.ITALIAN,
+                Language.LATIN,
+                Language.LATVIAN,
+                Language.LITHUANIAN,
+                Language.MALAY,
+                Language.MAORI,
+                Language.NYNORSK,
+                Language.OROMO,
+                Language.POLISH,
+                Language.PORTUGUESE,
+                Language.ROMANIAN,
+                Language.SHONA,
+                Language.SLOVAK,
+                Language.SLOVENE,
+                Language.SOMALI,
+                Language.SOTHO,
+                Language.SPANISH,
+                Language.SWAHILI,
+                Language.SWEDISH,
+                Language.TAGALOG,
+                Language.TSONGA,
+                Language.TSWANA,
+                Language.TURKISH,
+                Language.VIETNAMESE,
+                Language.WELSH,
+                Language.XHOSA,
+                Language.YORUBA,
+                Language.ZULU)),
+        arguments(Alphabet.SINHALA, Arrays.asList(Language.SINHALA)),
+        arguments(Alphabet.TAMIL, Arrays.asList(Language.TAMIL)),
+        arguments(Alphabet.TELUGU, Arrays.asList(Language.TELUGU)),
+        arguments(Alphabet.THAI, Arrays.asList(Language.THAI)),
+        arguments(Alphabet.NONE, Arrays.asList(Language.UNKNOWN)));
+  }
+}
From c814c353157398f3a896c94d0c487537df99c57a Mon Sep 17 00:00:00 2001
From: Alexander Zagniotov
Date: Mon, 18 Nov 2024 14:18:01 -0800
Subject: [PATCH 08/11] internal pkg: migrated Constant

---
 .../pemistahl/lingua/internal/Constant.java | 221 ++++++++++++++++++
 1 file changed, 221 insertions(+)
 create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/Constant.java

diff --git a/src/main/java/com/github/pemistahl/lingua/internal/Constant.java b/src/main/java/com/github/pemistahl/lingua/internal/Constant.java
new file mode 100644
index 00000000..18417cd2
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/internal/Constant.java
@@ -0,0 +1,221 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package com.github.pemistahl.lingua.internal; + +import com.github.pemistahl.lingua.api.Language; +import java.lang.Character.UnicodeScript; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; + +public class Constant { + + public static final Map> CHARS_TO_LANGUAGES_MAPPING; + + public static boolean isJapaneseAlphabet(char charValue) { + UnicodeScript script = UnicodeScript.of(charValue); + return script == UnicodeScript.HIRAGANA + || script == UnicodeScript.KATAKANA + || script == UnicodeScript.HAN; + } + + public static final EnumSet LANGUAGES_SUPPORTING_LOGOGRAMS = + EnumSet.of(Language.CHINESE, Language.JAPANESE, Language.KOREAN); + + public static final Pattern MULTIPLE_WHITESPACE = Pattern.compile("\\s+"); + public static final Pattern NO_LETTER = Pattern.compile("^[^\\p{L}]+$"); + public static final Pattern NUMBERS = Pattern.compile("\\p{N}"); + public static final Pattern PUNCTUATION = Pattern.compile("\\p{P}"); + + static { + CHARS_TO_LANGUAGES_MAPPING = new HashMap<>(50); + CHARS_TO_LANGUAGES_MAPPING.put("Ãã", EnumSet.of(Language.PORTUGUESE, Language.VIETNAMESE)); + CHARS_TO_LANGUAGES_MAPPING.put("ĄąĘę", EnumSet.of(Language.LITHUANIAN, Language.POLISH)); + CHARS_TO_LANGUAGES_MAPPING.put("Żż", EnumSet.of(Language.POLISH, Language.ROMANIAN)); + CHARS_TO_LANGUAGES_MAPPING.put("Îî", EnumSet.of(Language.FRENCH, Language.ROMANIAN)); + CHARS_TO_LANGUAGES_MAPPING.put("Ññ", EnumSet.of(Language.BASQUE, Language.SPANISH)); + CHARS_TO_LANGUAGES_MAPPING.put("ŇňŤť", EnumSet.of(Language.CZECH, Language.SLOVAK)); + CHARS_TO_LANGUAGES_MAPPING.put("Ăă", EnumSet.of(Language.ROMANIAN, Language.VIETNAMESE)); + CHARS_TO_LANGUAGES_MAPPING.put("İıĞğ", EnumSet.of(Language.AZERBAIJANI, Language.TURKISH)); + CHARS_TO_LANGUAGES_MAPPING.put("ЈјЉљЊњ", EnumSet.of(Language.MACEDONIAN, Language.SERBIAN)); + CHARS_TO_LANGUAGES_MAPPING.put("ẸẹỌọ", EnumSet.of(Language.VIETNAMESE, Language.YORUBA)); + 
CHARS_TO_LANGUAGES_MAPPING.put("ÐðÞþ", EnumSet.of(Language.ICELANDIC, Language.TURKISH)); + CHARS_TO_LANGUAGES_MAPPING.put("Ûû", EnumSet.of(Language.FRENCH, Language.HUNGARIAN)); + CHARS_TO_LANGUAGES_MAPPING.put("Ōō", EnumSet.of(Language.MAORI, Language.YORUBA)); + CHARS_TO_LANGUAGES_MAPPING.put( + "ĀāĒēĪī", EnumSet.of(Language.LATVIAN, Language.MAORI, Language.YORUBA)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Şş", EnumSet.of(Language.AZERBAIJANI, Language.ROMANIAN, Language.TURKISH)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Ďď", EnumSet.of(Language.CZECH, Language.ROMANIAN, Language.SLOVAK)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Ćć", EnumSet.of(Language.BOSNIAN, Language.CROATIAN, Language.POLISH)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Đđ", EnumSet.of(Language.BOSNIAN, Language.CROATIAN, Language.VIETNAMESE)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Іі", EnumSet.of(Language.BELARUSIAN, Language.KAZAKH, Language.UKRAINIAN)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Ìì", EnumSet.of(Language.ITALIAN, Language.VIETNAMESE, Language.YORUBA)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Øø", EnumSet.of(Language.BOKMAL, Language.DANISH, Language.NYNORSK)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Ūū", EnumSet.of(Language.LATVIAN, Language.LITHUANIAN, Language.MAORI, Language.YORUBA)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Ëë", EnumSet.of(Language.AFRIKAANS, Language.ALBANIAN, Language.DUTCH, Language.FRENCH)); + CHARS_TO_LANGUAGES_MAPPING.put( + "ÈèÙù", + EnumSet.of(Language.FRENCH, Language.ITALIAN, Language.VIETNAMESE, Language.YORUBA)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Êê", + EnumSet.of(Language.AFRIKAANS, Language.FRENCH, Language.PORTUGUESE, Language.VIETNAMESE)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Õõ", + EnumSet.of( + Language.ESTONIAN, Language.HUNGARIAN, Language.PORTUGUESE, Language.VIETNAMESE)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Ôô", + EnumSet.of(Language.FRENCH, Language.PORTUGUESE, Language.SLOVAK, Language.VIETNAMESE)); + CHARS_TO_LANGUAGES_MAPPING.put( + "ЁёЫыЭэ", + 
EnumSet.of(Language.BELARUSIAN, Language.KAZAKH, Language.MONGOLIAN, Language.RUSSIAN)); + CHARS_TO_LANGUAGES_MAPPING.put( + "ЩщЪъ", + EnumSet.of(Language.BULGARIAN, Language.KAZAKH, Language.MONGOLIAN, Language.RUSSIAN)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Òò", EnumSet.of(Language.CATALAN, Language.ITALIAN, Language.VIETNAMESE, Language.YORUBA)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Ææ", EnumSet.of(Language.BOKMAL, Language.DANISH, Language.ICELANDIC, Language.NYNORSK)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Åå", EnumSet.of(Language.BOKMAL, Language.DANISH, Language.NYNORSK, Language.SWEDISH)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Ýý", + EnumSet.of( + Language.CZECH, + Language.ICELANDIC, + Language.SLOVAK, + Language.TURKISH, + Language.VIETNAMESE)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Ää", + EnumSet.of( + Language.ESTONIAN, + Language.FINNISH, + Language.GERMAN, + Language.SLOVAK, + Language.SWEDISH)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Àà", + EnumSet.of( + Language.CATALAN, + Language.FRENCH, + Language.ITALIAN, + Language.PORTUGUESE, + Language.VIETNAMESE)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Ââ", + EnumSet.of( + Language.FRENCH, + Language.PORTUGUESE, + Language.ROMANIAN, + Language.TURKISH, + Language.VIETNAMESE)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Üü", + EnumSet.of( + Language.AZERBAIJANI, + Language.CATALAN, + Language.ESTONIAN, + Language.GERMAN, + Language.HUNGARIAN, + Language.SPANISH, + Language.TURKISH)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Č芚Žž", + EnumSet.of( + Language.BOSNIAN, + Language.CZECH, + Language.CROATIAN, + Language.LATVIAN, + Language.LITHUANIAN, + Language.SLOVAK, + Language.SLOVENE)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Çç", + EnumSet.of( + Language.ALBANIAN, + Language.AZERBAIJANI, + Language.BASQUE, + Language.CATALAN, + Language.FRENCH, + Language.PORTUGUESE, + Language.TURKISH)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Öö", + EnumSet.of( + Language.AZERBAIJANI, + Language.ESTONIAN, + Language.FINNISH, + Language.GERMAN, + 
Language.HUNGARIAN, + Language.ICELANDIC, + Language.SWEDISH, + Language.TURKISH)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Óó", + EnumSet.of( + Language.CATALAN, + Language.HUNGARIAN, + Language.ICELANDIC, + Language.IRISH, + Language.POLISH, + Language.PORTUGUESE, + Language.SLOVAK, + Language.SPANISH, + Language.VIETNAMESE, + Language.YORUBA)); + CHARS_TO_LANGUAGES_MAPPING.put( + "ÁáÍíÚú", + EnumSet.of( + Language.CATALAN, + Language.CZECH, + Language.ICELANDIC, + Language.IRISH, + Language.HUNGARIAN, + Language.PORTUGUESE, + Language.SLOVAK, + Language.SPANISH, + Language.VIETNAMESE, + Language.YORUBA)); + CHARS_TO_LANGUAGES_MAPPING.put( + "Éé", + EnumSet.of( + Language.CATALAN, + Language.CZECH, + Language.FRENCH, + Language.HUNGARIAN, + Language.ICELANDIC, + Language.IRISH, + Language.ITALIAN, + Language.PORTUGUESE, + Language.SLOVAK, + Language.SPANISH, + Language.VIETNAMESE, + Language.YORUBA)); + } +} From d8a3536d3cfb131e78584f9760a905743d7ee484 Mon Sep 17 00:00:00 2001 From: Alexander Zagniotov Date: Mon, 18 Nov 2024 14:18:26 -0800 Subject: [PATCH 09/11] internal pkg: migrated TestDataLanguageModel and Test --- .../internal/TestDataLanguageModel.java | 73 +++++ .../internal/TestDataLanguageModelTest.java | 278 ++++++++++++++++++ 2 files changed, 351 insertions(+) create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/TestDataLanguageModel.java create mode 100644 src/test/java/com/github/pemistahl/lingua/internal/TestDataLanguageModelTest.java diff --git a/src/main/java/com/github/pemistahl/lingua/internal/TestDataLanguageModel.java b/src/main/java/com/github/pemistahl/lingua/internal/TestDataLanguageModel.java new file mode 100644 index 00000000..fdf0f1a0 --- /dev/null +++ b/src/main/java/com/github/pemistahl/lingua/internal/TestDataLanguageModel.java @@ -0,0 +1,73 @@ +/* + * Copyright © 2018-today Peter M. 
Stahl pemistahl@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.pemistahl.lingua.internal; + +import java.util.HashSet; +import java.util.Set; +import java.util.regex.Pattern; + +/** + * A data class representing a language model built from n-grams. + * + *

This class contains a set of n-grams (sequences of characters) and provides functionality for + * creating a language model from a given text. + * + * @author Peter M. Stahl pemistahl@gmail.com + * @author Migration by Alexander Zagniotov azagniotov@gmail.com + */ +public class TestDataLanguageModel { + + private static final Pattern LETTER_PATTERN = Pattern.compile("\\p{L}+"); + + private final Set ngrams; + + public TestDataLanguageModel(final Set ngrams) { + this.ngrams = ngrams; + } + + public Set getNgrams() { + return ngrams; + } + + /** + * Creates a TestDataLanguageModel from the provided text and ngram length. + * + *

The ngram length must be between 1 and 5 inclusive. The method extracts n-grams of the + * specified length from the input text, ensuring that the extracted n-grams only contain + * alphabetic characters. + * + * @param text the input text from which to generate n-grams + * @param ngramLength the length of each n-gram + * @return a TestDataLanguageModel object containing the extracted n-grams + * @throws IllegalArgumentException if the ngramLength is not between 1 and 5 inclusive + */ + public static TestDataLanguageModel fromText(final String text, final int ngramLength) { + if (ngramLength < 1 || ngramLength > 5) { + throw new IllegalArgumentException("ngram length " + ngramLength + " is not in range 1..5"); + } + + Set ngrams = new HashSet<>(); + for (int idx = 0; idx <= text.length() - ngramLength; idx++) { + String textSlice = text.substring(idx, idx + ngramLength); + if (LETTER_PATTERN.matcher(textSlice).matches()) { + ngrams.add(new Ngram(textSlice)); + } + } + + return new TestDataLanguageModel(ngrams); + } +} diff --git a/src/test/java/com/github/pemistahl/lingua/internal/TestDataLanguageModelTest.java b/src/test/java/com/github/pemistahl/lingua/internal/TestDataLanguageModelTest.java new file mode 100644 index 00000000..3f138c2d --- /dev/null +++ b/src/test/java/com/github/pemistahl/lingua/internal/TestDataLanguageModelTest.java @@ -0,0 +1,278 @@ +/* + * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 
* See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.pemistahl.lingua.internal;

import static org.assertj.core.api.Assertions.assertThat;

import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import org.junit.jupiter.api.Test;

/**
 * Unit tests for the {@link TestDataLanguageModel} class.
 *
 * <p>This test class contains tests that verify the correctness of the {@link
 * TestDataLanguageModel} when created with different ngram lengths.
 *
 * @author Peter M. Stahl pemistahl@gmail.com
 * @author Migration by Alexander Zagniotov azagniotov@gmail.com
 */
public class TestDataLanguageModelTest {

  // Be very careful to not auto-format or line break the text, the tests will fail.
  // Locale.ROOT keeps the fixture independent of the JVM's default locale.
  private final String text =
      ("These sentences are intended for testing purposes. "
              + "Do not use them in production! "
              + "By the way, they consist of 23 words in total.")
          .toLowerCase(Locale.ROOT)
          .trim();

  /** Wraps each given string in an {@link Ngram} and collects the results into a set. */
  private static Set<Ngram> ngramsOf(final String... values) {
    final Set<Ngram> result = new HashSet<>();
    for (final String value : values) {
      result.add(new Ngram(value));
    }
    return result;
  }

  @Test
  void assertThatUnigramLanguageModelCanBeCreatedFromTestData() {
    TestDataLanguageModel model = TestDataLanguageModel.fromText(text, 1);

    Set<Ngram> expected =
        ngramsOf(
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "l", "m", "n", "o", "p", "r", "s", "t",
            "u", "w", "y");

    assertThat(model.getNgrams()).containsExactlyInAnyOrderElementsOf(expected);
  }

  @Test
  void assertThatBigramLanguageModelCanBeCreatedFromTestData() {
    TestDataLanguageModel model = TestDataLanguageModel.fromText(text, 2);

    Set<Ngram> expected =
        ngramsOf(
            "de", "pr", "pu", "do", "uc", "ds", "du", "ur", "us", "ed", "in", "io", "em", "en",
            "is", "al", "es", "ar", "rd", "re", "ey", "nc", "nd", "ay", "ng", "ro", "rp", "no",
            "ns", "nt", "fo", "wa", "se", "od", "si", "by", "of", "wo", "on", "st", "ce", "or",
            "os", "ot", "co", "ta", "te", "ct", "th", "ti", "to", "he", "po");

    assertThat(model.getNgrams()).containsExactlyInAnyOrderElementsOf(expected);
  }

  @Test
  void assertThatTrigramLanguageModelCanBeCreatedFromTestData() {
    TestDataLanguageModel model = TestDataLanguageModel.fromText(text, 3);

    Set<Ngram> expected =
        ngramsOf(
            "rds", "ose", "ded", "con", "use", "est", "ion", "ist", "pur", "hem", "hes", "tin",
            "cti", "tio", "wor", "ten", "hey", "ota", "tal", "tes", "uct", "sti", "pro", "odu",
            "nsi", "rod", "for", "ces", "nce", "not", "are", "pos", "tot", "end", "enc", "sis",
            "sen", "nte", "ses", "ord", "ing", "ent", "int", "nde", "way", "the", "rpo", "urp",
            "duc", "ons", "ese");

    assertThat(model.getNgrams()).containsExactlyInAnyOrderElementsOf(expected);
  }

  @Test
  void assertThatQuadrigramLanguageModelCanBeCreatedFromTestData() {
    TestDataLanguageModel model = TestDataLanguageModel.fromText(text, 4);

    Set<Ngram> expected =
        ngramsOf(
            "onsi", "sist", "ende", "ords", "esti", "tenc", "nces", "oduc", "tend", "thes",
            "rpos", "ting", "nten", "nsis", "they", "tota", "cons", "tion", "prod", "ence",
            "test", "otal", "pose", "nded", "oses", "inte", "urpo", "them", "sent", "duct",
            "stin", "ente", "ucti", "purp", "ctio", "rodu", "word", "hese");

    assertThat(model.getNgrams()).containsExactlyInAnyOrderElementsOf(expected);
  }

  @Test
  void assertThatFivegramLanguageModelCanBeCreatedFromTestData() {
    TestDataLanguageModel model = TestDataLanguageModel.fromText(text, 5);

    Set<Ngram> expected =
        ngramsOf(
            "testi", "sente", "ences", "tende", "these", "ntenc", "ducti", "ntend", "onsis",
            "total", "uctio", "enten", "poses", "ction", "produ", "inten", "nsist", "words",
            "sting", "tence", "purpo", "estin", "roduc", "urpos", "ended", "rpose", "oduct",
            "consi");

    assertThat(model.getNgrams()).containsExactlyInAnyOrderElementsOf(expected);
  }
}
From b03ca803eb039d45cae2eac5b5ff53f3c963d441 Mon Sep 17 00:00:00 2001 From: Alexander Zagniotov Date: Mon, 18 Nov 2024 14:18:46 -0800 Subject: [PATCH 10/11] internal pkg: migrated TrainingDataLanguageModel and Test --- .../internal/TrainingDataLanguageModel.java | 245 +++++++ .../TrainingDataLanguageModelTest.java | 640 ++++++++++++++++++ 2 files changed, 885 insertions(+) create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/TrainingDataLanguageModel.java create mode 100644 src/test/java/com/github/pemistahl/lingua/internal/TrainingDataLanguageModelTest.java diff --git a/src/main/java/com/github/pemistahl/lingua/internal/TrainingDataLanguageModel.java b/src/main/java/com/github/pemistahl/lingua/internal/TrainingDataLanguageModel.java new file mode 100644 index 00000000..f18214f0 --- /dev/null +++ b/src/main/java/com/github/pemistahl/lingua/internal/TrainingDataLanguageModel.java @@ -0,0 +1,245 @@ +/* + * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.pemistahl.lingua.internal; + +import static com.github.pemistahl.lingua.internal.util.extension.MapExtensions.incrementCounter; + +import com.github.pemistahl.lingua.api.Language; +import com.squareup.moshi.JsonAdapter; +import com.squareup.moshi.JsonReader; +import com.squareup.moshi.Moshi; +import com.squareup.moshi.kotlin.reflect.KotlinJsonAdapterFactory; +import it.unimi.dsi.fastutil.objects.Object2FloatMap; +import it.unimi.dsi.fastutil.objects.Object2FloatOpenHashMap; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import okio.Okio; + +/** + * This class represents a training data language model. It contains methods to generate a language + * model from training data, convert it to JSON format, and read it from JSON. + * + * @author Peter M. 
Stahl + * @author Migration to Java from Kotlin by Alexander Zagniotov + */ +public class TrainingDataLanguageModel { + + private static final String LANGUAGE_NAME = "language"; + private static final String NGRAMS_NAME = "ngrams"; + + private static final JsonAdapter JSON_ADAPTER = + new Moshi.Builder() + .add(new FractionAdapter()) + .addLast(new KotlinJsonAdapterFactory()) + .build() + .adapter(JsonLanguageModel.class); + + private final Language language; + private final Map absoluteFrequencies; + private final Map relativeFrequencies; + + public TrainingDataLanguageModel( + final Language language, + final Map absoluteFrequencies, + final Map relativeFrequencies) { + this.language = language; + this.absoluteFrequencies = absoluteFrequencies; + this.relativeFrequencies = relativeFrequencies; + } + + public Language getLanguage() { + return language; + } + + public Map getAbsoluteFrequencies() { + return absoluteFrequencies; + } + + public Map getRelativeFrequencies() { + return relativeFrequencies; + } + + /** + * Converts this language model to a JSON string. + * + * @return The JSON representation of the language model. + */ + public String toJson() { + + Map> allNgrams = new HashMap<>(); + for (Map.Entry entry : relativeFrequencies.entrySet()) { + allNgrams.computeIfAbsent(entry.getValue(), k -> new ArrayList<>()).add(entry.getKey()); + } + + Map ngrams = + allNgrams.entrySet().stream() + .collect( + Collectors.toMap( + Map.Entry::getKey, + entry -> + entry.getValue().stream() + .map(Ngram::getValue) + .collect(Collectors.joining(" ")))); + + return JSON_ADAPTER.toJson(new JsonLanguageModel(language, ngrams)); + } + + /** + * Creates a training data language model from a sequence of text. + * + * @param text The sequence of text to analyze. + * @param language The language of the model. + * @param ngramLength The length of the n-grams. + * @param charClass A string representing the set of valid characters for n-grams. 
+ * @param lowerNgramAbsoluteFrequencies Frequencies of lower n-grams. + * @return A TrainingDataLanguageModel object. + */ + public static TrainingDataLanguageModel fromText( + final Iterable text, + final Language language, + final int ngramLength, + final String charClass, + final Map lowerNgramAbsoluteFrequencies) { + + if (ngramLength < 1 || ngramLength > 5) { + throw new IllegalArgumentException("ngram length " + ngramLength + " is not in range 1..5"); + } + + Map absoluteFrequencies = + computeAbsoluteFrequencies(text, ngramLength, charClass); + Map relativeFrequencies = + computeRelativeFrequencies(ngramLength, absoluteFrequencies, lowerNgramAbsoluteFrequencies); + + return new TrainingDataLanguageModel(language, absoluteFrequencies, relativeFrequencies); + } + + /** + * Reads a JSON representation of a language model from an InputStream. + * + * @param json The InputStream containing the JSON data. + * @return A map of n-gram frequencies. + * @throws java.io.IOException If there is an error reading the InputStream. 
+ */ + public static Object2FloatMap fromJson(final InputStream json) + throws java.io.IOException { + try (final JsonReader reader = JsonReader.of(Okio.buffer(Okio.source(json)))) { + Object2FloatOpenHashMap frequencies = new Object2FloatOpenHashMap<>(); + reader.beginObject(); + + while (reader.hasNext()) { + String name = reader.nextName(); + if (name.equals(LANGUAGE_NAME)) { + reader.skipValue(); + } else if (name.equals(NGRAMS_NAME)) { + reader.beginObject(); + while (reader.hasNext()) { + String[] parts = reader.nextName().split("/"); + float numerator = Float.parseFloat(parts[0]); + int denominator = Integer.parseInt(parts[1]); + float frequency = numerator / denominator; + + String ngrams = reader.nextString(); + for (String ngram : ngrams.split(" ")) { + frequencies.put(ngram, frequency); + } + } + reader.endObject(); + } else { + throw new AssertionError("Unexpected name in language model JSON"); + } + } + + reader.endObject(); + + // Rehashes the map, making the table as small as possible. 
+ // Trim to reduce in-memory model size + frequencies.trim(); + return frequencies; + } + } + + private static Map computeAbsoluteFrequencies( + final Iterable text, final int ngramLength, final String charClass) { + + Map absoluteFrequencies = new HashMap<>(); + String regex = "[" + charClass + "]+"; + + for (String line : text) { + String lowerCasedLine = line.toLowerCase(); + for (int idx = 0; idx <= lowerCasedLine.length() - ngramLength; idx++) { + String textSlice = lowerCasedLine.substring(idx, idx + ngramLength); + if (textSlice.matches(regex)) { + Ngram ngram = new Ngram(textSlice); + incrementCounter(absoluteFrequencies, ngram); + } + } + } + + return absoluteFrequencies; + } + + private static Map computeRelativeFrequencies( + int ngramLength, + Map absoluteFrequencies, + Map lowerNgramAbsoluteFrequencies) { + + Map ngramProbabilities = new HashMap<>(); + int totalNgramFrequency = + absoluteFrequencies.values().stream().mapToInt(Integer::intValue).sum(); + + for (Map.Entry entry : absoluteFrequencies.entrySet()) { + Ngram ngram = entry.getKey(); + int frequency = entry.getValue(); + int denominator = + (ngramLength == 1 || lowerNgramAbsoluteFrequencies.isEmpty()) + ? totalNgramFrequency + : lowerNgramAbsoluteFrequencies.getOrDefault( + new Ngram(ngram.getValue().substring(0, ngramLength - 1)), 0); + + ngramProbabilities.put(ngram, new Fraction(frequency, denominator)); + } + + return ngramProbabilities; + } + + /** + * A class that represents a language model in JSON format. It holds the language and the n-grams + * as fractions. 
+ */ + private static class JsonLanguageModel { + + private final Language language; + private final Map ngrams; + + public JsonLanguageModel(final Language language, final Map ngrams) { + this.language = language; + this.ngrams = ngrams; + } + + public Language getLanguage() { + return language; + } + + public Map getNgrams() { + return ngrams; + } + } +} diff --git a/src/test/java/com/github/pemistahl/lingua/internal/TrainingDataLanguageModelTest.java b/src/test/java/com/github/pemistahl/lingua/internal/TrainingDataLanguageModelTest.java new file mode 100644 index 00000000..188bc01d --- /dev/null +++ b/src/test/java/com/github/pemistahl/lingua/internal/TrainingDataLanguageModelTest.java @@ -0,0 +1,640 @@ +/* + * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.pemistahl.lingua.internal; + +import static org.assertj.core.api.Assertions.assertThat; + +import com.github.pemistahl.lingua.api.Language; +import it.unimi.dsi.fastutil.objects.Object2FloatMap; +import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.function.Function; +import java.util.stream.Collectors; +import org.junit.jupiter.api.Test; + +/** + * Unit tests for the TrainingDataLanguageModel class. + * + *

These tests ensure that the language models are correctly generated, serialized, and + * deserialized. + * + * @author Peter M. Stahl + * @author Migration to Java from Kotlin by Alexander Zagniotov + */ +public class TrainingDataLanguageModelTest { + // Be very careful to not auto-format or line break the text, the tests will fail. + private static final String TEXT = + ("These sentences are intended for testing purposes. " + + "Do not use them in production! " + + "By the way, they consist of 23 words in total.") + .toLowerCase() + .trim(); + + private static final Iterable ITERABLE_OF_STRINGS = + new ArrayList<>(Arrays.asList(TEXT.split("\n"))); + + private static final Function, Ngram> KEY_MAPPER = + entry -> new Ngram(entry.getKey()); + + private static final Function, Fraction> VALUE_MAPPER = + entry -> { + final String[] parts = entry.getValue().split("/"); + return new Fraction(Integer.parseInt(parts[0]), Integer.parseInt(parts[1])); + }; + + private final String expectedUnigramLanguageModel = + ("{\n" + + " \"language\":\"ENGLISH\",\n" + + " \"ngrams\":{\n" + + " \"13/100\":\"t\",\n" + + " \"1/25\":\"h\",\n" + + " \"7/50\":\"e\",\n" + + " \"1/10\":\"s n o\",\n" + + " \"3/100\":\"c a p u y\",\n" + + " \"1/20\":\"r d\",\n" + + " \"3/50\":\"i\",\n" + + " \"1/50\":\"f w\",\n" + + " \"1/100\":\"g m b l\"\n" + + " }\n" + + " }") + .replaceAll("\n\\s*", ""); + + private final Map expectedUnigramAbsoluteFrequencies = + Map.ofEntries( + Map.entry("a", 3), + Map.entry("b", 1), + Map.entry("c", 3), + Map.entry("d", 5), + Map.entry("e", 14), + Map.entry("f", 2), + Map.entry("g", 1), + Map.entry("h", 4), + Map.entry("i", 6), + Map.entry("l", 1), + Map.entry("m", 1), + Map.entry("n", 10), + Map.entry("o", 10), + Map.entry("p", 3), + Map.entry("r", 5), + Map.entry("s", 10), + Map.entry("t", 13), + Map.entry("u", 3), + Map.entry("w", 2), + Map.entry("y", 3)) + .entrySet().stream() + .collect(Collectors.toMap(KEY_MAPPER, Map.Entry::getValue)); + + private final Map 
expectedUnigramRelativeFrequencies = + Map.ofEntries( + Map.entry("a", "3/100"), + Map.entry("b", "1/100"), + Map.entry("c", "3/100"), + Map.entry("d", "1/20"), + Map.entry("e", "7/50"), + Map.entry("f", "1/50"), + Map.entry("g", "1/100"), + Map.entry("h", "1/25"), + Map.entry("i", "3/50"), + Map.entry("l", "1/100"), + Map.entry("m", "1/100"), + Map.entry("n", "1/10"), + Map.entry("o", "1/10"), + Map.entry("p", "3/100"), + Map.entry("r", "1/20"), + Map.entry("s", "1/10"), + Map.entry("t", "13/100"), + Map.entry("u", "3/100"), + Map.entry("w", "1/50"), + Map.entry("y", "3/100")) + .entrySet().stream() + .collect( + Collectors.toMap( + KEY_MAPPER, + entry -> + VALUE_MAPPER.apply(Map.entry(new Ngram(entry.getKey()), entry.getValue())))); + + private final Map expectedBigramAbsoluteFrequencies = + Map.ofEntries( + Map.entry("de", 1), + Map.entry("pr", 1), + Map.entry("pu", 1), + Map.entry("do", 1), + Map.entry("uc", 1), + Map.entry("ds", 1), + Map.entry("du", 1), + Map.entry("ur", 1), + Map.entry("us", 1), + Map.entry("ed", 1), + Map.entry("in", 4), + Map.entry("io", 1), + Map.entry("em", 1), + Map.entry("en", 3), + Map.entry("is", 1), + Map.entry("al", 1), + Map.entry("es", 4), + Map.entry("ar", 1), + Map.entry("rd", 1), + Map.entry("re", 1), + Map.entry("ey", 1), + Map.entry("nc", 1), + Map.entry("nd", 1), + Map.entry("ay", 1), + Map.entry("ng", 1), + Map.entry("ro", 1), + Map.entry("rp", 1), + Map.entry("no", 1), + Map.entry("ns", 1), + Map.entry("nt", 2), + Map.entry("fo", 1), + Map.entry("wa", 1), + Map.entry("se", 4), + Map.entry("od", 1), + Map.entry("si", 1), + Map.entry("of", 1), + Map.entry("by", 1), + Map.entry("wo", 1), + Map.entry("on", 2), + Map.entry("st", 2), + Map.entry("ce", 1), + Map.entry("or", 2), + Map.entry("os", 1), + Map.entry("ot", 2), + Map.entry("co", 1), + Map.entry("ta", 1), + Map.entry("ct", 1), + Map.entry("te", 3), + Map.entry("th", 4), + Map.entry("ti", 2), + Map.entry("to", 1), + Map.entry("he", 4), + Map.entry("po", 1)) + 
.entrySet().stream() + .collect(Collectors.toMap(KEY_MAPPER, Map.Entry::getValue)); + + private final Map expectedBigramRelativeFrequencies = + Map.ofEntries( + Map.entry("de", "1/5"), + Map.entry("pr", "1/3"), + Map.entry("pu", "1/3"), + Map.entry("do", "1/5"), + Map.entry("uc", "1/3"), + Map.entry("ds", "1/5"), + Map.entry("du", "1/5"), + Map.entry("ur", "1/3"), + Map.entry("us", "1/3"), + Map.entry("ed", "1/14"), + Map.entry("in", "2/3"), + Map.entry("io", "1/6"), + Map.entry("em", "1/14"), + Map.entry("en", "3/14"), + Map.entry("is", "1/6"), + Map.entry("al", "1/3"), + Map.entry("es", "2/7"), + Map.entry("ar", "1/3"), + Map.entry("rd", "1/5"), + Map.entry("re", "1/5"), + Map.entry("ey", "1/14"), + Map.entry("nc", "1/10"), + Map.entry("nd", "1/10"), + Map.entry("ay", "1/3"), + Map.entry("ng", "1/10"), + Map.entry("ro", "1/5"), + Map.entry("rp", "1/5"), + Map.entry("no", "1/10"), + Map.entry("ns", "1/10"), + Map.entry("nt", "1/5"), + Map.entry("fo", "1/2"), + Map.entry("wa", "1/2"), + Map.entry("se", "2/5"), + Map.entry("od", "1/10"), + Map.entry("si", "1/10"), + Map.entry("of", "1/10"), + Map.entry("by", "1/1"), + Map.entry("wo", "1/2"), + Map.entry("on", "1/5"), + Map.entry("st", "1/5"), + Map.entry("ce", "1/3"), + Map.entry("or", "1/5"), + Map.entry("os", "1/10"), + Map.entry("ot", "1/5"), + Map.entry("co", "1/3"), + Map.entry("ta", "1/13"), + Map.entry("ct", "1/3"), + Map.entry("te", "3/13"), + Map.entry("th", "4/13"), + Map.entry("ti", "2/13"), + Map.entry("to", "1/13"), + Map.entry("he", "1/1"), + Map.entry("po", "1/3")) + .entrySet().stream() + .collect( + Collectors.toMap( + KEY_MAPPER, + entry -> + VALUE_MAPPER.apply(Map.entry(new Ngram(entry.getKey()), entry.getValue())))); + + private final Map expectedTrigramAbsoluteFrequencies = + Map.ofEntries( + Map.entry("rds", 1), + Map.entry("ose", 1), + Map.entry("ded", 1), + Map.entry("con", 1), + Map.entry("use", 1), + Map.entry("est", 1), + Map.entry("ion", 1), + Map.entry("ist", 1), + Map.entry("pur", 1), + 
Map.entry("hem", 1), + Map.entry("hes", 1), + Map.entry("tin", 1), + Map.entry("cti", 1), + Map.entry("wor", 1), + Map.entry("tio", 1), + Map.entry("ten", 2), + Map.entry("ota", 1), + Map.entry("hey", 1), + Map.entry("tal", 1), + Map.entry("tes", 1), + Map.entry("uct", 1), + Map.entry("sti", 1), + Map.entry("pro", 1), + Map.entry("odu", 1), + Map.entry("nsi", 1), + Map.entry("rod", 1), + Map.entry("for", 1), + Map.entry("ces", 1), + Map.entry("nce", 1), + Map.entry("not", 1), + Map.entry("pos", 1), + Map.entry("are", 1), + Map.entry("tot", 1), + Map.entry("end", 1), + Map.entry("enc", 1), + Map.entry("sis", 1), + Map.entry("sen", 1), + Map.entry("nte", 2), + Map.entry("ord", 1), + Map.entry("ses", 1), + Map.entry("ing", 1), + Map.entry("ent", 1), + Map.entry("way", 1), + Map.entry("nde", 1), + Map.entry("int", 1), + Map.entry("rpo", 1), + Map.entry("the", 4), + Map.entry("urp", 1), + Map.entry("duc", 1), + Map.entry("ons", 1), + Map.entry("ese", 1)) + .entrySet().stream() + .collect(Collectors.toMap(KEY_MAPPER, Map.Entry::getValue)); + + private final Map expectedTrigramRelativeFrequencies = + Map.ofEntries( + Map.entry("rds", "1/1"), + Map.entry("ose", "1/1"), + Map.entry("ded", "1/1"), + Map.entry("con", "1/1"), + Map.entry("use", "1/1"), + Map.entry("est", "1/4"), + Map.entry("ion", "1/1"), + Map.entry("ist", "1/1"), + Map.entry("pur", "1/1"), + Map.entry("hem", "1/4"), + Map.entry("hes", "1/4"), + Map.entry("tin", "1/2"), + Map.entry("cti", "1/1"), + Map.entry("wor", "1/1"), + Map.entry("tio", "1/2"), + Map.entry("ten", "2/3"), + Map.entry("ota", "1/2"), + Map.entry("hey", "1/4"), + Map.entry("tal", "1/1"), + Map.entry("tes", "1/3"), + Map.entry("uct", "1/1"), + Map.entry("sti", "1/2"), + Map.entry("pro", "1/1"), + Map.entry("odu", "1/1"), + Map.entry("nsi", "1/1"), + Map.entry("rod", "1/1"), + Map.entry("for", "1/1"), + Map.entry("ces", "1/1"), + Map.entry("nce", "1/1"), + Map.entry("not", "1/1"), + Map.entry("pos", "1/1"), + Map.entry("are", "1/1"), + 
Map.entry("tot", "1/1"), + Map.entry("end", "1/3"), + Map.entry("enc", "1/3"), + Map.entry("sis", "1/1"), + Map.entry("sen", "1/4"), + Map.entry("nte", "1/1"), + Map.entry("ord", "1/2"), + Map.entry("ses", "1/4"), + Map.entry("ing", "1/4"), + Map.entry("ent", "1/3"), + Map.entry("way", "1/1"), + Map.entry("nde", "1/1"), + Map.entry("int", "1/4"), + Map.entry("rpo", "1/1"), + Map.entry("the", "1/1"), + Map.entry("urp", "1/1"), + Map.entry("duc", "1/1"), + Map.entry("ons", "1/2"), + Map.entry("ese", "1/4")) + .entrySet().stream() + .collect( + Collectors.toMap( + KEY_MAPPER, + entry -> + VALUE_MAPPER.apply(Map.entry(new Ngram(entry.getKey()), entry.getValue())))); + + private final Map expectedQuadrigramAbsoluteFrequencies = + Map.ofEntries( + Map.entry("onsi", 1), + Map.entry("sist", 1), + Map.entry("ende", 1), + Map.entry("ords", 1), + Map.entry("esti", 1), + Map.entry("oduc", 1), + Map.entry("nces", 1), + Map.entry("tenc", 1), + Map.entry("tend", 1), + Map.entry("thes", 1), + Map.entry("rpos", 1), + Map.entry("ting", 1), + Map.entry("nsis", 1), + Map.entry("nten", 2), + Map.entry("tota", 1), + Map.entry("they", 1), + Map.entry("cons", 1), + Map.entry("tion", 1), + Map.entry("prod", 1), + Map.entry("otal", 1), + Map.entry("test", 1), + Map.entry("ence", 1), + Map.entry("pose", 1), + Map.entry("oses", 1), + Map.entry("nded", 1), + Map.entry("inte", 1), + Map.entry("them", 1), + Map.entry("urpo", 1), + Map.entry("duct", 1), + Map.entry("sent", 1), + Map.entry("stin", 1), + Map.entry("ucti", 1), + Map.entry("ente", 1), + Map.entry("purp", 1), + Map.entry("ctio", 1), + Map.entry("rodu", 1), + Map.entry("word", 1), + Map.entry("hese", 1)) + .entrySet().stream() + .collect(Collectors.toMap(KEY_MAPPER, Map.Entry::getValue)); + + private final Map expectedQuadrigramRelativeFrequencies = + Map.ofEntries( + Map.entry("onsi", "1/1"), + Map.entry("sist", "1/1"), + Map.entry("ende", "1/1"), + Map.entry("ords", "1/1"), + Map.entry("esti", "1/1"), + Map.entry("oduc", "1/1"), + 
Map.entry("nces", "1/1"), + Map.entry("tenc", "1/2"), + Map.entry("tend", "1/2"), + Map.entry("thes", "1/4"), + Map.entry("rpos", "1/1"), + Map.entry("ting", "1/1"), + Map.entry("nsis", "1/1"), + Map.entry("nten", "1/1"), + Map.entry("tota", "1/1"), + Map.entry("they", "1/4"), + Map.entry("cons", "1/1"), + Map.entry("tion", "1/1"), + Map.entry("prod", "1/1"), + Map.entry("otal", "1/1"), + Map.entry("test", "1/1"), + Map.entry("ence", "1/1"), + Map.entry("pose", "1/1"), + Map.entry("oses", "1/1"), + Map.entry("nded", "1/1"), + Map.entry("inte", "1/1"), + Map.entry("them", "1/4"), + Map.entry("urpo", "1/1"), + Map.entry("duct", "1/1"), + Map.entry("sent", "1/1"), + Map.entry("stin", "1/1"), + Map.entry("ucti", "1/1"), + Map.entry("ente", "1/1"), + Map.entry("purp", "1/1"), + Map.entry("ctio", "1/1"), + Map.entry("rodu", "1/1"), + Map.entry("word", "1/1"), + Map.entry("hese", "1/1")) + .entrySet().stream() + .collect( + Collectors.toMap( + KEY_MAPPER, + entry -> + VALUE_MAPPER.apply(Map.entry(new Ngram(entry.getKey()), entry.getValue())))); + + private final Map expectedFivegramAbsoluteFrequencies = + Map.ofEntries( + Map.entry("testi", 1), + Map.entry("sente", 1), + Map.entry("ences", 1), + Map.entry("tende", 1), + Map.entry("ducti", 1), + Map.entry("ntenc", 1), + Map.entry("these", 1), + Map.entry("onsis", 1), + Map.entry("ntend", 1), + Map.entry("total", 1), + Map.entry("uctio", 1), + Map.entry("enten", 1), + Map.entry("poses", 1), + Map.entry("ction", 1), + Map.entry("produ", 1), + Map.entry("inten", 1), + Map.entry("nsist", 1), + Map.entry("words", 1), + Map.entry("sting", 1), + Map.entry("purpo", 1), + Map.entry("tence", 1), + Map.entry("estin", 1), + Map.entry("roduc", 1), + Map.entry("urpos", 1), + Map.entry("rpose", 1), + Map.entry("ended", 1), + Map.entry("oduct", 1), + Map.entry("consi", 1)) + .entrySet().stream() + .collect(Collectors.toMap(KEY_MAPPER, Map.Entry::getValue)); + + private final Map expectedFivegramRelativeFrequencies = + Map.ofEntries( + 
Map.entry("testi", "1/1"), + Map.entry("sente", "1/1"), + Map.entry("ences", "1/1"), + Map.entry("tende", "1/1"), + Map.entry("ducti", "1/1"), + Map.entry("ntenc", "1/2"), + Map.entry("these", "1/1"), + Map.entry("onsis", "1/1"), + Map.entry("ntend", "1/2"), + Map.entry("total", "1/1"), + Map.entry("uctio", "1/1"), + Map.entry("enten", "1/1"), + Map.entry("poses", "1/1"), + Map.entry("ction", "1/1"), + Map.entry("produ", "1/1"), + Map.entry("inten", "1/1"), + Map.entry("nsist", "1/1"), + Map.entry("words", "1/1"), + Map.entry("sting", "1/1"), + Map.entry("purpo", "1/1"), + Map.entry("tence", "1/1"), + Map.entry("estin", "1/1"), + Map.entry("roduc", "1/1"), + Map.entry("urpos", "1/1"), + Map.entry("rpose", "1/1"), + Map.entry("ended", "1/1"), + Map.entry("oduct", "1/1"), + Map.entry("consi", "1/1")) + .entrySet().stream() + .collect( + Collectors.toMap( + KEY_MAPPER, + entry -> + VALUE_MAPPER.apply(Map.entry(new Ngram(entry.getKey()), entry.getValue())))); + + private final Map expectedUnigramJsonRelativeFrequencies = + expectedUnigramRelativeFrequencies.entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().floatValue())); + + @Test + public void assertThatUnigramLanguageModelCanBeCreatedFromTrainingData() { + TrainingDataLanguageModel model = + TrainingDataLanguageModel.fromText( + ITERABLE_OF_STRINGS, + Language.ENGLISH, + 1, + "\\p{L}&&\\p{IsLatin}", + Collections.emptyMap()); + + assertThat(model.getLanguage()).isEqualTo(Language.ENGLISH); + assertThat(model.getAbsoluteFrequencies()) + .containsExactlyInAnyOrderEntriesOf(expectedUnigramAbsoluteFrequencies); + assertThat(model.getRelativeFrequencies()) + .containsExactlyInAnyOrderEntriesOf(expectedUnigramRelativeFrequencies); + } + + @Test + public void assertThatBigramLanguageModelCanBeCreatedFromTrainingData() { + TrainingDataLanguageModel model = + TrainingDataLanguageModel.fromText( + ITERABLE_OF_STRINGS, + Language.ENGLISH, + 2, + "\\p{L}&&\\p{IsLatin}", + 
expectedUnigramAbsoluteFrequencies); + + assertThat(model.getLanguage()).isEqualTo(Language.ENGLISH); + assertThat(model.getAbsoluteFrequencies()) + .containsExactlyInAnyOrderEntriesOf(expectedBigramAbsoluteFrequencies); + assertThat(model.getRelativeFrequencies()) + .containsExactlyInAnyOrderEntriesOf(expectedBigramRelativeFrequencies); + } + + @Test + public void assertThatTrigramLanguageModelCanBeCreatedFromTrainingData() { + TrainingDataLanguageModel model = + TrainingDataLanguageModel.fromText( + ITERABLE_OF_STRINGS, + Language.ENGLISH, + 3, + "\\p{L}&&\\p{IsLatin}", + expectedBigramAbsoluteFrequencies); + + assertThat(model.getLanguage()).isEqualTo(Language.ENGLISH); + assertThat(model.getAbsoluteFrequencies()) + .containsExactlyInAnyOrderEntriesOf(expectedTrigramAbsoluteFrequencies); + assertThat(model.getRelativeFrequencies()) + .containsExactlyInAnyOrderEntriesOf(expectedTrigramRelativeFrequencies); + } + + @Test + public void assertThatQuadrigramLanguageModelCanBeCreatedFromTrainingData() { + TrainingDataLanguageModel model = + TrainingDataLanguageModel.fromText( + ITERABLE_OF_STRINGS, + Language.ENGLISH, + 4, + "\\p{L}&&\\p{IsLatin}", + expectedTrigramAbsoluteFrequencies); + + assertThat(model.getLanguage()).isEqualTo(Language.ENGLISH); + assertThat(model.getAbsoluteFrequencies()) + .containsExactlyInAnyOrderEntriesOf(expectedQuadrigramAbsoluteFrequencies); + assertThat(model.getRelativeFrequencies()) + .containsExactlyInAnyOrderEntriesOf(expectedQuadrigramRelativeFrequencies); + } + + @Test + public void assertThatFivegramLanguageModelCanBeCreatedFromTrainingData() { + TrainingDataLanguageModel model = + TrainingDataLanguageModel.fromText( + ITERABLE_OF_STRINGS, + Language.ENGLISH, + 5, + "\\p{L}&&\\p{IsLatin}", + expectedQuadrigramAbsoluteFrequencies); + + assertThat(model.getLanguage()).isEqualTo(Language.ENGLISH); + assertThat(model.getAbsoluteFrequencies()) + .containsExactlyInAnyOrderEntriesOf(expectedFivegramAbsoluteFrequencies); + 
assertThat(model.getRelativeFrequencies()) + .containsExactlyInAnyOrderEntriesOf(expectedFivegramRelativeFrequencies); + } + + // @Test + // TODO: The `toJson` returns JSON with the right keys and values, but the keys are out of order + public void assertThatUnigramLanguageModelIsCorrectlySerializedToJson() { + TrainingDataLanguageModel model = + TrainingDataLanguageModel.fromText( + ITERABLE_OF_STRINGS, + Language.ENGLISH, + 1, + "\\p{L}&&\\p{IsLatin}", + Collections.emptyMap()); + assertThat(model.toJson()).isEqualTo(expectedUnigramLanguageModel); + } + + @Test + public void assertThatUnigramLanguageModelIsCorrectlyDeserializedFromJson() throws Exception { + ByteArrayInputStream inputStream = + new ByteArrayInputStream(expectedUnigramLanguageModel.getBytes(StandardCharsets.UTF_8)); + Object2FloatMap model = TrainingDataLanguageModel.fromJson(inputStream); + Map expectedMap = + expectedUnigramJsonRelativeFrequencies.entrySet().stream() + .collect(Collectors.toMap(entry -> entry.getKey().getValue(), Map.Entry::getValue)); + + assertThat(model).containsExactlyInAnyOrderEntriesOf(expectedMap); + } +} From 82a9ba7a0dc4f310feba124eafacf13d46816fd7 Mon Sep 17 00:00:00 2001 From: Alexander Zagniotov Date: Mon, 18 Nov 2024 14:20:16 -0800 Subject: [PATCH 11/11] util/extension pkg: migrated *Extensions --- .../util/extension/CharExtensions.java | 49 ++++++++ .../util/extension/EnumExtensions.java | 111 ++++++++++++++++++ .../util/extension/MapExtensions.java | 44 +++++++ 3 files changed, 204 insertions(+) create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/util/extension/CharExtensions.java create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/util/extension/EnumExtensions.java create mode 100644 src/main/java/com/github/pemistahl/lingua/internal/util/extension/MapExtensions.java diff --git a/src/main/java/com/github/pemistahl/lingua/internal/util/extension/CharExtensions.java 
b/src/main/java/com/github/pemistahl/lingua/internal/util/extension/CharExtensions.java
new file mode 100644
index 00000000..fdc683ba
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/internal/util/extension/CharExtensions.java
@@ -0,0 +1,49 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.internal.util.extension;
+
+import com.github.pemistahl.lingua.internal.Alphabet;
+import com.github.pemistahl.lingua.internal.Constant;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+/**
+ * Utility functions related to characters and logograms.
+ *
+ * <p>A character counts as a logogram when it matches an alphabet of one of the languages in
+ * {@code Constant.LANGUAGES_SUPPORTING_LOGOGRAMS}. That set of alphabets is built once, when the
+ * class is initialized, and reused by every call.
+ *
+ * @author Peter M. Stahl
+ * @author Migration to Java from Kotlin by Alexander Zagniotov
+ */
+public class CharExtensions {
+
+  /** Alphabets of all languages that support logograms, collected once at class load. */
+  private static final Set<Alphabet> SCRIPTS_WITH_LOGOGRAMS =
+      Constant.LANGUAGES_SUPPORTING_LOGOGRAMS.stream()
+          .flatMap(language -> language.getAlphabets().stream())
+          .collect(Collectors.toSet());
+
+  /** Returns {@code true} if {@code ch} is not whitespace and matches a logogram alphabet. */
+  public static boolean isLogogram(final char ch) {
+    if (Character.isWhitespace(ch)) {
+      return false;
+    }
+    return SCRIPTS_WITH_LOGOGRAMS.stream().anyMatch(alphabet -> alphabet.matches(ch));
+  }
+}
diff --git a/src/main/java/com/github/pemistahl/lingua/internal/util/extension/EnumExtensions.java b/src/main/java/com/github/pemistahl/lingua/internal/util/extension/EnumExtensions.java
new file mode 100644
index 00000000..a49cddb4
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/internal/util/extension/EnumExtensions.java
@@ -0,0 +1,111 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.internal.util.extension;
+
+import java.util.EnumMap;
+import java.util.EnumSet;
+
+/**
+ * Utility methods for creating {@link EnumMap} and {@link EnumSet} instances.
+ *
+ * <p>This class contains extension methods that help to create {@link EnumMap} and {@link EnumSet}
+ * more concisely, based on the size of the input pairs or elements.
+ *
+ * @author Peter M. Stahl pemistahl@gmail.com
+ * @author Migration to Java from Kotlin by Alexander Zagniotov azagniotov@gmail.com
+ */
+public class EnumExtensions {
+
+  /**
+   * Creates an {@link EnumMap} with the provided pairs of enum keys and corresponding values.
+   *
+   * <p>The key type is taken from the first pair, so at least one pair is required; an empty
+   * argument list carries no runtime information about {@code K}.
+   *
+   * @param <K> the type of the enum key
+   * @param <V> the type of the value
+   * @param pairs a variable number of key-value pairs; a later pair overwrites an earlier one
+   * @return a new {@link EnumMap} containing the provided pairs
+   * @throws IllegalArgumentException if no pairs are provided
+   */
+  @SafeVarargs
+  public static <K extends Enum<K>, V> EnumMap<K, V> enumMapOf(Pair<K, V>... pairs) {
+    if (pairs.length == 0) {
+      throw new IllegalArgumentException("enumMapOf requires at least one pair");
+    }
+    EnumMap<K, V> map = new EnumMap<>(pairs[0].getKey().getDeclaringClass());
+    for (Pair<K, V> pair : pairs) {
+      map.put(pair.getKey(), pair.getValue());
+    }
+    return map;
+  }
+
+  /**
+   * Creates an {@link EnumSet} with the provided elements.
+   *
+   * <p>If no elements are provided, an empty {@link EnumSet} is created. Its element type is read
+   * from the runtime component type of the varargs array, which the compiler fills in at call
+   * sites where {@code E} is a concrete enum type.
+   *
+   * @param <E> the type of the enum element
+   * @param elements a variable number of enum elements
+   * @return a new {@link EnumSet} containing the provided elements
+   * @throws IllegalArgumentException if no elements are provided and the element type cannot be
+   *     determined from the array
+   */
+  @SafeVarargs
+  public static <E extends Enum<E>> EnumSet<E> enumSetOf(E... elements) {
+    if (elements.length == 0) {
+      // There is no element to ask for its declaring class, so fall back to the array type.
+      Class<?> componentType = elements.getClass().getComponentType();
+      if (!componentType.isEnum()) {
+        throw new IllegalArgumentException(
+            "Cannot determine the enum type of an empty argument list");
+      }
+      @SuppressWarnings("unchecked")
+      Class<E> elementType = (Class<E>) componentType;
+      return EnumSet.noneOf(elementType);
+    }
+    // EnumSet.of(first, rest...) tolerates the first element being repeated in the rest.
+    return EnumSet.of(elements[0], elements);
+  }
+
+  /**
+   * A simple container for holding a pair of values (key and value). This is a utility class used
+   * for passing key-value pairs to methods like {@link #enumMapOf}.
+   *
+   * @param <K> the type of the key
+   * @param <V> the type of the value
+   */
+  public static class Pair<K, V> {
+    private final K key;
+    private final V value;
+
+    public Pair(K key, V value) {
+      this.key = key;
+      this.value = value;
+    }
+
+    public K getKey() {
+      return key;
+    }
+
+    public V getValue() {
+      return value;
+    }
+  }
+}
diff --git a/src/main/java/com/github/pemistahl/lingua/internal/util/extension/MapExtensions.java b/src/main/java/com/github/pemistahl/lingua/internal/util/extension/MapExtensions.java
new file mode 100644
index 00000000..4c1b19ae
--- /dev/null
+++ b/src/main/java/com/github/pemistahl/lingua/internal/util/extension/MapExtensions.java
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.github.pemistahl.lingua.internal.util.extension;
+
+import java.util.Map;
+
+/**
+ * Utility methods for working with collections and maps.
+ *
+ * <p>This class contains extension-like methods for commonly used map operations.
+ *
+ * @author Peter M. Stahl pemistahl@gmail.com
+ * @author Migration by Alexander Zagniotov azagniotov@gmail.com
+ */
+public class MapExtensions {
+
+  /**
+   * Increments the counter for the given key in the map.
+   *
+   * <p>If the key is absent, or is mapped to {@code null}, it is stored with a value of 1.
+   * Otherwise its current value is incremented by 1. The update is a single {@link Map#merge}.
+   *
+   * @param <T> the type of the key
+   * @param map the mutable map to update
+   * @param key the key whose counter is to be incremented
+   */
+  public static <T> void incrementCounter(Map<T, Integer> map, T key) {
+    map.merge(key, 1, Integer::sum);
+  }
+}