diff --git a/src/main/kotlin/com/github/pemistahl/lingua/api/Language.kt b/src/main/kotlin/com/github/pemistahl/lingua/api/Language.kt index ba573669..3c031f69 100644 --- a/src/main/kotlin/com/github/pemistahl/lingua/api/Language.kt +++ b/src/main/kotlin/com/github/pemistahl/lingua/api/Language.kt @@ -176,6 +176,8 @@ import com.github.pemistahl.lingua.internal.Alphabet.HIRAGANA import com.github.pemistahl.lingua.internal.Alphabet.KATAKANA import com.github.pemistahl.lingua.internal.Alphabet.LATIN import com.github.pemistahl.lingua.internal.Alphabet.NONE +import com.github.pemistahl.lingua.internal.util.extension.enumSetOf +import java.util.EnumSet /** * The supported detectable languages. @@ -183,146 +185,146 @@ import com.github.pemistahl.lingua.internal.Alphabet.NONE enum class Language( val isoCode639_1: IsoCode639_1, val isoCode639_3: IsoCode639_3, - internal val alphabets: Set, + internal val alphabets: EnumSet, internal val uniqueCharacters: String? ) { - AFRIKAANS(AF, AFR, setOf(Alphabet.LATIN), null), - ALBANIAN(SQ, SQI, setOf(Alphabet.LATIN), null), - ARABIC(AR, ARA, setOf(Alphabet.ARABIC), null), - ARMENIAN(HY, HYE, setOf(Alphabet.ARMENIAN), null), - AZERBAIJANI(AZ, AZE, setOf(Alphabet.LATIN), "Əə"), - BASQUE(EU, EUS, setOf(Alphabet.LATIN), null), - BELARUSIAN(BE, BEL, setOf(CYRILLIC), null), - BENGALI(BN, BEN, setOf(Alphabet.BENGALI), null), - BOKMAL(NB, NOB, setOf(Alphabet.LATIN), null), - BOSNIAN(BS, BOS, setOf(Alphabet.LATIN), null), - BULGARIAN(BG, BUL, setOf(CYRILLIC), null), - CATALAN(CA, CAT, setOf(Alphabet.LATIN), "Ïï"), - CHINESE(ZH, ZHO, setOf(HAN), null), - CROATIAN(HR, HRV, setOf(Alphabet.LATIN), null), - CZECH(CS, CES, setOf(Alphabet.LATIN), "ĚěŘřŮů"), - DANISH(DA, DAN, setOf(Alphabet.LATIN), null), - DUTCH(NL, NLD, setOf(Alphabet.LATIN), null), - ENGLISH(EN, ENG, setOf(Alphabet.LATIN), null), - ESPERANTO(EO, EPO, setOf(Alphabet.LATIN), "ĈĉĜĝĤĥĴĵŜŝŬŭ"), - ESTONIAN(ET, EST, setOf(Alphabet.LATIN), null), - FINNISH(FI, FIN, setOf(Alphabet.LATIN), null), - FRENCH(FR, FRA, setOf(Alphabet.LATIN), null), - GANDA(LG, LUG, setOf(Alphabet.LATIN), null), - GEORGIAN(KA, KAT, setOf(Alphabet.GEORGIAN), null), - GERMAN(DE, DEU, setOf(Alphabet.LATIN), "ß"), - GREEK(EL, ELL, setOf(Alphabet.GREEK), null), - GUJARATI(GU, GUJ, setOf(Alphabet.GUJARATI), null), - HEBREW(HE, HEB, setOf(Alphabet.HEBREW), null), - HINDI(HI, HIN, setOf(DEVANAGARI), null), - HUNGARIAN(HU, HUN, setOf(Alphabet.LATIN), "ŐőŰű"), - ICELANDIC(IS, ISL, setOf(Alphabet.LATIN), null), - INDONESIAN(ID, IND, setOf(Alphabet.LATIN), null), - IRISH(GA, GLE, setOf(Alphabet.LATIN), null), - ITALIAN(IT, ITA, setOf(Alphabet.LATIN), null), - JAPANESE(JA, JPN, setOf(HIRAGANA, KATAKANA, HAN), null), - KAZAKH(KK, KAZ, setOf(CYRILLIC), "ӘәҒғҚқҢңҰұ"), - KOREAN(KO, KOR, setOf(HANGUL), null), - LATIN(LA, LAT, setOf(Alphabet.LATIN), null), - LATVIAN(LV, LAV, setOf(Alphabet.LATIN), "ĢģĶķĻļŅņ"), - LITHUANIAN(LT, LIT, setOf(Alphabet.LATIN), "ĖėĮįŲų"), - MACEDONIAN(MK, MKD, setOf(CYRILLIC), "ЃѓЅѕЌќЏџ"), - MALAY(MS, MSA, setOf(Alphabet.LATIN), null), - MAORI(MI, MRI, setOf(Alphabet.LATIN), null), - MARATHI(MR, MAR, setOf(DEVANAGARI), "ळ"), - MONGOLIAN(MN, MON, setOf(CYRILLIC), "ӨөҮү"), - NYNORSK(NN, NNO, setOf(Alphabet.LATIN), null), - PERSIAN(FA, FAS, setOf(Alphabet.ARABIC), null), - POLISH(PL, POL, setOf(Alphabet.LATIN), "ŁłŃńŚśŹź"), - PORTUGUESE(PT, POR, setOf(Alphabet.LATIN), null), - PUNJABI(PA, PAN, setOf(GURMUKHI), null), - ROMANIAN(RO, RON, setOf(Alphabet.LATIN), "Țţ"), - RUSSIAN(RU, RUS, setOf(CYRILLIC), null), - SERBIAN(SR, SRP, setOf(CYRILLIC), "ЂђЋћ"), - SHONA(SN, SNA, setOf(Alphabet.LATIN), null), - SLOVAK(SK, SLK, setOf(Alphabet.LATIN), "Ĺ弾Ŕŕ"), - SLOVENE(SL, SLV, setOf(Alphabet.LATIN), null), - SOMALI(SO, SOM, setOf(Alphabet.LATIN), null), - SOTHO(ST, SOT, setOf(Alphabet.LATIN), null), - SPANISH(ES, SPA, setOf(Alphabet.LATIN), "¿¡"), - SWAHILI(SW, SWA, setOf(Alphabet.LATIN), null), - SWEDISH(SV, SWE, setOf(Alphabet.LATIN), null), - TAGALOG(TL, TGL, setOf(Alphabet.LATIN), null), - TAMIL(TA, TAM, setOf(Alphabet.TAMIL), null), - TELUGU(TE, TEL, setOf(Alphabet.TELUGU), null), - THAI(TH, THA, setOf(Alphabet.THAI), null), - TSONGA(TS, TSO, setOf(Alphabet.LATIN), null), - TSWANA(TN, TSN, setOf(Alphabet.LATIN), null), - TURKISH(TR, TUR, setOf(Alphabet.LATIN), null), - UKRAINIAN(UK, UKR, setOf(CYRILLIC), "ҐґЄєЇї"), - URDU(UR, URD, setOf(Alphabet.ARABIC), null), + AFRIKAANS(AF, AFR, enumSetOf(Alphabet.LATIN), null), + ALBANIAN(SQ, SQI, enumSetOf(Alphabet.LATIN), null), + ARABIC(AR, ARA, enumSetOf(Alphabet.ARABIC), null), + ARMENIAN(HY, HYE, enumSetOf(Alphabet.ARMENIAN), null), + AZERBAIJANI(AZ, AZE, enumSetOf(Alphabet.LATIN), "Əə"), + BASQUE(EU, EUS, enumSetOf(Alphabet.LATIN), null), + BELARUSIAN(BE, BEL, enumSetOf(CYRILLIC), null), + BENGALI(BN, BEN, enumSetOf(Alphabet.BENGALI), null), + BOKMAL(NB, NOB, enumSetOf(Alphabet.LATIN), null), + BOSNIAN(BS, BOS, enumSetOf(Alphabet.LATIN), null), + BULGARIAN(BG, BUL, enumSetOf(CYRILLIC), null), + CATALAN(CA, CAT, enumSetOf(Alphabet.LATIN), "Ïï"), + CHINESE(ZH, ZHO, enumSetOf(HAN), null), + CROATIAN(HR, HRV, enumSetOf(Alphabet.LATIN), null), + CZECH(CS, CES, enumSetOf(Alphabet.LATIN), "ĚěŘřŮů"), + DANISH(DA, DAN, enumSetOf(Alphabet.LATIN), null), + DUTCH(NL, NLD, enumSetOf(Alphabet.LATIN), null), + ENGLISH(EN, ENG, enumSetOf(Alphabet.LATIN), null), + ESPERANTO(EO, EPO, enumSetOf(Alphabet.LATIN), "ĈĉĜĝĤĥĴĵŜŝŬŭ"), + ESTONIAN(ET, EST, enumSetOf(Alphabet.LATIN), null), + FINNISH(FI, FIN, enumSetOf(Alphabet.LATIN), null), + FRENCH(FR, FRA, enumSetOf(Alphabet.LATIN), null), + GANDA(LG, LUG, enumSetOf(Alphabet.LATIN), null), + GEORGIAN(KA, KAT, enumSetOf(Alphabet.GEORGIAN), null), + GERMAN(DE, DEU, enumSetOf(Alphabet.LATIN), "ß"), + GREEK(EL, ELL, enumSetOf(Alphabet.GREEK), null), + GUJARATI(GU, GUJ, enumSetOf(Alphabet.GUJARATI), null), + HEBREW(HE, HEB, enumSetOf(Alphabet.HEBREW), null), + HINDI(HI, HIN, enumSetOf(DEVANAGARI), null), + HUNGARIAN(HU, HUN, enumSetOf(Alphabet.LATIN), "ŐőŰű"), + ICELANDIC(IS, ISL, enumSetOf(Alphabet.LATIN), null), + INDONESIAN(ID, IND, enumSetOf(Alphabet.LATIN), null), + IRISH(GA, GLE, enumSetOf(Alphabet.LATIN), null), + ITALIAN(IT, ITA, enumSetOf(Alphabet.LATIN), null), + JAPANESE(JA, JPN, enumSetOf(HIRAGANA, KATAKANA, HAN), null), + KAZAKH(KK, KAZ, enumSetOf(CYRILLIC), "ӘәҒғҚқҢңҰұ"), + KOREAN(KO, KOR, enumSetOf(HANGUL), null), + LATIN(LA, LAT, enumSetOf(Alphabet.LATIN), null), + LATVIAN(LV, LAV, enumSetOf(Alphabet.LATIN), "ĢģĶķĻļŅņ"), + LITHUANIAN(LT, LIT, enumSetOf(Alphabet.LATIN), "ĖėĮįŲų"), + MACEDONIAN(MK, MKD, enumSetOf(CYRILLIC), "ЃѓЅѕЌќЏџ"), + MALAY(MS, MSA, enumSetOf(Alphabet.LATIN), null), + MAORI(MI, MRI, enumSetOf(Alphabet.LATIN), null), + MARATHI(MR, MAR, enumSetOf(DEVANAGARI), "ळ"), + MONGOLIAN(MN, MON, enumSetOf(CYRILLIC), "ӨөҮү"), + NYNORSK(NN, NNO, enumSetOf(Alphabet.LATIN), null), + PERSIAN(FA, FAS, enumSetOf(Alphabet.ARABIC), null), + POLISH(PL, POL, enumSetOf(Alphabet.LATIN), "ŁłŃńŚśŹź"), + PORTUGUESE(PT, POR, enumSetOf(Alphabet.LATIN), null), + PUNJABI(PA, PAN, enumSetOf(GURMUKHI), null), + ROMANIAN(RO, RON, enumSetOf(Alphabet.LATIN), "Țţ"), + RUSSIAN(RU, RUS, enumSetOf(CYRILLIC), null), + SERBIAN(SR, SRP, enumSetOf(CYRILLIC), "ЂђЋћ"), + SHONA(SN, SNA, enumSetOf(Alphabet.LATIN), null), + SLOVAK(SK, SLK, enumSetOf(Alphabet.LATIN), "Ĺ弾Ŕŕ"), + SLOVENE(SL, SLV, enumSetOf(Alphabet.LATIN), null), + SOMALI(SO, SOM, enumSetOf(Alphabet.LATIN), null), + SOTHO(ST, SOT, enumSetOf(Alphabet.LATIN), null), + SPANISH(ES, SPA, enumSetOf(Alphabet.LATIN), "¿¡"), + SWAHILI(SW, SWA, enumSetOf(Alphabet.LATIN), null), + SWEDISH(SV, SWE, enumSetOf(Alphabet.LATIN), null), + TAGALOG(TL, TGL, enumSetOf(Alphabet.LATIN), null), + TAMIL(TA, TAM, enumSetOf(Alphabet.TAMIL), null), + TELUGU(TE, TEL, enumSetOf(Alphabet.TELUGU), null), + THAI(TH, THA, enumSetOf(Alphabet.THAI), null), + TSONGA(TS, TSO, enumSetOf(Alphabet.LATIN), null), + TSWANA(TN, TSN, enumSetOf(Alphabet.LATIN), null), + TURKISH(TR, TUR, enumSetOf(Alphabet.LATIN), null), + UKRAINIAN(UK, UKR, enumSetOf(CYRILLIC), "ҐґЄєЇї"), + URDU(UR, URD, enumSetOf(Alphabet.ARABIC), null), VIETNAMESE( VI, VIE, - setOf(Alphabet.LATIN), + enumSetOf(Alphabet.LATIN), "ẰằẦầẲẳẨẩẴẵẪẫẮắẤấẠạẶặẬậỀềẺẻỂểẼẽỄễẾếỆệỈỉĨĩỊịƠơỒồỜờỎỏỔổỞởỖỗỠỡỐốỚớỘộỢợƯưỪừỦủỬửŨũỮữỨứỤụỰựỲỳỶỷỸỹỴỵ" ), - WELSH(CY, CYM, setOf(Alphabet.LATIN), null), - XHOSA(XH, XHO, setOf(Alphabet.LATIN), null), + WELSH(CY, CYM, enumSetOf(Alphabet.LATIN), null), + XHOSA(XH, XHO, enumSetOf(Alphabet.LATIN), null), // TODO for YORUBA: "E̩e̩Ẹ́ẹ́É̩é̩Ẹ̀ẹ̀È̩è̩Ẹ̄ẹ̄Ē̩ē̩ŌōO̩o̩Ọ́ọ́Ó̩ó̩Ọ̀ọ̀Ò̩ò̩Ọ̄ọ̄Ō̩ō̩ṢṣS̩s̩" - YORUBA(YO, YOR, setOf(Alphabet.LATIN), "Ṣṣ"), - ZULU(ZU, ZUL, setOf(Alphabet.LATIN), null), + YORUBA(YO, YOR, enumSetOf(Alphabet.LATIN), "Ṣṣ"), + ZULU(ZU, ZUL, enumSetOf(Alphabet.LATIN), null), /** * The imaginary unknown language. * * This value is returned if no language can be detected reliably. */ - UNKNOWN(IsoCode639_1.NONE, IsoCode639_3.NONE, setOf(NONE), null); + UNKNOWN(IsoCode639_1.NONE, IsoCode639_3.NONE, enumSetOf(NONE), null); companion object { /** * Returns a list of all built-in languages. */ @JvmStatic - fun all() = filterOutLanguages(UNKNOWN) + fun all(): List = filterOutLanguages(UNKNOWN) /** * Returns a list of all built-in languages that are still spoken today. */ @JvmStatic - fun allSpokenOnes() = filterOutLanguages(UNKNOWN, LATIN) + fun allSpokenOnes(): List = filterOutLanguages(UNKNOWN, LATIN) /** * Returns a list of all built-in languages supporting the Arabic script. */ @JvmStatic - fun allWithArabicScript() = values().filter { it.alphabets.contains(Alphabet.ARABIC) } + fun allWithArabicScript(): List = values().filter { it.alphabets.contains(Alphabet.ARABIC) } /** * Returns a list of all built-in languages supporting the Cyrillic script. */ @JvmStatic - fun allWithCyrillicScript() = values().filter { it.alphabets.contains(CYRILLIC) } + fun allWithCyrillicScript(): List = values().filter { it.alphabets.contains(CYRILLIC) } /** * Returns a list of all built-in languages supporting the Devanagari script. */ @JvmStatic - fun allWithDevanagariScript() = values().filter { it.alphabets.contains(DEVANAGARI) } + fun allWithDevanagariScript(): List = values().filter { it.alphabets.contains(DEVANAGARI) } /** * Returns a list of all built-in languages supporting the Latin script. */ @JvmStatic - fun allWithLatinScript() = values().filter { it.alphabets.contains(Alphabet.LATIN) } + fun allWithLatinScript(): List = values().filter { it.alphabets.contains(Alphabet.LATIN) } /** * Returns the language for the given ISO 639-1 code. */ @JvmStatic - fun getByIsoCode639_1(isoCode: IsoCode639_1) = values().find { it.isoCode639_1 == isoCode }!! + fun getByIsoCode639_1(isoCode: IsoCode639_1): Language = values().first { it.isoCode639_1 == isoCode } /** * Returns the language for the given ISO 639-3 code. */ @JvmStatic - fun getByIsoCode639_3(isoCode: IsoCode639_3) = values().find { it.isoCode639_3 == isoCode }!! + fun getByIsoCode639_3(isoCode: IsoCode639_3): Language = values().first { it.isoCode639_3 == isoCode } private fun filterOutLanguages(vararg languages: Language) = values().filterNot { it in languages } } diff --git a/src/main/kotlin/com/github/pemistahl/lingua/api/LanguageDetector.kt b/src/main/kotlin/com/github/pemistahl/lingua/api/LanguageDetector.kt index 0071a440..5db1a1ad 100644 --- a/src/main/kotlin/com/github/pemistahl/lingua/api/LanguageDetector.kt +++ b/src/main/kotlin/com/github/pemistahl/lingua/api/LanguageDetector.kt @@ -29,23 +29,16 @@ import com.github.pemistahl.lingua.internal.Constant.isJapaneseAlphabet import com.github.pemistahl.lingua.internal.Ngram import com.github.pemistahl.lingua.internal.TestDataLanguageModel import com.github.pemistahl.lingua.internal.TrainingDataLanguageModel +import com.github.pemistahl.lingua.internal.util.extension.enumMapOf import com.github.pemistahl.lingua.internal.util.extension.incrementCounter import com.github.pemistahl.lingua.internal.util.extension.isLogogram +import java.util.EnumMap import java.util.SortedMap import java.util.TreeMap import java.util.concurrent.Callable -import java.util.concurrent.ExecutorService -import java.util.concurrent.LinkedBlockingQueue -import java.util.concurrent.ThreadPoolExecutor -import java.util.concurrent.TimeUnit +import java.util.concurrent.ForkJoinPool import kotlin.math.ln -private val UNIGRAM_MODELS = mutableMapOf() -private val BIGRAM_MODELS = mutableMapOf() -private val TRIGRAM_MODELS = mutableMapOf() -private val QUADRIGRAM_MODELS = mutableMapOf() -private val FIVEGRAM_MODELS = mutableMapOf() - /** * Detects the language of given input text. */ @@ -54,14 +47,7 @@ class LanguageDetector internal constructor( internal val minimumRelativeDistance: Double, isEveryLanguageModelPreloaded: Boolean, internal val numberOfLoadedLanguages: Int = languages.size, - internal val unigramLanguageModels: MutableMap = UNIGRAM_MODELS, - internal val bigramLanguageModels: MutableMap = BIGRAM_MODELS, - internal val trigramLanguageModels: MutableMap = TRIGRAM_MODELS, - internal val quadrigramLanguageModels: MutableMap = QUADRIGRAM_MODELS, - internal val fivegramLanguageModels: MutableMap = FIVEGRAM_MODELS ) { - internal val threadPool = createThreadPool() - private val languagesWithUniqueCharacters = languages.filterNot { it.uniqueCharacters.isNullOrBlank() }.asSequence() private val oneLanguageAlphabets = Alphabet.allSupportingExactlyOneLanguage().filterValues { it in languages @@ -117,11 +103,6 @@ class LanguageDetector internal constructor( * @throws IllegalStateException If [destroy] has been invoked before on this instance of [LanguageDetector]. */ fun computeLanguageConfidenceValues(text: String): SortedMap { - if (threadPool.isShutdown) { - throw IllegalStateException( - "This LanguageDetector instance has been destroyed and cannot be reused" - ) - } val values = TreeMap() val cleanedUpText = cleanUpInputText(text) @@ -144,33 +125,30 @@ class LanguageDetector internal constructor( } val ngramSizeRange = if (cleanedUpText.length >= 120) (3..3) else (1..5) - val tasks = ngramSizeRange.filter { i -> cleanedUpText.length >= i }.map { i -> - Callable { - val testDataModel = TestDataLanguageModel.fromText(cleanedUpText, ngramLength = i) - val probabilities = computeLanguageProbabilities(testDataModel, filteredLanguages) - - val unigramCounts = if (i == 1) { - val languages = probabilities.keys - val unigramFilteredLanguages = - if (languages.isNotEmpty()) filteredLanguages.asSequence() - .filter { languages.contains(it) } - .toSet() - else filteredLanguages - countUnigramsOfInputText(testDataModel, unigramFilteredLanguages) - } else { - null - } - - Pair(probabilities, unigramCounts) + val allProbabilitiesAndUnigramCounts = ngramSizeRange.filter { i -> cleanedUpText.length >= i }.map { i -> + val testDataModel = TestDataLanguageModel.fromText(cleanedUpText, ngramLength = i) + val probabilities = computeLanguageProbabilities(testDataModel, filteredLanguages) + + val unigramCounts = if (i == 1) { + val languages = probabilities.keys + val unigramFilteredLanguages = + if (languages.isNotEmpty()) filteredLanguages.asSequence() + .filter { languages.contains(it) } + .toSet() + else filteredLanguages + countUnigramsOfInputText(testDataModel, unigramFilteredLanguages) + } else { + null } + + Pair(probabilities, unigramCounts) } - val allProbabilitiesAndUnigramCounts = threadPool.invokeAll(tasks).map { it.get() } val allProbabilities = allProbabilitiesAndUnigramCounts.map { (probabilities, _) -> probabilities } val unigramCounts = allProbabilitiesAndUnigramCounts[0].second ?: emptyMap() val summedUpProbabilities = sumUpProbabilities(allProbabilities, unigramCounts, filteredLanguages) val highestProbability = summedUpProbabilities.maxByOrNull { it.value }?.value ?: return sortedMapOf() - val confidenceValues = summedUpProbabilities.mapValues { highestProbability / it.value } + val confidenceValues = summedUpProbabilities.mapValues { (highestProbability / it.value).toDouble() } val sortedByConfidenceValue = compareByDescending { language -> confidenceValues[language] } val sortedByConfidenceValueThenByLanguage = sortedByConfidenceValue.thenBy { language -> language } @@ -188,11 +166,6 @@ class LanguageDetector internal constructor( * is redeployed multiple times. */ fun destroy() { - threadPool.shutdown() - if (!threadPool.awaitTermination(10, TimeUnit.SECONDS)) { - threadPool.shutdownNow() - } - for (language in languages) { unigramLanguageModels.remove(language) bigramLanguageModels.remove(language) @@ -253,20 +226,24 @@ class LanguageDetector internal constructor( } internal fun sumUpProbabilities( - probabilities: List>, + probabilities: List>, unigramCountsOfInputText: Map, filteredLanguages: Set - ): Map { - val summedUpProbabilities = mutableMapOf() + ): Map { + val summedUpProbabilities = mutableMapOf() for (language in filteredLanguages) { - summedUpProbabilities[language] = probabilities.sumOf { it[language] ?: 0.0 } + var sum = 0F + for (probabilityMap in probabilities) { + sum += probabilityMap[language] ?: 0F + } + summedUpProbabilities[language] = sum if (unigramCountsOfInputText.containsKey(language)) { summedUpProbabilities[language] = summedUpProbabilities.getValue(language) / unigramCountsOfInputText.getValue(language) } } - return summedUpProbabilities.filter { it.value != 0.0 } + return summedUpProbabilities.filter { it.value != 0F } } internal fun detectLanguageWithRules(words: List): Language { @@ -403,8 +380,8 @@ class LanguageDetector internal constructor( internal fun computeLanguageProbabilities( testDataModel: TestDataLanguageModel, filteredLanguages: Set - ): Map { - val probabilities = mutableMapOf() + ): Map { + val probabilities = mutableMapOf() for (language in filteredLanguages) { probabilities[language] = computeSumOfNgramProbabilities(language, testDataModel.ngrams) } @@ -414,8 +391,8 @@ class LanguageDetector internal constructor( internal fun computeSumOfNgramProbabilities( language: Language, ngrams: Set - ): Double { - var probabilitiesSum = 0.0 + ): Float { + var probabilitiesSum = 0F for (ngram in ngrams) { for (elem in ngram.rangeOfLowerOrderNgrams()) { @@ -432,7 +409,7 @@ class LanguageDetector internal constructor( internal fun lookUpNgramProbability( language: Language, ngram: Ngram - ): Double { + ): Float { val ngramLength = ngram.value.length val languageModels = when (ngramLength) { 5 -> fivegramLanguageModels @@ -446,31 +423,11 @@ class LanguageDetector internal constructor( val model = loadLanguageModels(languageModels, language, ngramLength) - return model?.getRelativeFrequency(ngram) ?: 0.0 - } - - private fun loadLanguageModels( - languageModels: MutableMap, - language: Language, - ngramLength: Int - ): TrainingDataLanguageModel? { - if (languageModels.containsKey(language)) { - return languageModels.getValue(language) - } - val model = loadLanguageModel(language, ngramLength) ?: return null - languageModels[language] = model - return model - } - - private fun loadLanguageModel(language: Language, ngramLength: Int): TrainingDataLanguageModel? { - val fileName = "${Ngram.getNgramNameByLength(ngramLength)}s.json" - val filePath = "/language-models/${language.isoCode639_1}/$fileName" - val inputStream = Language::class.java.getResourceAsStream(filePath) ?: return null - val jsonContent = inputStream.bufferedReader(Charsets.UTF_8).use { it.readText() } - return TrainingDataLanguageModel.fromJson(jsonContent) + return model?.getRelativeFrequency(ngram) ?: 0F } private fun preloadLanguageModels() { + val threadPool = ForkJoinPool.commonPool() val tasks = mutableListOf>() for (language in languages) { @@ -484,13 +441,6 @@ class LanguageDetector internal constructor( threadPool.invokeAll(tasks) } - private fun createThreadPool(): ExecutorService { - val cpus = Runtime.getRuntime().availableProcessors() - val threadPool = ThreadPoolExecutor(cpus, cpus, 60L, TimeUnit.SECONDS, LinkedBlockingQueue()) - threadPool.allowCoreThreadTimeOut(true) - return threadPool - } - override fun equals(other: Any?) = when { this === other -> true other !is LanguageDetector -> false @@ -500,4 +450,37 @@ class LanguageDetector internal constructor( } override fun hashCode() = 31 * languages.hashCode() + minimumRelativeDistance.hashCode() + + internal companion object { + internal val unigramLanguageModels = enumMapOf() + internal val bigramLanguageModels = enumMapOf() + internal val trigramLanguageModels = enumMapOf() + internal val quadrigramLanguageModels = enumMapOf() + internal val fivegramLanguageModels = enumMapOf() + + private fun loadLanguageModels( + languageModels: EnumMap, + language: Language, + ngramLength: Int + ): TrainingDataLanguageModel? { + synchronized(languageModels) { + if (languageModels.containsKey(language)) { + return languageModels.getValue(language) + } + } + val model = loadLanguageModel(language, ngramLength) ?: return null + synchronized(languageModels) { + languageModels.putIfAbsent(language, model) + return languageModels.getValue(language) + } + } + + private fun loadLanguageModel(language: Language, ngramLength: Int): TrainingDataLanguageModel? { + val fileName = "${Ngram.getNgramNameByLength(ngramLength)}s.json" + val filePath = "/language-models/${language.isoCode639_1}/$fileName" + val inputStream = Language::class.java.getResourceAsStream(filePath) ?: return null + val jsonContent = inputStream.bufferedReader(Charsets.UTF_8).use { it.readText() } + return TrainingDataLanguageModel.fromJson(jsonContent) + } + } } diff --git a/src/main/kotlin/com/github/pemistahl/lingua/internal/Alphabet.kt b/src/main/kotlin/com/github/pemistahl/lingua/internal/Alphabet.kt index ef30cca9..ee02eade 100644 --- a/src/main/kotlin/com/github/pemistahl/lingua/internal/Alphabet.kt +++ b/src/main/kotlin/com/github/pemistahl/lingua/internal/Alphabet.kt @@ -40,27 +40,15 @@ internal enum class Alphabet { THAI, NONE; - val script: UnicodeScript? = try { + private val script: UnicodeScript? = try { UnicodeScript.forName(this.name) } catch (e: IllegalArgumentException) { null } - fun matches(chr: Char): Boolean { - return if (this.script != null) { - UnicodeScript.of(chr.code) == this.script - } else { - false - } - } + fun matches(chr: Char): Boolean = UnicodeScript.of(chr.code) == this.script - fun matches(input: CharSequence): Boolean { - return if (this.script != null) { - input.codePoints().allMatch { UnicodeScript.of(it) == this.script } - } else { - false - } - } + fun matches(input: CharSequence): Boolean = input.codePoints().allMatch { UnicodeScript.of(it) == this.script } private fun supportedLanguages(): Set { val languages = mutableSetOf() diff --git a/src/main/kotlin/com/github/pemistahl/lingua/internal/Constant.kt b/src/main/kotlin/com/github/pemistahl/lingua/internal/Constant.kt index 99c9effa..4e88e249 100644 --- a/src/main/kotlin/com/github/pemistahl/lingua/internal/Constant.kt +++ b/src/main/kotlin/com/github/pemistahl/lingua/internal/Constant.kt @@ -60,61 +60,68 @@ import com.github.pemistahl.lingua.api.Language.TURKISH import com.github.pemistahl.lingua.api.Language.UKRAINIAN import com.github.pemistahl.lingua.api.Language.VIETNAMESE import com.github.pemistahl.lingua.api.Language.YORUBA +import com.github.pemistahl.lingua.internal.util.extension.enumSetOf internal object Constant { val CHARS_TO_LANGUAGES_MAPPING = mapOf( - "Ãã" to setOf(PORTUGUESE, VIETNAMESE), - "ĄąĘę" to setOf(LITHUANIAN, POLISH), - "Żż" to setOf(POLISH, ROMANIAN), - "Îî" to setOf(FRENCH, ROMANIAN), - "Ññ" to setOf(BASQUE, SPANISH), - "ŇňŤť" to setOf(CZECH, SLOVAK), - "Ăă" to setOf(ROMANIAN, VIETNAMESE), - "İıĞğ" to setOf(AZERBAIJANI, TURKISH), - "ЈјЉљЊњ" to setOf(MACEDONIAN, SERBIAN), - "ẸẹỌọ" to setOf(VIETNAMESE, YORUBA), - "ÐðÞþ" to setOf(ICELANDIC, TURKISH), - "Ûû" to setOf(FRENCH, HUNGARIAN), - "Ōō" to setOf(MAORI, YORUBA), + "Ãã" to enumSetOf(PORTUGUESE, VIETNAMESE), + "ĄąĘę" to enumSetOf(LITHUANIAN, POLISH), + "Żż" to enumSetOf(POLISH, ROMANIAN), + "Îî" to enumSetOf(FRENCH, ROMANIAN), + "Ññ" to enumSetOf(BASQUE, SPANISH), + "ŇňŤť" to enumSetOf(CZECH, SLOVAK), + "Ăă" to enumSetOf(ROMANIAN, VIETNAMESE), + "İıĞğ" to enumSetOf(AZERBAIJANI, TURKISH), + "ЈјЉљЊњ" to enumSetOf(MACEDONIAN, SERBIAN), + "ẸẹỌọ" to enumSetOf(VIETNAMESE, YORUBA), + "ÐðÞþ" to enumSetOf(ICELANDIC, TURKISH), + "Ûû" to enumSetOf(FRENCH, HUNGARIAN), + "Ōō" to enumSetOf(MAORI, YORUBA), - "ĀāĒēĪī" to setOf(LATVIAN, MAORI, YORUBA), - "Şş" to setOf(AZERBAIJANI, ROMANIAN, TURKISH), - "Ďď" to setOf(CZECH, ROMANIAN, SLOVAK), - "Ćć" to setOf(BOSNIAN, CROATIAN, POLISH), - "Đđ" to setOf(BOSNIAN, CROATIAN, VIETNAMESE), - "Іі" to setOf(BELARUSIAN, KAZAKH, UKRAINIAN), - "Ìì" to setOf(ITALIAN, VIETNAMESE, YORUBA), - "Øø" to setOf(BOKMAL, DANISH, NYNORSK), + "ĀāĒēĪī" to enumSetOf(LATVIAN, MAORI, YORUBA), + "Şş" to enumSetOf(AZERBAIJANI, ROMANIAN, TURKISH), + "Ďď" to enumSetOf(CZECH, ROMANIAN, SLOVAK), + "Ćć" to enumSetOf(BOSNIAN, CROATIAN, POLISH), + "Đđ" to enumSetOf(BOSNIAN, CROATIAN, VIETNAMESE), + "Іі" to enumSetOf(BELARUSIAN, KAZAKH, UKRAINIAN), + "Ìì" to enumSetOf(ITALIAN, VIETNAMESE, YORUBA), + "Øø" to enumSetOf(BOKMAL, DANISH, NYNORSK), - "Ūū" to setOf(LATVIAN, LITHUANIAN, MAORI, YORUBA), - "Ëë" to setOf(AFRIKAANS, ALBANIAN, DUTCH, FRENCH), - "ÈèÙù" to setOf(FRENCH, ITALIAN, VIETNAMESE, YORUBA), - "Êê" to setOf(AFRIKAANS, FRENCH, PORTUGUESE, VIETNAMESE), - "Õõ" to setOf(ESTONIAN, HUNGARIAN, PORTUGUESE, VIETNAMESE), - "Ôô" to setOf(FRENCH, PORTUGUESE, SLOVAK, VIETNAMESE), + "Ūū" to enumSetOf(LATVIAN, LITHUANIAN, MAORI, YORUBA), + "Ëë" to enumSetOf(AFRIKAANS, ALBANIAN, DUTCH, FRENCH), + "ÈèÙù" to enumSetOf(FRENCH, ITALIAN, VIETNAMESE, YORUBA), + "Êê" to enumSetOf(AFRIKAANS, FRENCH, PORTUGUESE, VIETNAMESE), + "Õõ" to enumSetOf(ESTONIAN, HUNGARIAN, PORTUGUESE, VIETNAMESE), + "Ôô" to enumSetOf(FRENCH, PORTUGUESE, SLOVAK, VIETNAMESE), - "ЁёЫыЭэ" to setOf(BELARUSIAN, KAZAKH, MONGOLIAN, RUSSIAN), - "ЩщЪъ" to setOf(BULGARIAN, KAZAKH, MONGOLIAN, RUSSIAN), - "Òò" to setOf(CATALAN, ITALIAN, VIETNAMESE, YORUBA), - "Ææ" to setOf(BOKMAL, DANISH, ICELANDIC, NYNORSK), - "Åå" to setOf(BOKMAL, DANISH, NYNORSK, SWEDISH), + "ЁёЫыЭэ" to enumSetOf(BELARUSIAN, KAZAKH, MONGOLIAN, RUSSIAN), + "ЩщЪъ" to enumSetOf(BULGARIAN, KAZAKH, MONGOLIAN, RUSSIAN), + "Òò" to enumSetOf(CATALAN, ITALIAN, VIETNAMESE, YORUBA), + "Ææ" to enumSetOf(BOKMAL, DANISH, ICELANDIC, NYNORSK), + "Åå" to enumSetOf(BOKMAL, DANISH, NYNORSK, SWEDISH), - "Ýý" to setOf(CZECH, ICELANDIC, SLOVAK, TURKISH, VIETNAMESE), - "Ää" to setOf(ESTONIAN, FINNISH, GERMAN, SLOVAK, SWEDISH), - "Àà" to setOf(CATALAN, FRENCH, ITALIAN, PORTUGUESE, VIETNAMESE), - "Ââ" to setOf(FRENCH, PORTUGUESE, ROMANIAN, TURKISH, VIETNAMESE), + "Ýý" to enumSetOf(CZECH, ICELANDIC, SLOVAK, TURKISH, VIETNAMESE), + "Ää" to enumSetOf(ESTONIAN, FINNISH, GERMAN, SLOVAK, SWEDISH), + "Àà" to enumSetOf(CATALAN, FRENCH, ITALIAN, PORTUGUESE, VIETNAMESE), + "Ââ" to enumSetOf(FRENCH, PORTUGUESE, ROMANIAN, TURKISH, VIETNAMESE), - "Üü" to setOf(AZERBAIJANI, CATALAN, ESTONIAN, GERMAN, HUNGARIAN, SPANISH, TURKISH), - "Č芚Žž" to setOf(BOSNIAN, CZECH, CROATIAN, LATVIAN, LITHUANIAN, SLOVAK, SLOVENE), - "Çç" to setOf(ALBANIAN, AZERBAIJANI, BASQUE, CATALAN, FRENCH, PORTUGUESE, TURKISH), + "Üü" to enumSetOf(AZERBAIJANI, CATALAN, ESTONIAN, GERMAN, HUNGARIAN, SPANISH, TURKISH), + "Č芚Žž" to enumSetOf(BOSNIAN, CZECH, CROATIAN, LATVIAN, LITHUANIAN, SLOVAK, SLOVENE), + "Çç" to enumSetOf(ALBANIAN, AZERBAIJANI, BASQUE, CATALAN, FRENCH, PORTUGUESE, TURKISH), - "Öö" to setOf(AZERBAIJANI, ESTONIAN, FINNISH, GERMAN, HUNGARIAN, ICELANDIC, SWEDISH, TURKISH), + "Öö" to enumSetOf(AZERBAIJANI, ESTONIAN, FINNISH, GERMAN, HUNGARIAN, ICELANDIC, SWEDISH, TURKISH), - "Óó" to setOf(CATALAN, HUNGARIAN, ICELANDIC, IRISH, POLISH, PORTUGUESE, SLOVAK, SPANISH, VIETNAMESE, YORUBA), - "ÁáÍíÚú" to setOf(CATALAN, CZECH, ICELANDIC, IRISH, HUNGARIAN, PORTUGUESE, SLOVAK, SPANISH, VIETNAMESE, YORUBA), + "Óó" to enumSetOf( + CATALAN, HUNGARIAN, ICELANDIC, IRISH, POLISH, + PORTUGUESE, SLOVAK, SPANISH, VIETNAMESE, YORUBA + ), + "ÁáÍíÚú" to enumSetOf( + CATALAN, CZECH, ICELANDIC, IRISH, HUNGARIAN, + PORTUGUESE, SLOVAK, SPANISH, VIETNAMESE, YORUBA + ), - "Éé" to setOf( + "Éé" to enumSetOf( CATALAN, CZECH, FRENCH, HUNGARIAN, ICELANDIC, IRISH, ITALIAN, PORTUGUESE, SLOVAK, SPANISH, VIETNAMESE, YORUBA ) @@ -127,7 +134,7 @@ internal object Constant { script == Character.UnicodeScript.HAN } - val LANGUAGES_SUPPORTING_LOGOGRAMS = setOf(CHINESE, JAPANESE, KOREAN) + val LANGUAGES_SUPPORTING_LOGOGRAMS = enumSetOf(CHINESE, JAPANESE, KOREAN) val MULTIPLE_WHITESPACE = Regex("\\s+") val NO_LETTER = Regex("^[^\\p{L}]+$") val NUMBERS = Regex("\\p{N}") diff --git a/src/main/kotlin/com/github/pemistahl/lingua/internal/TrainingDataLanguageModel.kt b/src/main/kotlin/com/github/pemistahl/lingua/internal/TrainingDataLanguageModel.kt index cbcd4ed6..50b93ee1 100644 --- a/src/main/kotlin/com/github/pemistahl/lingua/internal/TrainingDataLanguageModel.kt +++ b/src/main/kotlin/com/github/pemistahl/lingua/internal/TrainingDataLanguageModel.kt @@ -18,8 +18,8 @@ package com.github.pemistahl.lingua.internal import com.github.pemistahl.lingua.api.Language import com.github.pemistahl.lingua.internal.util.extension.incrementCounter -import it.unimi.dsi.fastutil.objects.Object2DoubleMap -import it.unimi.dsi.fastutil.objects.Object2DoubleOpenHashMap +import it.unimi.dsi.fastutil.objects.Object2FloatMap +import it.unimi.dsi.fastutil.objects.Object2FloatOpenHashMap import kotlinx.serialization.Serializable import kotlinx.serialization.decodeFromString import kotlinx.serialization.encodeToString @@ -32,9 +32,9 @@ internal data class TrainingDataLanguageModel( val language: Language, val absoluteFrequencies: Map, val relativeFrequencies: Map, - val jsonRelativeFrequencies: Object2DoubleMap + val jsonRelativeFrequencies: Object2FloatMap ) { - fun getRelativeFrequency(ngram: Ngram): Double = jsonRelativeFrequencies.getDouble(ngram.value) + fun getRelativeFrequency(ngram: Ngram): Float = jsonRelativeFrequencies.getFloat(ngram.value) fun toJson(): String { val ngrams = mutableMapOf>() @@ -77,18 +77,18 @@ internal data class TrainingDataLanguageModel( language, absoluteFrequencies, relativeFrequencies, - Object2DoubleOpenHashMap() + Object2FloatOpenHashMap() ) } fun fromJson(json: String): TrainingDataLanguageModel { val jsonLanguageModel = Json.decodeFromString(json) - val jsonRelativeFrequencies = Object2DoubleOpenHashMap() + val jsonRelativeFrequencies = Object2FloatOpenHashMap() for ((fraction, ngrams) in jsonLanguageModel.ngrams) { - val fractionAsDouble = fraction.toDouble() + val fractionAsFloat = fraction.toFloat() for (ngram in ngrams.split(' ')) { - jsonRelativeFrequencies.put(ngram, fractionAsDouble) + jsonRelativeFrequencies.put(ngram, fractionAsFloat) } } diff --git a/src/main/kotlin/com/github/pemistahl/lingua/internal/util/extension/CharExtensions.kt b/src/main/kotlin/com/github/pemistahl/lingua/internal/util/extension/CharExtensions.kt index 3dbc3890..99ebda5a 100644 --- a/src/main/kotlin/com/github/pemistahl/lingua/internal/util/extension/CharExtensions.kt +++ b/src/main/kotlin/com/github/pemistahl/lingua/internal/util/extension/CharExtensions.kt @@ -24,10 +24,5 @@ private val scriptsWithLogograms = LANGUAGES_SUPPORTING_LOGOGRAMS.asSequence() .flatMap(Language::alphabets) .toSet() -fun Char.isLogogram(): Boolean { - return if (this.isWhitespace()) { - false - } else { - scriptsWithLogograms.any { it.matches(this) } - } -} +internal fun Char.isLogogram(): Boolean = + !this.isWhitespace() && scriptsWithLogograms.any { it.matches(this) } diff --git a/src/main/kotlin/com/github/pemistahl/lingua/internal/util/extension/EnumExtensions.kt b/src/main/kotlin/com/github/pemistahl/lingua/internal/util/extension/EnumExtensions.kt new file mode 100644 index 00000000..6b463c9d --- /dev/null +++ b/src/main/kotlin/com/github/pemistahl/lingua/internal/util/extension/EnumExtensions.kt @@ -0,0 +1,35 @@ +/* + * Copyright © 2018-today Peter M. Stahl pemistahl@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.github.pemistahl.lingua.internal.util.extension + +import java.util.EnumMap +import java.util.EnumSet + +internal inline fun , V> enumMapOf(vararg pairs: Pair): EnumMap = when (pairs.size) { + 0 -> EnumMap(K::class.java) + else -> EnumMap(pairs.toMap()) +} + +internal inline fun > enumSetOf(vararg elements: E): EnumSet = when (elements.size) { + 0 -> EnumSet.noneOf(E::class.java) + 1 -> EnumSet.of(elements[0]) + 2 -> EnumSet.of(elements[0], elements[1]) + 3 -> EnumSet.of(elements[0], elements[1], elements[2]) + 4 -> EnumSet.of(elements[0], elements[1], elements[2], elements[3]) + 5 -> EnumSet.of(elements[0], elements[1], elements[2], elements[3], elements[4]) + else -> EnumSet.of(elements[0], *elements.drop(1).toTypedArray()) +} diff --git a/src/test/kotlin/com/github/pemistahl/lingua/api/LanguageDetectorTest.kt b/src/test/kotlin/com/github/pemistahl/lingua/api/LanguageDetectorTest.kt index d88ea570..b9f00f02 100644 --- a/src/test/kotlin/com/github/pemistahl/lingua/api/LanguageDetectorTest.kt +++ b/src/test/kotlin/com/github/pemistahl/lingua/api/LanguageDetectorTest.kt @@ -86,12 +86,11 @@ import io.mockk.impl.annotations.SpyK import io.mockk.junit5.MockKExtension import org.assertj.core.api.Assertions.assertThat import org.assertj.core.api.Assertions.assertThatIllegalArgumentException -import org.assertj.core.api.Assertions.entry +import org.assertj.core.api.Assertions.within import org.junit.jupiter.api.AfterEach import org.junit.jupiter.api.BeforeAll import org.junit.jupiter.api.BeforeEach import org.junit.jupiter.api.Test -import org.junit.jupiter.api.assertThrows import org.junit.jupiter.api.extension.ExtendWith import org.junit.jupiter.params.ParameterizedTest import org.junit.jupiter.params.provider.Arguments.arguments @@ -212,17 +211,17 @@ class LanguageDetectorTest { // ngram probability lookup private fun ngramProbabilityProvider() = listOf( - arguments(ENGLISH, "a", 0.01), - arguments(ENGLISH, "lt", 0.12), - arguments(ENGLISH, "ter", 0.21), - arguments(ENGLISH, "alte", 0.25), - arguments(ENGLISH, "alter", 0.29), - - arguments(GERMAN, "t", 0.08), - arguments(GERMAN, "er", 0.18), - arguments(GERMAN, "alt", 0.22), - arguments(GERMAN, "lter", 0.28), - arguments(GERMAN, "alter", 0.30) + arguments(ENGLISH, "a", 0.01F), + arguments(ENGLISH, "lt", 0.12F), + arguments(ENGLISH, "ter", 0.21F), + arguments(ENGLISH, "alte", 0.25F), + arguments(ENGLISH, "alter", 0.29F), + + arguments(GERMAN, "t", 0.08F), + arguments(GERMAN, "er", 0.18F), + arguments(GERMAN, "alt", 0.22F), + arguments(GERMAN, "lter", 0.28F), + arguments(GERMAN, "alter", 0.30F) ) @ParameterizedTest @@ -230,7 +229,7 @@ class LanguageDetectorTest { internal fun `assert that ngram probability lookup works correctly`( language: Language, ngram: Ngram, - expectedProbability: Double + expectedProbability: Float ) { assertThat( detectorForEnglishAndGerman.lookUpNgramProbability(language, ngram) @@ -255,17 +254,17 @@ class LanguageDetectorTest { private fun ngramProbabilitySumProvider() = listOf( arguments( setOf(Ngram("a"), Ngram("l"), Ngram("t"), Ngram("e"), Ngram("r")), - ln(0.01) + ln(0.02) + ln(0.03) + ln(0.04) + ln(0.05) + ln(0.01F) + ln(0.02F) + ln(0.03F) + ln(0.04F) + ln(0.05F) ), arguments( // back off unknown Trigram("tez") to known Bigram("te") setOf(Ngram("alt"), Ngram("lte"), Ngram("tez")), - ln(0.19) + ln(0.2) + ln(0.13) + ln(0.19F) + ln(0.2F) + ln(0.13F) ), arguments( // back off unknown Fivegram("aquas") to known Unigram("a") setOf(Ngram("aquas")), - ln(0.01) + ln(0.01F) ) ) @@ -273,7 +272,7 @@ class LanguageDetectorTest { @MethodSource("ngramProbabilitySumProvider") internal fun `assert that sum of ngram probabilities can be computed correctly`( ngrams: Set, - expectedSumOfProbabilities: Double + expectedSumOfProbabilities: Float ) { assertThat( detectorForEnglishAndGerman.computeSumOfNgramProbabilities(ENGLISH, ngrams) @@ -290,22 +289,22 @@ class LanguageDetectorTest { arguments( unigramTestDataLanguageModel, mapOf( - ENGLISH to ln(0.01) + ln(0.02) + ln(0.03) + ln(0.04) + ln(0.05), - GERMAN to ln(0.06) + ln(0.07) + ln(0.08) + ln(0.09) + ln(0.1) + ENGLISH to ln(0.01F) + ln(0.02F) + ln(0.03F) + ln(0.04F) + ln(0.05F), + GERMAN to ln(0.06F) + ln(0.07F) + ln(0.08F) + ln(0.09F) + ln(0.1F) ) ), arguments( trigramTestDataLanguageModel, mapOf( - ENGLISH to ln(0.19) + ln(0.2) + ln(0.21), - GERMAN to ln(0.22) + ln(0.23) + ln(0.24) + ENGLISH to ln(0.19F) + ln(0.2F) + ln(0.21F), + GERMAN to ln(0.22F) + ln(0.23F) + ln(0.24F) ) ), arguments( quadrigramTestDataLanguageModel, mapOf( - ENGLISH to ln(0.25) + ln(0.26), - GERMAN to ln(0.27) + ln(0.28) + ENGLISH to ln(0.25F) + ln(0.26F), + GERMAN to ln(0.27F) + ln(0.28F) ) ) ) @@ -314,7 +313,7 @@ class LanguageDetectorTest { @MethodSource("languageProbabilitiesProvider") internal fun `assert that language probabilities can be computed correctly`( testDataModel: TestDataLanguageModel, - expectedProbabilities: Map + expectedProbabilities: Map ) { assertThat( detectorForEnglishAndGerman.computeLanguageProbabilities( @@ -767,35 +766,39 @@ class LanguageDetectorTest { val totalProbabilityForGerman = ( // Unigrams - ln(0.06) + ln(0.07) + ln(0.08) + ln(0.09) + ln(0.1) + + ln(0.06F) + ln(0.07F) + ln(0.08F) + ln(0.09F) + ln(0.1F) + // Bigrams - ln(0.15) + ln(0.16) + ln(0.17) + ln(0.18) + + ln(0.15F) + ln(0.16F) + ln(0.17F) + ln(0.18F) + // Trigrams - ln(0.22) + ln(0.23) + ln(0.24) + + ln(0.22F) + ln(0.23F) + ln(0.24F) + // Quadrigrams - ln(0.27) + ln(0.28) + + ln(0.27F) + ln(0.28F) + // Fivegrams - ln(0.3) + ln(0.3F) ) / unigramCountForBothLanguages val totalProbabilityForEnglish = ( // Unigrams - ln(0.01) + ln(0.02) + ln(0.03) + ln(0.04) + ln(0.05) + + ln(0.01F) + ln(0.02F) + ln(0.03F) + ln(0.04F) + ln(0.05F) + // Bigrams - ln(0.11) + ln(0.12) + ln(0.13) + ln(0.14) + + ln(0.11F) + ln(0.12F) + ln(0.13F) + ln(0.14F) + // Trigrams - ln(0.19) + ln(0.2) + ln(0.21) + + ln(0.19F) + ln(0.2F) + ln(0.21F) + // Quadrigrams - ln(0.25) + ln(0.26) + + ln(0.25F) + ln(0.26F) + // Fivegrams - ln(0.29) + ln(0.29F) ) / unigramCountForBothLanguages - assertThat( - detectorForEnglishAndGerman.computeLanguageConfidenceValues("Alter") - ).containsExactly( - entry(GERMAN, 1.0), - entry(ENGLISH, totalProbabilityForGerman / totalProbabilityForEnglish) + val confidenceValues = detectorForEnglishAndGerman.computeLanguageConfidenceValues("Alter") + + assertThat(confidenceValues.firstKey()).isEqualTo(GERMAN) + assertThat(confidenceValues.lastKey()).isEqualTo(ENGLISH) + + assertThat(confidenceValues[GERMAN]).isEqualTo(1.0) + assertThat(confidenceValues[ENGLISH]).isCloseTo( + (totalProbabilityForGerman / totalProbabilityForEnglish).toDouble(), + within(0.000001) ) } @@ -859,151 +862,139 @@ class LanguageDetectorTest { detector.destroy() - assertThat(detector.threadPool.isShutdown).isTrue - val exception = assertThrows { - detector.detectLanguageOf("languages are fascinating") - } - assertThat(exception.message).isEqualTo( - "This LanguageDetector instance has been destroyed and cannot be reused" - ) - - assertThat(detector.unigramLanguageModels).isEmpty() - assertThat(detector.bigramLanguageModels).isEmpty() - assertThat(detector.trigramLanguageModels).isEmpty() - assertThat(detector.quadrigramLanguageModels).isEmpty() - assertThat(detector.fivegramLanguageModels).isEmpty() + assertThat(LanguageDetector.unigramLanguageModels).isEmpty() + assertThat(LanguageDetector.bigramLanguageModels).isEmpty() + assertThat(LanguageDetector.trigramLanguageModels).isEmpty() + assertThat(LanguageDetector.quadrigramLanguageModels).isEmpty() + assertThat(LanguageDetector.fivegramLanguageModels).isEmpty() } private fun defineBehaviorOfUnigramLanguageModels() { with(unigramLanguageModelForEnglish) { - every { getRelativeFrequency(Ngram("a")) } returns 0.01 - every { getRelativeFrequency(Ngram("l")) } returns 0.02 - every { getRelativeFrequency(Ngram("t")) } returns 0.03 - every { getRelativeFrequency(Ngram("e")) } returns 0.04 - every { getRelativeFrequency(Ngram("r")) } returns 0.05 + every { getRelativeFrequency(Ngram("a")) } returns 0.01F + every { getRelativeFrequency(Ngram("l")) } returns 0.02F + every { getRelativeFrequency(Ngram("t")) } returns 0.03F + every { getRelativeFrequency(Ngram("e")) } returns 0.04F + every { getRelativeFrequency(Ngram("r")) } returns 0.05F // unknown unigrams in model - every { getRelativeFrequency(Ngram("w")) } returns 0.0 + every { getRelativeFrequency(Ngram("w")) } returns 0F } with(unigramLanguageModelForGerman) { - every { getRelativeFrequency(Ngram("a")) } returns 0.06 - every { getRelativeFrequency(Ngram("l")) } returns 0.07 - every { getRelativeFrequency(Ngram("t")) } returns 0.08 - every { getRelativeFrequency(Ngram("e")) } returns 0.09 - every { getRelativeFrequency(Ngram("r")) } returns 0.1 + every { getRelativeFrequency(Ngram("a")) } returns 0.06F + every { getRelativeFrequency(Ngram("l")) } returns 0.07F + every { getRelativeFrequency(Ngram("t")) } returns 0.08F + every { getRelativeFrequency(Ngram("e")) } returns 0.09F + every { getRelativeFrequency(Ngram("r")) } returns 0.1F // unknown unigrams in model - every { getRelativeFrequency(Ngram("w")) } returns 0.0 + every { getRelativeFrequency(Ngram("w")) } returns 0F } } private fun defineBehaviorOfBigramLanguageModels() { with(bigramLanguageModelForEnglish) { - every { getRelativeFrequency(Ngram("al")) } returns 0.11 - every { getRelativeFrequency(Ngram("lt")) } returns 0.12 - every { getRelativeFrequency(Ngram("te")) } returns 0.13 - every { getRelativeFrequency(Ngram("er")) } returns 0.14 + every { getRelativeFrequency(Ngram("al")) } returns 0.11F + every { getRelativeFrequency(Ngram("lt")) } returns 0.12F + every { getRelativeFrequency(Ngram("te")) } returns 0.13F + every { getRelativeFrequency(Ngram("er")) } returns 0.14F // unknown bigrams in model for (value in listOf("aq", "wx")) { - every { getRelativeFrequency(Ngram(value)) } returns 0.0 + every { getRelativeFrequency(Ngram(value)) } returns 0F } } with(bigramLanguageModelForGerman) { - every { getRelativeFrequency(Ngram("al")) } returns 0.15 - every { getRelativeFrequency(Ngram("lt")) } returns 0.16 - every { getRelativeFrequency(Ngram("te")) } returns 0.17 - every { getRelativeFrequency(Ngram("er")) } returns 0.18 + every { getRelativeFrequency(Ngram("al")) } returns 0.15F + every { getRelativeFrequency(Ngram("lt")) } returns 0.16F + every { getRelativeFrequency(Ngram("te")) } returns 0.17F + every { getRelativeFrequency(Ngram("er")) } returns 0.18F // unknown bigrams in model - every { getRelativeFrequency(Ngram("wx")) } returns 0.0 + every { getRelativeFrequency(Ngram("wx")) } returns 0F } } private fun defineBehaviorOfTrigramLanguageModels() { with(trigramLanguageModelForEnglish) { - every { getRelativeFrequency(Ngram("alt")) } returns 0.19 - every { getRelativeFrequency(Ngram("lte")) } returns 0.2 - every { getRelativeFrequency(Ngram("ter")) } returns 0.21 + every { getRelativeFrequency(Ngram("alt")) } returns 0.19F + every { getRelativeFrequency(Ngram("lte")) } returns 0.2F + every { getRelativeFrequency(Ngram("ter")) } returns 0.21F // unknown trigrams in model for (value in listOf("aqu", "tez", "wxy")) { - every { getRelativeFrequency(Ngram(value)) } returns 0.0 + every { getRelativeFrequency(Ngram(value)) } returns 0F } } with(trigramLanguageModelForGerman) { - every { getRelativeFrequency(Ngram("alt")) } returns 0.22 - every { getRelativeFrequency(Ngram("lte")) } returns 0.23 - every { getRelativeFrequency(Ngram("ter")) } returns 0.24 + every { getRelativeFrequency(Ngram("alt")) } returns 0.22F + every { getRelativeFrequency(Ngram("lte")) } returns 0.23F + every { getRelativeFrequency(Ngram("ter")) } returns 0.24F // unknown trigrams in model - every { getRelativeFrequency(Ngram("wxy")) } returns 0.0 + every { getRelativeFrequency(Ngram("wxy")) } returns 0F } } private fun defineBehaviorOfQuadrigramLanguageModels() { with(quadrigramLanguageModelForEnglish) { - every { getRelativeFrequency(Ngram("alte")) } returns 0.25 - every { getRelativeFrequency(Ngram("lter")) } returns 0.26 + every { getRelativeFrequency(Ngram("alte")) } returns 0.25F + every { getRelativeFrequency(Ngram("lter")) } returns 0.26F // unknown quadrigrams in model for (value in listOf("aqua", "wxyz")) { - every { getRelativeFrequency(Ngram(value)) } returns 0.0 + every { getRelativeFrequency(Ngram(value)) } returns 0F } } with(quadrigramLanguageModelForGerman) { - every { getRelativeFrequency(Ngram("alte")) } returns 0.27 - every { getRelativeFrequency(Ngram("lter")) } returns 0.28 + every { getRelativeFrequency(Ngram("alte")) } returns 0.27F + every { getRelativeFrequency(Ngram("lter")) } returns 0.28F // unknown quadrigrams in model - every { getRelativeFrequency(Ngram("wxyz")) } returns 0.0 + every { getRelativeFrequency(Ngram("wxyz")) } returns 0F } } private fun defineBehaviorOfFivegramLanguageModels() { with(fivegramLanguageModelForEnglish) { - every { getRelativeFrequency(Ngram("alter")) } returns 0.29 + every { getRelativeFrequency(Ngram("alter")) } returns 0.29F // unknown fivegrams in model - every { getRelativeFrequency(Ngram("aquas")) } returns 0.0 + every { getRelativeFrequency(Ngram("aquas")) } returns 0F } with(fivegramLanguageModelForGerman) { - every { getRelativeFrequency(Ngram("alter")) } returns 0.30 + every { getRelativeFrequency(Ngram("alter")) } returns 0.3F } } private fun addLanguageModelsToDetector() { - with(detectorForEnglishAndGerman) { - unigramLanguageModels[ENGLISH] = unigramLanguageModelForEnglish - unigramLanguageModels[GERMAN] = unigramLanguageModelForGerman + LanguageDetector.unigramLanguageModels[ENGLISH] = unigramLanguageModelForEnglish + LanguageDetector.unigramLanguageModels[GERMAN] = unigramLanguageModelForGerman - bigramLanguageModels[ENGLISH] = bigramLanguageModelForEnglish - bigramLanguageModels[GERMAN] = bigramLanguageModelForGerman + LanguageDetector.bigramLanguageModels[ENGLISH] = bigramLanguageModelForEnglish + LanguageDetector.bigramLanguageModels[GERMAN] = bigramLanguageModelForGerman - trigramLanguageModels[ENGLISH] = trigramLanguageModelForEnglish - trigramLanguageModels[GERMAN] = trigramLanguageModelForGerman + LanguageDetector.trigramLanguageModels[ENGLISH] = trigramLanguageModelForEnglish + LanguageDetector.trigramLanguageModels[GERMAN] = trigramLanguageModelForGerman - quadrigramLanguageModels[ENGLISH] = quadrigramLanguageModelForEnglish - quadrigramLanguageModels[GERMAN] = quadrigramLanguageModelForGerman + LanguageDetector.quadrigramLanguageModels[ENGLISH] = quadrigramLanguageModelForEnglish + LanguageDetector.quadrigramLanguageModels[GERMAN] = quadrigramLanguageModelForGerman - fivegramLanguageModels[ENGLISH] = fivegramLanguageModelForEnglish - fivegramLanguageModels[GERMAN] = fivegramLanguageModelForGerman - } + LanguageDetector.fivegramLanguageModels[ENGLISH] = fivegramLanguageModelForEnglish + LanguageDetector.fivegramLanguageModels[GERMAN] = fivegramLanguageModelForGerman } private fun removeLanguageModelsFromDetector() { - with(detectorForEnglishAndGerman) { - unigramLanguageModels.clear() - bigramLanguageModels.clear() - trigramLanguageModels.clear() - quadrigramLanguageModels.clear() - fivegramLanguageModels.clear() - } + LanguageDetector.unigramLanguageModels.clear() + LanguageDetector.bigramLanguageModels.clear() + LanguageDetector.trigramLanguageModels.clear() + LanguageDetector.quadrigramLanguageModels.clear() + LanguageDetector.fivegramLanguageModels.clear() } private fun defineBehaviorOfTestDataLanguageModels() { diff --git a/src/test/kotlin/com/github/pemistahl/lingua/internal/TrainingDataLanguageModelTest.kt b/src/test/kotlin/com/github/pemistahl/lingua/internal/TrainingDataLanguageModelTest.kt index 1bceb3d7..7ba33fba 100644 --- a/src/test/kotlin/com/github/pemistahl/lingua/internal/TrainingDataLanguageModelTest.kt +++ b/src/test/kotlin/com/github/pemistahl/lingua/internal/TrainingDataLanguageModelTest.kt @@ -73,7 +73,7 @@ class TrainingDataLanguageModelTest { ).mapKeys(keyMapper).mapValues(valueMapper) private val expectedUnigramJsonRelativeFrequencies = expectedUnigramRelativeFrequencies.mapValues { - it.value.toDouble() + it.value.toFloat() } private val expectedBigramAbsoluteFrequencies = mapOf(