From e0b3cc554c188ccfda42b15d2ac5e5eec55ad325 Mon Sep 17 00:00:00 2001 From: "Peter M. Stahl" Date: Mon, 16 May 2022 11:38:29 +0200 Subject: [PATCH] Improve performance of language filter engine (#101) --- .../pemistahl/lingua/api/LanguageDetector.kt | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/main/kotlin/com/github/pemistahl/lingua/api/LanguageDetector.kt b/src/main/kotlin/com/github/pemistahl/lingua/api/LanguageDetector.kt index 43ca8944..0071a440 100644 --- a/src/main/kotlin/com/github/pemistahl/lingua/api/LanguageDetector.kt +++ b/src/main/kotlin/com/github/pemistahl/lingua/api/LanguageDetector.kt @@ -375,18 +375,18 @@ class LanguageDetector internal constructor( } } - val mostFrequentAlphabet = detectedAlphabets.entries.maxByOrNull { it.value }!!.key - val filteredLanguages = languages.filter { it.alphabets.contains(mostFrequentAlphabet) } + val mostFrequentAlphabet = detectedAlphabets.entries.maxByOrNull { it.value }?.key + val filteredLanguages = languages.asSequence().filter { it.alphabets.contains(mostFrequentAlphabet) }.toSet() val languageCounts = mutableMapOf() - for (word in words) { - for ((characters, languages) in CHARS_TO_LANGUAGES_MAPPING) { + for ((characters, languages) in CHARS_TO_LANGUAGES_MAPPING) { + val relevantLanguages = languages.intersect(filteredLanguages) + + for (word in words) { for (character in characters) { if (word.contains(character)) { - for (language in languages) { - if (filteredLanguages.contains(language)) { - languageCounts.incrementCounter(language) - } + for (language in relevantLanguages) { + languageCounts.incrementCounter(language) } } } @@ -396,7 +396,7 @@ class LanguageDetector internal constructor( val languagesSubset = languageCounts.filterValues { it >= words.size / 2.0 }.keys return languagesSubset.ifEmpty { - filteredLanguages.toSet() + filteredLanguages } }