Helsinki-NLP · marco-c · Nov 24, 2023
diff --git a/docs/filters/script_and_language_identification_filters.md b/docs/filters/script_and_language_identification_filters.md
@@ -40,6 +40,7 @@ Parameters:
 * `fasttext_model_path`: path for a `fasttext` model (required only for the `fasttext` method; default `null`)
 * `langid_languages`: limit detection to a list of possible languages (valid only for the `langid` method; default `null`)
 * `cld2_options`: a dictionary of options for the `cld2` method (valid only for the `cld2` method; default `null`)
+* `lingua_mode`: a string specifying whether to use lingua's `high` or `low` accuracy mode (used only by the `lingua` method; default `low`)
 
 Returned scores are the language identification confidence scores from a given identification method for the segments. The scores range from 0 to 1. In filtering, all values have to be greater than the minimum thresholds. Negative threshold can be used to skip filtering for a language.
 

diff --git a/docs/installation.md b/docs/installation.md
@@ -35,6 +35,7 @@ versions from 3.7 to 3.11.
 * subword_nmt
 * tqdm
 * xxhash
+* lingua-language-detector
 
 See `setup.py` for possible version requirements.
 

diff --git a/opusfilter/filters.py b/opusfilter/filters.py
@@ -283,10 +283,11 @@ def accept(self, score):
 class LanguageIDFilter(FilterABC):
     """Language identification confidence filter
 
-    Currently this supports three methods:
+    Currently this supports four methods:
     * langid (default): see :cite:`lui-baldwin-2012-langid`
     * cld2: see https://github.com/CLD2Owners/cld2
     * fasttext: see :cite:`joulin-etal-2016-fasttext` and :cite:`joulin-etal-2017-bag`
+    * lingua: see https://github.com/pemistahl/lingua-py
 
     """
 
@@ -296,7 +297,7 @@ class LanguageIDFilter(FilterABC):
 
     def __init__(self, languages=None, id_method='langid', thresholds=None,
                  fasttext_model_path=None, langid_languages=None, cld2_options=None,
-                 **kwargs):
+                 lingua_mode="low", **kwargs):
         super().__init__(**kwargs)
         if languages is None:
             raise ConfigurationError("A list of language codes needs to be defined")
@@ -330,6 +331,18 @@ def __init__(self, languages=None, id_method='langid', thresholds=None,
             if cld2_options:
                 raise ConfigurationError("cld2_options is supported only by the method cld2")
             self.cld2_options = None
+        # lingua mode: "high" preloads the language models, "low" builds a low-accuracy detector
+        if id_method == "lingua":
+            from lingua import LanguageDetectorBuilder
+            # TODO: support lingua_languages just like langid_languages
+            from_languages = LanguageDetectorBuilder.from_all_languages()
+            if lingua_mode == "high":
+                self.lingua_detector = from_languages.with_preloaded_language_models().build()
+            elif lingua_mode == "low":
+                self.lingua_detector = from_languages.with_low_accuracy_mode().build()
+            else:  # raise instead of assert: assert statements are removed under "python -O"
+                raise ConfigurationError(f"{lingua_mode} lingua mode is not supported.")
+
         # global options
         self.languages = languages
         self.id_method = id_method
@@ -369,6 +382,16 @@ def confidence(self, sentence: str, lan: str) -> float:
                 liconf = confidence
             return liconf
 
+        if self.id_method == 'lingua':
+            # Index 0 is taken as the best guess (lingua documents this list as sorted by confidence).
+            ranked = self.lingua_detector.compute_language_confidence_values(sentence)
+            best = ranked[0]
+            # The ISO 639-1 enum name is lower-cased before comparing it with the requested code.
+            if best.language.iso_code_639_1.name.lower() == lan:
+                return best.value
+            # The top candidate is some other language, so `lan` gets zero confidence.
+            return 0.0
+
         raise ValueError(f"Unknown language identification method '{self.id_method}'")
 
     def score(self, pairs: List[Tuple[str, str]]) -> Iterator[List[float]]:

diff --git a/requirements.txt b/requirements.txt
@@ -22,3 +22,4 @@ mecab-python3>=1.0.8
 unidic-lite==1.0.8
 subword-nmt==0.3.8
 Morfessor==2.0.6
+lingua-language-detector==2.0.1
diff --git a/setup.py b/setup.py
@@ -23,7 +23,8 @@
     "requests",
     "scikit-learn",
     "subword_nmt",
-    "tqdm"
+    "tqdm",
+    "lingua-language-detector"
 ]
 
 eflomal_require = [

diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -354,6 +354,26 @@ def test_accept(self):
             self.assertEqual(model.accept(pair_score), pair_expected)
 
 
+class TestLingua(TestLangIDMethod):
+
+    def test_accept(self):
+        """Default lingua mode: first input pair is accepted, second is rejected."""
+        lid_filter = LanguageIDFilter(
+            languages=['en', 'fr'], id_method='lingua', thresholds=[0.4, 0.99])
+        scored_pairs = lid_filter.score(self.pairs_inputs)
+        for scores, expected in zip(scored_pairs, [True, False]):
+            self.assertEqual(lid_filter.accept(scores), expected)
+
+    def test_accept_high(self):
+        """High accuracy lingua mode: first input pair is accepted, second is rejected."""
+        lid_filter = LanguageIDFilter(
+            languages=['en', 'fr'], id_method='lingua', lingua_mode="high", thresholds=[0.5, 0.7])
+        scored_pairs = lid_filter.score(self.pairs_inputs)
+        for scores, expected in zip(scored_pairs, [True, False]):
+            self.assertEqual(lid_filter.accept(scores), expected)
+
+
+
 class TestRepetitionFilter(unittest.TestCase):
 
     def test_get_repetition(self):