diff --git a/docs/filters/script_and_language_identification_filters.md b/docs/filters/script_and_language_identification_filters.md index e1a961e..961898e 100644 --- a/docs/filters/script_and_language_identification_filters.md +++ b/docs/filters/script_and_language_identification_filters.md @@ -40,6 +40,7 @@ Parameters: * `fasttext_model_path`: path for a `fasttext` model (required only for the `fasttext` method; default `null`) * `langid_languages`: limit detection to a list of possible languages (valid only for the `langid` method; default `null`) * `cld2_options`: a dictionary of options for the `cld2` method (valid only for the `cld2` method; default `null`) +* `lingua_mode`: a string specifying whether to use lingua's `high` or `low` accuracy mode (used only by the `lingua` method; default `low`) Returned scores are the language identification confidence scores from a given identification method for the segments. The scores range from 0 to 1. In filtering, all values have to be greater than the minimum thresholds. Negative threshold can be used to skip filtering for a language. diff --git a/docs/installation.md b/docs/installation.md index a4dd14b..160fb75 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -35,6 +35,7 @@ versions from 3.7 to 3.11. * subword_nmt * tqdm * xxhash +* lingua-language-detector See `setup.py` for possible version requirements. 
diff --git a/opusfilter/filters.py b/opusfilter/filters.py index b44dd1b..e4579a0 100644 --- a/opusfilter/filters.py +++ b/opusfilter/filters.py @@ -283,10 +283,11 @@ def accept(self, score): class LanguageIDFilter(FilterABC): """Language identification confidence filter - Currently this supports three methods: + Currently this supports four methods: * langid (default): see :cite:`lui-baldwin-2012-langid` * cld2: see https://github.com/CLD2Owners/cld2 * fasttext: see :cite:`joulin-etal-2016-fasttext` and :cite:`joulin-etal-2017-bag` + * lingua: see https://github.com/pemistahl/lingua-py """ @@ -296,7 +297,7 @@ class LanguageIDFilter(FilterABC): def __init__(self, languages=None, id_method='langid', thresholds=None, fasttext_model_path=None, langid_languages=None, cld2_options=None, - **kwargs): + lingua_mode="low", **kwargs): super().__init__(**kwargs) if languages is None: raise ConfigurationError("A list of language codes needs to be defined") @@ -330,6 +331,18 @@ def __init__(self, languages=None, id_method='langid', thresholds=None, if cld2_options: raise ConfigurationError("cld2_options is supported only by the method cld2") self.cld2_options = None + # lingua options ("high": preloaded language models, "low": low accuracy mode) + if id_method == "lingua": + from lingua import LanguageDetectorBuilder + # TODO: support lingua_languages just like langid_languages + from_languages = LanguageDetectorBuilder.from_all_languages() + if lingua_mode == "high": + self.lingua_detector = from_languages.with_preloaded_language_models().build() + elif lingua_mode == "low": + self.lingua_detector = from_languages.with_low_accuracy_mode().build() + else: + raise ConfigurationError(f"{lingua_mode} lingua mode is not supported.") 
+ # global options self.languages = languages self.id_method = id_method @@ -369,6 +382,16 @@ def confidence(self, sentence: str, lan: str) -> float: liconf = confidence return liconf + if self.id_method == 'lingua': + confidence_values = self.lingua_detector.compute_language_confidence_values(sentence) + lang = confidence_values[0].language + confidence = confidence_values[0].value + if lang.iso_code_639_1.name.lower() != lan: + liconf = 0.0 + else: + liconf = confidence + return liconf + raise ValueError(f"Unknown language identification method '{self.id_method}'") def score(self, pairs: List[Tuple[str, str]]) -> Iterator[List[float]]: diff --git a/requirements.txt b/requirements.txt index 91799cc..a868fba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,3 +22,4 @@ mecab-python3>=1.0.8 unidic-lite==1.0.8 subword-nmt==0.3.8 Morfessor==2.0.6 +lingua-language-detector==2.0.1 diff --git a/setup.py b/setup.py index 4eec909..27e6058 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,8 @@ "requests", "scikit-learn", "subword_nmt", - "tqdm" + "tqdm", + "lingua-language-detector" ] eflomal_require = [ diff --git a/tests/test_filters.py b/tests/test_filters.py index 8db75af..b0b548f 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -354,6 +354,26 @@ def test_accept(self): self.assertEqual(model.accept(pair_score), pair_expected) +class TestLingua(TestLangIDMethod): + + def test_accept(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='lingua', thresholds=[0.4, 0.99]) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + def test_accept_high(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='lingua', lingua_mode="high", thresholds=[0.5, 0.7]) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False] + for pair_score, 
pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + + class TestRepetitionFilter(unittest.TestCase): def test_get_repetition(self):