Skip to content

Commit

Permalink
Add lingua-py support for language identification
Browse files Browse the repository at this point in the history
  • Loading branch information
marco-c committed Nov 24, 2023
1 parent 1d3b703 commit 1ccb30d
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 3 deletions.
1 change: 1 addition & 0 deletions docs/filters/script_and_language_identification_filters.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ Parameters:
* `fasttext_model_path`: path for a `fasttext` model (required only for the `fasttext` method; default `null`)
* `langid_languages`: limit detection to a list of possible languages (valid only for the `langid` method; default `null`)
* `cld2_options`: a dictionary of options for the `cld2` method (valid only for the `cld2` method; default `null`)
* `lingua_mode`: a string specifying whether to use lingua's `high` or `low` accuracy mode (used only by the `lingua` method; default `low`)

Returned scores are the language identification confidence scores from a given identification method for the segments. The scores range from 0 to 1. In filtering, all values have to be greater than the minimum thresholds. A negative threshold can be used to skip filtering for a language.

Expand Down
1 change: 1 addition & 0 deletions docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ versions from 3.7 to 3.11.
* subword_nmt
* tqdm
* xxhash
* lingua-language-detector

See `setup.py` for possible version requirements.

Expand Down
27 changes: 25 additions & 2 deletions opusfilter/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,10 +283,11 @@ def accept(self, score):
class LanguageIDFilter(FilterABC):
"""Language identification confidence filter
Currently this supports three methods:
Currently this supports four methods:
* langid (default): see :cite:`lui-baldwin-2012-langid`
* cld2: see https://github.com/CLD2Owners/cld2
* fasttext: see :cite:`joulin-etal-2016-fasttext` and :cite:`joulin-etal-2017-bag`
* lingua-py: see https://github.com/pemistahl/lingua-py
"""

Expand All @@ -296,7 +297,7 @@ class LanguageIDFilter(FilterABC):

def __init__(self, languages=None, id_method='langid', thresholds=None,
fasttext_model_path=None, langid_languages=None, cld2_options=None,
**kwargs):
lingua_mode="low", **kwargs):
super().__init__(**kwargs)
if languages is None:
raise ConfigurationError("A list of language codes needs to be defined")
Expand Down Expand Up @@ -330,6 +331,18 @@ def __init__(self, languages=None, id_method='langid', thresholds=None,
if cld2_options:
raise ConfigurationError("cld2_options is supported only by the method cld2")
self.cld2_options = None
# lingua mode
if id_method == "lingua":
from lingua import LanguageDetectorBuilder
# TODO: support lingua_languages just like langid_languages
from_languages = LanguageDetectorBuilder.from_all_languages()
if lingua_mode == "high":
self.lingua_detector = from_languages.with_preloaded_language_models().build()
elif lingua_mode == "low":
self.lingua_detector = from_languages.with_low_accuracy_mode().build()
else:
assert False, f"{lingua_mode} lingua mode is not supported."

# global options
self.languages = languages
self.id_method = id_method
Expand Down Expand Up @@ -369,6 +382,16 @@ def confidence(self, sentence: str, lan: str) -> float:
liconf = confidence
return liconf

if self.id_method == 'lingua':
confidence_values = self.lingua_detector.compute_language_confidence_values(sentence)
lang = confidence_values[0].language
confidence = confidence_values[0].value
if lang.iso_code_639_1.name.lower() != lan:
liconf = 0.0
else:
liconf = confidence
return liconf

raise ValueError(f"Unknown language identification method '{self.id_method}'")

def score(self, pairs: List[Tuple[str, str]]) -> Iterator[List[float]]:
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ mecab-python3>=1.0.8
unidic-lite==1.0.8
subword-nmt==0.3.8
Morfessor==2.0.6
lingua-language-detector==2.0.1
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
"requests",
"scikit-learn",
"subword_nmt",
"tqdm"
"tqdm",
"lingua-language-detector"
]

eflomal_require = [
Expand Down
20 changes: 20 additions & 0 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,26 @@ def test_accept(self):
self.assertEqual(model.accept(pair_score), pair_expected)


class TestLingua(TestLangIDMethod):
    """Acceptance tests for LanguageIDFilter using the 'lingua' identification method."""

    def _assert_lingua_decisions(self, model, expected):
        """Score the shared input pairs and check every accept decision at once.

        The decisions are collected into a list and compared as a whole.
        Zipping scores with expectations would stop at the shorter of the
        two, so a filter that yields too few (or no) scores would let the
        test pass without checking anything.
        """
        # NOTE(review): pairs_inputs is not defined in this class; presumably
        # it is provided by TestLangIDMethod -- confirm.
        decisions = [
            model.accept(pair_score)
            for pair_score in model.score(self.pairs_inputs)
        ]
        self.assertEqual(decisions, expected)

    def test_accept(self):
        """No lingua_mode is passed, so the filter's default mode is used."""
        model = LanguageIDFilter(
            languages=['en', 'fr'], id_method='lingua', thresholds=[0.4, 0.99])
        self._assert_lingua_decisions(model, [True, False])

    def test_accept_high(self):
        """Explicitly request lingua's "high" accuracy mode."""
        model = LanguageIDFilter(
            languages=['en', 'fr'], id_method='lingua', lingua_mode="high",
            thresholds=[0.5, 0.7])
        self._assert_lingua_decisions(model, [True, False])



class TestRepetitionFilter(unittest.TestCase):

def test_get_repetition(self):
Expand Down

0 comments on commit 1ccb30d

Please sign in to comment.