Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lingua-py support for language identification #65

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/filters/script_and_language_identification_filters.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ Parameters:
* `fasttext_model_path`: path for a `fasttext` model (required only for the `fasttext` method; default `null`)
* `langid_languages`: limit detection to a list of possible languages (valid only for the `langid` method; default `null`)
* `cld2_options`: a dictionary of options for the `cld2` method (valid only for the `cld2` method; default `null`)
* `lingua_mode`: a string selecting lingua's `high` or `low` accuracy mode (used only by the `lingua` method; default `low`)

Returned scores are the language identification confidence scores from a given identification method for the segments. The scores range from 0 to 1. In filtering, all values have to be greater than the minimum thresholds. Negative threshold can be used to skip filtering for a language.

Expand Down
1 change: 1 addition & 0 deletions docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ versions from 3.7 to 3.11.
* subword_nmt
* tqdm
* xxhash
* lingua-language-detector

See `setup.py` for possible version requirements.

Expand Down
27 changes: 25 additions & 2 deletions opusfilter/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,10 +283,11 @@ def accept(self, score):
class LanguageIDFilter(FilterABC):
"""Language identification confidence filter

Currently this supports three methods:
Currently this supports four methods:
* langid (default): see :cite:`lui-baldwin-2012-langid`
* cld2: see https://github.com/CLD2Owners/cld2
* fasttext: see :cite:`joulin-etal-2016-fasttext` and :cite:`joulin-etal-2017-bag`
* lingua-py: see https://github.com/pemistahl/lingua-py

"""

Expand All @@ -296,7 +297,7 @@ class LanguageIDFilter(FilterABC):

def __init__(self, languages=None, id_method='langid', thresholds=None,
fasttext_model_path=None, langid_languages=None, cld2_options=None,
**kwargs):
lingua_mode="low", **kwargs):
super().__init__(**kwargs)
if languages is None:
raise ConfigurationError("A list of language codes needs to be defined")
Expand Down Expand Up @@ -330,6 +331,18 @@ def __init__(self, languages=None, id_method='langid', thresholds=None,
if cld2_options:
raise ConfigurationError("cld2_options is supported only by the method cld2")
self.cld2_options = None
# lingua mode
if id_method == "lingua":
from lingua import LanguageDetectorBuilder
# TODO: support lingua_languages just like langid_languages
from_languages = LanguageDetectorBuilder.from_all_languages()
if lingua_mode == "high":
self.lingua_detector = from_languages.with_preloaded_language_models().build()
elif lingua_mode == "low":
self.lingua_detector = from_languages.with_low_accuracy_mode().build()
else:
assert False, f"{lingua_mode} lingua mode is not supported."

# global options
self.languages = languages
self.id_method = id_method
Expand Down Expand Up @@ -369,6 +382,16 @@ def confidence(self, sentence: str, lan: str) -> float:
liconf = confidence
return liconf

if self.id_method == 'lingua':
confidence_values = self.lingua_detector.compute_language_confidence_values(sentence)
lang = confidence_values[0].language
confidence = confidence_values[0].value
if lang.iso_code_639_1.name.lower() != lan:
liconf = 0.0
else:
liconf = confidence
return liconf

raise ValueError(f"Unknown language identification method '{self.id_method}'")

def score(self, pairs: List[Tuple[str, str]]) -> Iterator[List[float]]:
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ mecab-python3>=1.0.8
unidic-lite==1.0.8
subword-nmt==0.3.8
Morfessor==2.0.6
lingua-language-detector==2.0.1
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
"requests",
"scikit-learn",
"subword_nmt",
"tqdm"
"tqdm",
"lingua-language-detector"
]

eflomal_require = [
Expand Down
20 changes: 20 additions & 0 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,26 @@ def test_accept(self):
self.assertEqual(model.accept(pair_score), pair_expected)


class TestLingua(TestLangIDMethod):
    """Acceptance tests for the ``lingua`` method of ``LanguageIDFilter``."""

    def _assert_decisions(self, expected, **filter_options):
        """Score the shared input pairs with a lingua filter and check accept().

        ``expected`` holds the accept/reject decision wanted for each scored
        pair, in order; ``filter_options`` are forwarded to the filter
        constructor on top of the fixed languages and identification method.
        """
        lingua_filter = LanguageIDFilter(
            languages=['en', 'fr'], id_method='lingua', **filter_options)
        scored = lingua_filter.score(self.pairs_inputs)
        for scores, wanted in zip(scored, expected):
            self.assertEqual(lingua_filter.accept(scores), wanted)

    def test_accept(self):
        """No explicit ``lingua_mode``: the filter's default mode is used."""
        self._assert_decisions([True, False], thresholds=[0.4, 0.99])

    def test_accept_high(self):
        """Explicit ``lingua_mode="high"`` with its own thresholds."""
        self._assert_decisions(
            [True, False], lingua_mode="high", thresholds=[0.5, 0.7])



class TestRepetitionFilter(unittest.TestCase):

def test_get_repetition(self):
Expand Down
Loading