From 333d7a8e3dad5d07a9e9e10b3b0cd698939b5c77 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Thu, 20 Jun 2024 17:37:46 +0300 Subject: [PATCH] fix unittests for new optional libraries --- docs/CHANGELOG.md | 4 ++ requirements.txt | 3 +- setup.py | 2 +- tests/test_filters.py | 128 ----------------------------------- tests/test_lid.py | 151 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 157 insertions(+), 131 deletions(-) create mode 100644 tests/test_lid.py diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 836436c..04e1096 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- make pycld2 and fasttext libraries optional + ## [3.1.0] - 2024-06-05 ### Added diff --git a/requirements.txt b/requirements.txt index b82f47b..285e7e9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ setuptools==65.5.1 setuptools_scm==6.4.2 -numpy<2.0.0 +numpy>=1.24.4 opustools jieba>=0.42 beautifulsoup4>=4.8.2 @@ -18,7 +18,6 @@ ruamel.yaml>=0.15.0 scikit-learn>=0.24.0 sentence-splitter==1.4 tqdm>=4.38.0 -fasttext==0.9.2 mecab-python3>=1.0.8 unidic-lite==1.0.8 subword-nmt==0.3.8 diff --git a/setup.py b/setup.py index a39f586..6161b90 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,6 @@ install_requires = [ "setuptools", - "numpy<2.0.0", "opustools", "beautifulsoup4>=4.8.0", "graphviz", @@ -31,6 +30,7 @@ ] fasttext_require = [ + "numpy<2.0.0", "fasttext" ] diff --git a/tests/test_filters.py b/tests/test_filters.py index b0b548f..554cf9e 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -5,9 +5,7 @@ import tempfile import unittest -from opusfilter import ConfigurationError from opusfilter.filters import * -from opusfilter.util import file_download class TestLengthFilter(unittest.TestCase): @@ -248,132 +246,6 @@ def test_trilingual_any(self): self.assertSequenceEqual(result, correct) -class TestLangIDMethod(unittest.TestCase): - - pairs_inputs = [ - ("This sentence is in english", "Je suis une phrase en français"), - ("me llamo bernardo", "je m'appelle Bernard") - ] - - -class TestLangId(TestLangIDMethod): - - def test_accept(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99]) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - def test_accept_with_set_languages(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99], - langid_languages=['fr', 'de']) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [False, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - -class TestCLD2(TestLangIDMethod): - - pairs_inputs = [ - ("This sentence is in english", "Je suis une phrase en français"), - ("me llamo bernardo", "je m'appelle Bernard"), - ("english sentence", "phrase français") - ] - - def test_accept(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9]) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - def test_accept_with_options(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9], - cld2_options={'bestEffort': True}) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False, True] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - logging.info('%s %s', pair_score, pair_expected) - self.assertEqual(model.accept(pair_score), pair_expected) - - -class TestFasttext(TestLangIDMethod): - - fasttext_inputs = ["This sentence is in english", "Je suis une phrase en français"] - model_url = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz' - - @classmethod - def setUpClass(self): - self.tempdir = tempfile.mkdtemp() - self.testmodel = os.path.join(self.tempdir, 'model.ftz') - try: - file_download(self.model_url, self.testmodel) - except requests.exceptions.ConnectionError: - self.testmodel = None - - @classmethod - def tearDownClass(self): - shutil.rmtree(self.tempdir) - - def test_missing_model(self): - with self.assertRaises(ConfigurationError): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99]) - - def test_wrong_method_with_model(self): - with self.assertRaises(ConfigurationError): - model = LanguageIDFilter( - languages=['en', 'fr'], thresholds=[0.8, 0.99], fasttext_model_path=self.tempdir) - - def test_fasttext_predict_lang(self): - if self.testmodel is None: - self.skipTest("Failed to download test resources") - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99], - fasttext_model_path=self.testmodel) - expected = ['en', 'fr'] - results = [model._fasttext_predict_lang(fasttext_input)[0] - for fasttext_input in self.fasttext_inputs] - self.assertSequenceEqual(expected, results) - - def test_accept(self): - if self.testmodel is None: - self.skipTest("Failed to download test resources") - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99], - fasttext_model_path=self.testmodel) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - -class TestLingua(TestLangIDMethod): - - def test_accept(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='lingua', thresholds=[0.4, 0.99]) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - def test_accept_high(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='lingua', lingua_mode="high", thresholds=[0.5, 0.7]) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - - class TestRepetitionFilter(unittest.TestCase): def test_get_repetition(self): diff --git a/tests/test_lid.py b/tests/test_lid.py new file mode 100644 index 0000000..ef0eefe --- /dev/null +++ b/tests/test_lid.py @@ -0,0 +1,151 @@ +import logging +import os +import shutil +import tempfile +import unittest + +import requests + +from opusfilter import ConfigurationError +from opusfilter.filters import * +from opusfilter.util import file_download + + +try: + import fasttext +except ImportError: + logging.warning("Could not import fasttext") + +try: + import pycld2 +except ImportError: + logging.warning("Could not import pycld2") + + +class TestLangIDMethod(unittest.TestCase): + + pairs_inputs = [ + ("This sentence is in english", "Je suis une phrase en français"), + ("me llamo bernardo", "je m'appelle Bernard") + ] + + +class TestLangId(TestLangIDMethod): + + def test_accept(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99]) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + def test_accept_with_set_languages(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99], + langid_languages=['fr', 'de']) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [False, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + +class TestCLD2(TestLangIDMethod): + + pairs_inputs = [ + ("This sentence is in english", "Je suis une phrase en français"), + ("me llamo bernardo", "je m'appelle Bernard"), + ("english sentence", "phrase français") + ] + + @unittest.skipIf('pycld2' not in globals(), 'pycld2 not installed') + def test_accept(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9]) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + @unittest.skipIf('pycld2' not in globals(), 'pycld2 not installed') + def test_accept_with_options(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9], + cld2_options={'bestEffort': True}) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False, True] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + logging.info('%s %s', pair_score, pair_expected) + self.assertEqual(model.accept(pair_score), pair_expected) + + +class TestFasttext(TestLangIDMethod): + + fasttext_inputs = ["This sentence is in english", "Je suis une phrase en français"] + model_url = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz' + + @classmethod + def setUpClass(self): + self.tempdir = tempfile.mkdtemp() + if 'fasttext' not in globals(): + raise unittest.SkipTest('fasttext not installed') + self.testmodel = os.path.join(self.tempdir, 'model.ftz') + try: + file_download(self.model_url, self.testmodel) + except requests.exceptions.ConnectionError: + self.testmodel = None + + @classmethod + def tearDownClass(self): + shutil.rmtree(self.tempdir) + + def test_missing_model(self): + with self.assertRaises(ConfigurationError): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99]) + + def test_wrong_method_with_model(self): + with self.assertRaises(ConfigurationError): + model = LanguageIDFilter( + languages=['en', 'fr'], thresholds=[0.8, 0.99], fasttext_model_path=self.tempdir) + + def test_fasttext_predict_lang(self): + if self.testmodel is None: + self.skipTest("Failed to download test resources") + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99], + fasttext_model_path=self.testmodel) + expected = ['en', 'fr'] + results = [model._fasttext_predict_lang(fasttext_input)[0] + for fasttext_input in self.fasttext_inputs] + self.assertSequenceEqual(expected, results) + + def test_accept(self): + if self.testmodel is None: + self.skipTest("Failed to download test resources") + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99], + fasttext_model_path=self.testmodel) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + +class TestLingua(TestLangIDMethod): + + def test_accept(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='lingua', thresholds=[0.4, 0.99]) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + def test_accept_high(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='lingua', lingua_mode="high", thresholds=[0.5, 0.7]) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected)