From 333d7a8e3dad5d07a9e9e10b3b0cd698939b5c77 Mon Sep 17 00:00:00 2001
From: Sami Virpioja <sami.virpioja@helsinki.fi>
Date: Thu, 20 Jun 2024 17:37:46 +0300
Subject: [PATCH] fix unittests for new optional libraries

---
 docs/CHANGELOG.md     |   4 ++
 requirements.txt      |   3 +-
 setup.py              |   2 +-
 tests/test_filters.py | 128 -----------------------------------
 tests/test_lid.py     | 151 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 157 insertions(+), 131 deletions(-)
 create mode 100644 tests/test_lid.py

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 836436c..04e1096 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+- make pycld2 and fasttext libraries optional
+
 ## [3.1.0] - 2024-06-05
 
 ### Added
diff --git a/requirements.txt b/requirements.txt
index b82f47b..285e7e9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 setuptools==65.5.1
 setuptools_scm==6.4.2
-numpy<2.0.0
+numpy>=1.24.4
 opustools
 jieba>=0.42
 beautifulsoup4>=4.8.2
@@ -18,7 +18,6 @@ ruamel.yaml>=0.15.0
 scikit-learn>=0.24.0
 sentence-splitter==1.4
 tqdm>=4.38.0
-fasttext==0.9.2
 mecab-python3>=1.0.8
 unidic-lite==1.0.8
 subword-nmt==0.3.8
diff --git a/setup.py b/setup.py
index a39f586..6161b90 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,6 @@
 
 install_requires = [
     "setuptools",
-    "numpy<2.0.0",
     "opustools",
     "beautifulsoup4>=4.8.0",
     "graphviz",
@@ -31,6 +30,7 @@
 ]
 
 fasttext_require = [
+    "numpy<2.0.0",
     "fasttext"
 ]
 
diff --git a/tests/test_filters.py b/tests/test_filters.py
index b0b548f..554cf9e 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -5,9 +5,7 @@
 import tempfile
 import unittest
 
-from opusfilter import ConfigurationError
 from opusfilter.filters import *
-from opusfilter.util import file_download
 
 
 class TestLengthFilter(unittest.TestCase):
@@ -248,132 +246,6 @@ def test_trilingual_any(self):
             self.assertSequenceEqual(result, correct)
 
 
-class TestLangIDMethod(unittest.TestCase):
-
-    pairs_inputs = [
-        ("This sentence is in english", "Je suis une phrase en français"),
-        ("me llamo bernardo", "je m'appelle Bernard")
-    ]
-
-
-class TestLangId(TestLangIDMethod):
-
-    def test_accept(self):
-        model = LanguageIDFilter(
-            languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99])
-        pair_scores = model.score(self.pairs_inputs)
-        pair_expecteds = [True, False]
-        for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
-            self.assertEqual(model.accept(pair_score), pair_expected)
-
-    def test_accept_with_set_languages(self):
-        model = LanguageIDFilter(
-            languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99],
-            langid_languages=['fr', 'de'])
-        pair_scores = model.score(self.pairs_inputs)
-        pair_expecteds = [False, False]
-        for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
-            self.assertEqual(model.accept(pair_score), pair_expected)
-
-
-class TestCLD2(TestLangIDMethod):
-
-    pairs_inputs = [
-        ("This sentence is in english", "Je suis une phrase en français"),
-        ("me llamo bernardo", "je m'appelle Bernard"),
-        ("english sentence", "phrase français")
-    ]
-
-    def test_accept(self):
-        model = LanguageIDFilter(
-            languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9])
-        pair_scores = model.score(self.pairs_inputs)
-        pair_expecteds = [True, False, False]
-        for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
-            self.assertEqual(model.accept(pair_score), pair_expected)
-
-    def test_accept_with_options(self):
-        model = LanguageIDFilter(
-            languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9],
-            cld2_options={'bestEffort': True})
-        pair_scores = model.score(self.pairs_inputs)
-        pair_expecteds = [True, False, True]
-        for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
-            logging.info('%s %s', pair_score, pair_expected)
-            self.assertEqual(model.accept(pair_score), pair_expected)
-
-
-class TestFasttext(TestLangIDMethod):
-
-    fasttext_inputs = ["This sentence is in english", "Je suis une phrase en français"]
-    model_url = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'
-
-    @classmethod
-    def setUpClass(self):
-        self.tempdir = tempfile.mkdtemp()
-        self.testmodel = os.path.join(self.tempdir, 'model.ftz')
-        try:
-            file_download(self.model_url, self.testmodel)
-        except requests.exceptions.ConnectionError:
-            self.testmodel = None
-
-    @classmethod
-    def tearDownClass(self):
-        shutil.rmtree(self.tempdir)
-
-    def test_missing_model(self):
-        with self.assertRaises(ConfigurationError):
-            model = LanguageIDFilter(
-                languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99])
-
-    def test_wrong_method_with_model(self):
-        with self.assertRaises(ConfigurationError):
-            model = LanguageIDFilter(
-                languages=['en', 'fr'], thresholds=[0.8, 0.99], fasttext_model_path=self.tempdir)
-
-    def test_fasttext_predict_lang(self):
-        if self.testmodel is None:
-            self.skipTest("Failed to download test resources")
-        model = LanguageIDFilter(
-            languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99],
-            fasttext_model_path=self.testmodel)
-        expected = ['en', 'fr']
-        results = [model._fasttext_predict_lang(fasttext_input)[0]
-                   for fasttext_input in self.fasttext_inputs]
-        self.assertSequenceEqual(expected, results)
-
-    def test_accept(self):
-        if self.testmodel is None:
-            self.skipTest("Failed to download test resources")
-        model = LanguageIDFilter(
-            languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99],
-            fasttext_model_path=self.testmodel)
-        pair_scores = model.score(self.pairs_inputs)
-        pair_expecteds = [True, False]
-        for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
-            self.assertEqual(model.accept(pair_score), pair_expected)
-
-
-class TestLingua(TestLangIDMethod):
-
-    def test_accept(self):
-        model = LanguageIDFilter(
-            languages=['en', 'fr'], id_method='lingua', thresholds=[0.4, 0.99])
-        pair_scores = model.score(self.pairs_inputs)
-        pair_expecteds = [True, False]
-        for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
-            self.assertEqual(model.accept(pair_score), pair_expected)
-
-    def test_accept_high(self):
-        model = LanguageIDFilter(
-            languages=['en', 'fr'], id_method='lingua', lingua_mode="high", thresholds=[0.5, 0.7])
-        pair_scores = model.score(self.pairs_inputs)
-        pair_expecteds = [True, False]
-        for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
-            self.assertEqual(model.accept(pair_score), pair_expected)
-
-
-
 class TestRepetitionFilter(unittest.TestCase):
 
     def test_get_repetition(self):
diff --git a/tests/test_lid.py b/tests/test_lid.py
new file mode 100644
index 0000000..ef0eefe
--- /dev/null
+++ b/tests/test_lid.py
@@ -0,0 +1,151 @@
+import logging
+import os
+import shutil
+import tempfile
+import unittest
+
+import requests
+
+from opusfilter import ConfigurationError
+from opusfilter.filters import *
+from opusfilter.util import file_download
+
+
+try:  # optional dependency: TestFasttext.setUpClass skips unless 'fasttext' is in globals()
+    import fasttext
+except ImportError:
+    logging.warning("Could not import fasttext")
+
+try:  # optional dependency: TestCLD2 tests are skipped unless 'pycld2' is in globals()
+    import pycld2
+except ImportError:
+    logging.warning("Could not import pycld2")
+
+
+class TestLangIDMethod(unittest.TestCase):  # base class: defines no tests, only shared inputs
+
+    pairs_inputs = [  # sentence pairs passed to model.score() by the subclasses
+        ("This sentence is in english", "Je suis une phrase en français"),
+        ("me llamo bernardo", "je m'appelle Bernard")
+    ]
+
+
+class TestLangId(TestLangIDMethod):  # LanguageIDFilter with id_method='langid'
+
+    def test_accept(self):  # first pair is expected to be accepted, second rejected
+        model = LanguageIDFilter(
+            languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99])
+        pair_scores = model.score(self.pairs_inputs)
+        pair_expecteds = [True, False]  # expected accept() result for each input pair
+        for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
+            self.assertEqual(model.accept(pair_score), pair_expected)
+
+    def test_accept_with_set_languages(self):  # with langid_languages=['fr', 'de'] both pairs are rejected
+        model = LanguageIDFilter(
+            languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99],
+            langid_languages=['fr', 'de'])
+        pair_scores = model.score(self.pairs_inputs)
+        pair_expecteds = [False, False]  # expected accept() result for each input pair
+        for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
+            self.assertEqual(model.accept(pair_score), pair_expected)
+
+
+class TestCLD2(TestLangIDMethod):  # LanguageIDFilter with id_method='cld2'; tests skipped without pycld2
+
+    pairs_inputs = [  # overrides the base-class inputs, adding a third, shorter pair
+        ("This sentence is in english", "Je suis une phrase en français"),
+        ("me llamo bernardo", "je m'appelle Bernard"),
+        ("english sentence", "phrase français")
+    ]
+
+    @unittest.skipIf('pycld2' not in globals(), 'pycld2 not installed')
+    def test_accept(self):  # without cld2_options the third pair is expected to be rejected
+        model = LanguageIDFilter(
+            languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9])
+        pair_scores = model.score(self.pairs_inputs)
+        pair_expecteds = [True, False, False]  # expected accept() result for each input pair
+        for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
+            self.assertEqual(model.accept(pair_score), pair_expected)
+
+    @unittest.skipIf('pycld2' not in globals(), 'pycld2 not installed')
+    def test_accept_with_options(self):  # with bestEffort=True the third pair is expected to be accepted
+        model = LanguageIDFilter(
+            languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9],
+            cld2_options={'bestEffort': True})
+        pair_scores = model.score(self.pairs_inputs)
+        pair_expecteds = [True, False, True]  # expected accept() result for each input pair
+        for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
+            logging.info('%s %s', pair_score, pair_expected)
+            self.assertEqual(model.accept(pair_score), pair_expected)
+
+
+class TestFasttext(TestLangIDMethod):  # LanguageIDFilter with id_method='fasttext'; skipped without fasttext
+
+    fasttext_inputs = ["This sentence is in english", "Je suis une phrase en français"]
+    model_url = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'
+
+    @classmethod
+    def setUpClass(cls):  # download the LID model once for the whole class
+        if 'fasttext' not in globals():  # check first: tearDownClass is not run if setUpClass raises
+            raise unittest.SkipTest('fasttext not installed')
+        cls.tempdir = tempfile.mkdtemp()  # created only after the skip check, so it cannot leak
+        cls.testmodel = os.path.join(cls.tempdir, 'model.ftz')
+        try:
+            file_download(cls.model_url, cls.testmodel)
+        except requests.exceptions.ConnectionError:
+            cls.testmodel = None  # tests that need the model skip themselves when this is None
+
+    @classmethod
+    def tearDownClass(cls):  # remove the temporary directory created in setUpClass
+        shutil.rmtree(cls.tempdir)
+
+    def test_missing_model(self):  # id_method='fasttext' without a model path must be rejected
+        with self.assertRaises(ConfigurationError):
+            LanguageIDFilter(
+                languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99])
+
+    def test_wrong_method_with_model(self):  # a model path without id_method='fasttext' must be rejected
+        with self.assertRaises(ConfigurationError):
+            LanguageIDFilter(
+                languages=['en', 'fr'], thresholds=[0.8, 0.99], fasttext_model_path=self.tempdir)
+
+    def test_fasttext_predict_lang(self):  # first element of each prediction is the language code
+        if self.testmodel is None:
+            self.skipTest("Failed to download test resources")
+        model = LanguageIDFilter(
+            languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99],
+            fasttext_model_path=self.testmodel)
+        expected = ['en', 'fr']
+        results = [model._fasttext_predict_lang(fasttext_input)[0]
+                   for fasttext_input in self.fasttext_inputs]
+        self.assertSequenceEqual(expected, results)
+
+    def test_accept(self):  # first pair is expected to be accepted, second rejected
+        if self.testmodel is None:
+            self.skipTest("Failed to download test resources")
+        model = LanguageIDFilter(
+            languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99],
+            fasttext_model_path=self.testmodel)
+        pair_scores = model.score(self.pairs_inputs)
+        pair_expecteds = [True, False]  # expected accept() result for each input pair
+        for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
+            self.assertEqual(model.accept(pair_score), pair_expected)
+
+
+class TestLingua(TestLangIDMethod):  # LanguageIDFilter with id_method='lingua'
+
+    def test_accept(self):  # lingua_mode not given; first pair expected accepted, second rejected
+        model = LanguageIDFilter(
+            languages=['en', 'fr'], id_method='lingua', thresholds=[0.4, 0.99])
+        pair_scores = model.score(self.pairs_inputs)
+        pair_expecteds = [True, False]  # expected accept() result for each input pair
+        for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
+            self.assertEqual(model.accept(pair_score), pair_expected)
+
+    def test_accept_high(self):  # lingua_mode="high" with different thresholds; same expected outcome
+        model = LanguageIDFilter(
+            languages=['en', 'fr'], id_method='lingua', lingua_mode="high", thresholds=[0.5, 0.7])
+        pair_scores = model.score(self.pairs_inputs)
+        pair_expecteds = [True, False]  # expected accept() result for each input pair
+        for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
+            self.assertEqual(model.accept(pair_score), pair_expected)