Skip to content

Commit

Permalink
fix unittests for new optional libraries
Browse files Browse the repository at this point in the history
  • Loading branch information
svirpioj committed Jun 20, 2024
1 parent d88fb8d commit 333d7a8
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 131 deletions.
4 changes: 4 additions & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

- make pycld2 and fasttext libraries optional

## [3.1.0] - 2024-06-05

### Added
Expand Down
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
setuptools==65.5.1
setuptools_scm==6.4.2
numpy<2.0.0
numpy>=1.24.4
opustools
jieba>=0.42
beautifulsoup4>=4.8.2
Expand All @@ -18,7 +18,6 @@ ruamel.yaml>=0.15.0
scikit-learn>=0.24.0
sentence-splitter==1.4
tqdm>=4.38.0
fasttext==0.9.2
mecab-python3>=1.0.8
unidic-lite==1.0.8
subword-nmt==0.3.8
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

install_requires = [
"setuptools",
"numpy<2.0.0",
"opustools",
"beautifulsoup4>=4.8.0",
"graphviz",
Expand All @@ -31,6 +30,7 @@
]

# Packages needed only for fasttext support; numpy is capped below 2.0.0 here.
fasttext_require = ["numpy<2.0.0", "fasttext"]

Expand Down
128 changes: 0 additions & 128 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
import tempfile
import unittest

from opusfilter import ConfigurationError
from opusfilter.filters import *
from opusfilter.util import file_download


class TestLengthFilter(unittest.TestCase):
Expand Down Expand Up @@ -248,132 +246,6 @@ def test_trilingual_any(self):
self.assertSequenceEqual(result, correct)


class TestLangIDMethod(unittest.TestCase):
    """Common base for the language identification test cases.

    Provides the sentence pairs that subclasses read via ``self.pairs_inputs``.
    """

    # Two (source, target) sentence pairs scored by the subclasses' tests.
    pairs_inputs = [
        ("This sentence is in english", "Je suis une phrase en français"),
        ("me llamo bernardo", "je m'appelle Bernard"),
    ]


class TestLangId(TestLangIDMethod):
    """Exercise LanguageIDFilter with ``id_method='langid'``."""

    def _check_decisions(self, lid_filter, expected_decisions):
        """Score the shared pairs and compare each accept() result to its expectation."""
        scores = lid_filter.score(self.pairs_inputs)
        for score, expected in zip(scores, expected_decisions):
            self.assertEqual(lid_filter.accept(score), expected)

    def test_accept(self):
        """Without ``langid_languages``, only the first pair is expected to pass."""
        lid_filter = LanguageIDFilter(
            languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99])
        self._check_decisions(lid_filter, [True, False])

    def test_accept_with_set_languages(self):
        """With ``langid_languages=['fr', 'de']``, both pairs are expected to fail."""
        lid_filter = LanguageIDFilter(
            languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99],
            langid_languages=['fr', 'de'])
        self._check_decisions(lid_filter, [False, False])


class TestCLD2(TestLangIDMethod):
    """Exercise LanguageIDFilter with ``id_method='cld2'``."""

    # Overrides the base pairs: adds a third, very short pair.
    pairs_inputs = [
        ("This sentence is in english", "Je suis une phrase en français"),
        ("me llamo bernardo", "je m'appelle Bernard"),
        ("english sentence", "phrase français"),
    ]

    def test_accept(self):
        """With no cld2 options, only the first pair is expected to pass."""
        lid_filter = LanguageIDFilter(
            languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9])
        expected_decisions = [True, False, False]
        scores = lid_filter.score(self.pairs_inputs)
        for score, expected in zip(scores, expected_decisions):
            self.assertEqual(lid_filter.accept(score), expected)

    def test_accept_with_options(self):
        """With ``bestEffort`` enabled, the third pair is expected to pass as well."""
        lid_filter = LanguageIDFilter(
            languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9],
            cld2_options={'bestEffort': True})
        expected_decisions = [True, False, True]
        scores = lid_filter.score(self.pairs_inputs)
        for score, expected in zip(scores, expected_decisions):
            # Log each score next to its expectation before asserting.
            logging.info('%s %s', score, expected)
            self.assertEqual(lid_filter.accept(score), expected)


class TestFasttext(TestLangIDMethod):
    """Exercise LanguageIDFilter with ``id_method='fasttext'``.

    The model file is downloaded once per class into a temporary directory.
    Tests that need it are skipped when the download fails with a
    connection error.
    """

    fasttext_inputs = ["This sentence is in english", "Je suis une phrase en français"]
    model_url = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'

    @classmethod
    def setUpClass(cls):
        """Create the temporary directory and try to fetch the model into it."""
        cls.tempdir = tempfile.mkdtemp()
        cls.testmodel = os.path.join(cls.tempdir, 'model.ftz')
        try:
            file_download(cls.model_url, cls.testmodel)
        except requests.exceptions.ConnectionError:
            # None marks the model as unavailable for the tests below.
            cls.testmodel = None

    @classmethod
    def tearDownClass(cls):
        """Delete the temporary directory and everything in it."""
        shutil.rmtree(cls.tempdir)

    def _require_model(self):
        """Skip the running test if the model could not be downloaded."""
        if self.testmodel is None:
            self.skipTest("Failed to download test resources")

    def test_missing_model(self):
        """Selecting fasttext without a model path must raise ConfigurationError."""
        with self.assertRaises(ConfigurationError):
            LanguageIDFilter(
                languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99])

    def test_wrong_method_with_model(self):
        """Giving a model path without selecting fasttext must raise ConfigurationError."""
        with self.assertRaises(ConfigurationError):
            LanguageIDFilter(
                languages=['en', 'fr'], thresholds=[0.8, 0.99], fasttext_model_path=self.tempdir)

    def test_fasttext_predict_lang(self):
        """The first element of each prediction must be the expected language code."""
        self._require_model()
        lid_filter = LanguageIDFilter(
            languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99],
            fasttext_model_path=self.testmodel)
        expected = ['en', 'fr']
        predicted = []
        for sentence in self.fasttext_inputs:
            predicted.append(lid_filter._fasttext_predict_lang(sentence)[0])
        self.assertSequenceEqual(expected, predicted)

    def test_accept(self):
        """Only the first shared pair is expected to pass."""
        self._require_model()
        lid_filter = LanguageIDFilter(
            languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99],
            fasttext_model_path=self.testmodel)
        expected_decisions = [True, False]
        scores = lid_filter.score(self.pairs_inputs)
        for score, expected in zip(scores, expected_decisions):
            self.assertEqual(lid_filter.accept(score), expected)


class TestLingua(TestLangIDMethod):
    """Exercise LanguageIDFilter with ``id_method='lingua'``."""

    def _check_decisions(self, lid_filter, expected_decisions):
        """Score the shared pairs and compare each accept() result to its expectation."""
        scores = lid_filter.score(self.pairs_inputs)
        for score, expected in zip(scores, expected_decisions):
            self.assertEqual(lid_filter.accept(score), expected)

    def test_accept(self):
        """Without ``lingua_mode``, only the first pair is expected to pass."""
        lid_filter = LanguageIDFilter(
            languages=['en', 'fr'], id_method='lingua', thresholds=[0.4, 0.99])
        self._check_decisions(lid_filter, [True, False])

    def test_accept_high(self):
        """With ``lingua_mode="high"``, only the first pair is expected to pass."""
        lid_filter = LanguageIDFilter(
            languages=['en', 'fr'], id_method='lingua', lingua_mode="high", thresholds=[0.5, 0.7])
        self._check_decisions(lid_filter, [True, False])



class TestRepetitionFilter(unittest.TestCase):

def test_get_repetition(self):
Expand Down
151 changes: 151 additions & 0 deletions tests/test_lid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import logging
import os
import shutil
import tempfile
import unittest

import requests

from opusfilter import ConfigurationError
from opusfilter.filters import *
from opusfilter.util import file_download


try:
import fasttext
except ImportError:
logging.warning("Could not import fasttext")

try:
import pycld2
except ImportError:
logging.warning("Could not import pycld2")


class TestLangIDMethod(unittest.TestCase):
    """Shared fixture holder for the language identification tests.

    Subclasses score ``pairs_inputs`` with a LanguageIDFilter instance.
    """

    # Two (source, target) sentence pairs used as default scoring input.
    pairs_inputs = [
        ("This sentence is in english", "Je suis une phrase en français"),
        ("me llamo bernardo", "je m'appelle Bernard"),
    ]


class TestLangId(TestLangIDMethod):
    """Tests for LanguageIDFilter using the ``langid`` method."""

    def _assert_accepts(self, lid_filter, expected_decisions):
        """Check accept() on every score of the shared pairs, in order."""
        scores = lid_filter.score(self.pairs_inputs)
        for score, expected in zip(scores, expected_decisions):
            self.assertEqual(lid_filter.accept(score), expected)

    def test_accept(self):
        """Without ``langid_languages``: first pair passes, second fails."""
        lid_filter = LanguageIDFilter(
            languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99])
        self._assert_accepts(lid_filter, [True, False])

    def test_accept_with_set_languages(self):
        """With ``langid_languages=['fr', 'de']``: neither pair passes."""
        lid_filter = LanguageIDFilter(
            languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99],
            langid_languages=['fr', 'de'])
        self._assert_accepts(lid_filter, [False, False])


class TestCLD2(TestLangIDMethod):
    """Tests for LanguageIDFilter using the ``cld2`` method.

    Each test is skipped when the module-level ``import pycld2`` did not
    succeed, i.e. when the name ``pycld2`` is absent from the module globals.
    """

    # Overrides the base pairs: adds a third, very short pair.
    pairs_inputs = [
        ("This sentence is in english", "Je suis une phrase en français"),
        ("me llamo bernardo", "je m'appelle Bernard"),
        ("english sentence", "phrase français"),
    ]

    @unittest.skipIf('pycld2' not in globals(), 'pycld2 not installed')
    def test_accept(self):
        """With no cld2 options: only the first pair passes."""
        lid_filter = LanguageIDFilter(
            languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9])
        expected_decisions = [True, False, False]
        scores = lid_filter.score(self.pairs_inputs)
        for score, expected in zip(scores, expected_decisions):
            self.assertEqual(lid_filter.accept(score), expected)

    @unittest.skipIf('pycld2' not in globals(), 'pycld2 not installed')
    def test_accept_with_options(self):
        """With ``bestEffort`` enabled: the third pair passes as well."""
        lid_filter = LanguageIDFilter(
            languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9],
            cld2_options={'bestEffort': True})
        expected_decisions = [True, False, True]
        scores = lid_filter.score(self.pairs_inputs)
        for score, expected in zip(scores, expected_decisions):
            # Log each score next to its expectation before asserting.
            logging.info('%s %s', score, expected)
            self.assertEqual(lid_filter.accept(score), expected)


class TestFasttext(TestLangIDMethod):
    """Tests for LanguageIDFilter using the ``fasttext`` method.

    The whole class is skipped when the module-level ``import fasttext`` did
    not succeed. Otherwise the model file is downloaded once per class into a
    temporary directory. Tests that need the model are skipped individually
    when the download fails with a connection error.
    """

    fasttext_inputs = ["This sentence is in english", "Je suis une phrase en français"]
    model_url = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'

    @classmethod
    def setUpClass(cls):
        """Skip if fasttext is missing, else create the tempdir and fetch the model.

        Raises:
            unittest.SkipTest: if ``fasttext`` is not in the module globals.
        """
        # Check for the optional dependency BEFORE allocating anything:
        # unittest does not run tearDownClass when setUpClass raises, so a
        # directory created ahead of the skip would never be removed.
        if 'fasttext' not in globals():
            raise unittest.SkipTest('fasttext not installed')
        cls.tempdir = tempfile.mkdtemp()
        cls.testmodel = os.path.join(cls.tempdir, 'model.ftz')
        try:
            file_download(cls.model_url, cls.testmodel)
        except requests.exceptions.ConnectionError:
            # None marks the model as unavailable; dependent tests skip.
            cls.testmodel = None
        except Exception:
            # Any other failure aborts setUpClass, which means tearDownClass
            # will not run: clean up here before propagating the error.
            shutil.rmtree(cls.tempdir, ignore_errors=True)
            raise

    @classmethod
    def tearDownClass(cls):
        """Delete the temporary directory created in setUpClass."""
        shutil.rmtree(cls.tempdir)

    def test_missing_model(self):
        """Selecting fasttext without a model path must raise ConfigurationError."""
        with self.assertRaises(ConfigurationError):
            LanguageIDFilter(
                languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99])

    def test_wrong_method_with_model(self):
        """Giving a model path without selecting fasttext must raise ConfigurationError."""
        with self.assertRaises(ConfigurationError):
            LanguageIDFilter(
                languages=['en', 'fr'], thresholds=[0.8, 0.99], fasttext_model_path=self.tempdir)

    def test_fasttext_predict_lang(self):
        """The first element of each prediction must be the expected language code."""
        if self.testmodel is None:
            self.skipTest("Failed to download test resources")
        model = LanguageIDFilter(
            languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99],
            fasttext_model_path=self.testmodel)
        expected = ['en', 'fr']
        results = [model._fasttext_predict_lang(fasttext_input)[0]
                   for fasttext_input in self.fasttext_inputs]
        self.assertSequenceEqual(expected, results)

    def test_accept(self):
        """Only the first shared pair is expected to pass."""
        if self.testmodel is None:
            self.skipTest("Failed to download test resources")
        model = LanguageIDFilter(
            languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99],
            fasttext_model_path=self.testmodel)
        pair_scores = model.score(self.pairs_inputs)
        pair_expecteds = [True, False]
        for pair_score, pair_expected in zip(pair_scores, pair_expecteds):
            self.assertEqual(model.accept(pair_score), pair_expected)


class TestLingua(TestLangIDMethod):
    """Tests for LanguageIDFilter using the ``lingua`` method."""

    def _assert_accepts(self, lid_filter, expected_decisions):
        """Check accept() on every score of the shared pairs, in order."""
        scores = lid_filter.score(self.pairs_inputs)
        for score, expected in zip(scores, expected_decisions):
            self.assertEqual(lid_filter.accept(score), expected)

    def test_accept(self):
        """Without ``lingua_mode``: first pair passes, second fails."""
        lid_filter = LanguageIDFilter(
            languages=['en', 'fr'], id_method='lingua', thresholds=[0.4, 0.99])
        self._assert_accepts(lid_filter, [True, False])

    def test_accept_high(self):
        """With ``lingua_mode="high"``: first pair passes, second fails."""
        lid_filter = LanguageIDFilter(
            languages=['en', 'fr'], id_method='lingua', lingua_mode="high", thresholds=[0.5, 0.7])
        self._assert_accepts(lid_filter, [True, False])

0 comments on commit 333d7a8

Please sign in to comment.