diff --git a/orangecontrib/text/preprocess/normalize.py b/orangecontrib/text/preprocess/normalize.py index bc8989495..07f85f761 100644 --- a/orangecontrib/text/preprocess/normalize.py +++ b/orangecontrib/text/preprocess/normalize.py @@ -10,6 +10,7 @@ from Orange.util import wrap_callback, dummy_callback from orangecontrib.text import Corpus +from orangecontrib.text.language import LANG2ISO, ISO2LANG from orangecontrib.text.misc import wait_nltk_data from orangecontrib.text.preprocess import Preprocessor, TokenizedPreprocessor @@ -71,12 +72,16 @@ class PorterStemmer(BaseNormalizer): class SnowballStemmer(BaseNormalizer): name = 'Snowball Stemmer' - supported_languages = [l.capitalize() for l in - stem.SnowballStemmer.languages] - - def __init__(self, language='English'): + supported_languages = { + LANG2ISO[l.capitalize()] + for l in stem.SnowballStemmer.languages + # skip porter since not language but porter stemmer that we implement separately + if l != "porter" + } + + def __init__(self, language='en'): super().__init__() - self.normalizer = stem.SnowballStemmer(language.lower()).stem + self.normalizer = stem.SnowballStemmer(ISO2LANG[language].lower()).stem def language_to_name(language): diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py index 0b9bdb4c8..516c2627c 100644 --- a/orangecontrib/text/tests/test_preprocess.py +++ b/orangecontrib/text/tests/test_preprocess.py @@ -289,12 +289,19 @@ def test_function(self): self.assertEqual(stemmer._preprocess('token'), 'toke') def test_snowball(self): - stemmer = preprocess.SnowballStemmer('french') + stemmer = preprocess.SnowballStemmer('fr') token = 'voudrais' self.assertEqual( stemmer._preprocess(token), nltk.SnowballStemmer(language='french').stem(token)) + def test_snowball_all_langs(self): + for language in preprocess.SnowballStemmer.supported_languages: + normalizer = preprocess.SnowballStemmer(language) + tokens = normalizer(self.corpus).tokens + 
self.assertEqual(len(self.corpus), len(tokens)) + self.assertTrue(all(tokens)) + def test_udpipe(self): """Test udpipe token lemmatization""" normalizer = preprocess.UDPipeLemmatizer("Lithuanian") diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py index eac04df92..7be2eed77 100644 --- a/orangecontrib/text/widgets/owpreprocess.py +++ b/orangecontrib/text/widgets/owpreprocess.py @@ -475,21 +475,23 @@ class NormalizationModule(SingleMethodModule): UDPipe: UDPipeLemmatizer, Lemmagen: LemmagenLemmatizer} DEFAULT_METHOD = Porter - DEFAULT_SNOWBALL_LANG = "English" # todo: remove when snowball use iso DEFAULT_UDPIPE_LANG = "English" # todo: remove when udpipe use iso DEFAULT_LANGUAGE = "en" DEFAULT_USE_TOKE = False def __init__(self, parent=None, **kwargs): super().__init__(parent, **kwargs) - self.__snowball_lang = self.DEFAULT_SNOWBALL_LANG + self.__snowball_lang = self.DEFAULT_LANGUAGE self.__udpipe_lang = self.DEFAULT_UDPIPE_LANG self.__lemmagen_lang = self.DEFAULT_LANGUAGE self.__use_tokenizer = self.DEFAULT_USE_TOKE - self.__combo_sbl = ComboBox( - self, SnowballStemmer.supported_languages, - self.__snowball_lang, self.__set_snowball_lang + self.__combo_sbl = LanguageComboBox( + self, + SnowballStemmer.supported_languages, + self.__snowball_lang, + False, + self.__set_snowball_lang ) self.__combo_udl = UDPipeComboBox( self, self.__udpipe_lang, self.DEFAULT_UDPIPE_LANG, self.__set_udpipe_lang @@ -534,7 +536,7 @@ def __enable_udpipe(self): def setParameters(self, params: Dict): super().setParameters(params) - snowball_lang = params.get("snowball_language", self.DEFAULT_SNOWBALL_LANG) + snowball_lang = params.get("snowball_language", self.DEFAULT_LANGUAGE) self.__set_snowball_lang(snowball_lang) udpipe_lang = params.get("udpipe_language", self.DEFAULT_UDPIPE_LANG) self.__set_udpipe_lang(udpipe_lang) @@ -550,7 +552,7 @@ def _set_method(self, method: int): def __set_snowball_lang(self, language: str): if 
self.__snowball_lang != language: self.__snowball_lang = language - self.__combo_sbl.setCurrentText(language) + self.__combo_sbl.set_current_language(language) self.changed.emit() if self.method == self.Snowball: self.edited.emit() @@ -591,11 +593,10 @@ def parameters(self) -> Dict: def createinstance(params: Dict) -> BaseNormalizer: method = params.get("method", NormalizationModule.DEFAULT_METHOD) args = {} - def_snowball = NormalizationModule.DEFAULT_SNOWBALL_LANG def_udpipe = NormalizationModule.DEFAULT_UDPIPE_LANG def_lang = NormalizationModule.DEFAULT_LANGUAGE if method == NormalizationModule.Snowball: - args = {"language": params.get("snowball_language", def_snowball)} + args = {"language": params.get("snowball_language", def_lang)} elif method == NormalizationModule.UDPipe: def_use = NormalizationModule.DEFAULT_USE_TOKE args = {"language": params.get("udpipe_language", def_udpipe), @@ -1390,8 +1391,10 @@ def str_into_paths(label): pp["language"] = None else: pp["language"] = StopwordsFilter.lang_to_iso(pp["language"]) - if pp_name == "preprocess.normalize" and "lemmagen_language" in pp: - pp["lemmagen_language"] = LANG2ISO[pp["lemmagen_language"]] + if pp_name == "preprocess.normalize": + for key in ("lemmagen_language", "snowball_language"): + if key in pp: + pp[key] = LANG2ISO[pp[key]] if __name__ == "__main__": diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py index ea2abcd39..713d67b43 100644 --- a/orangecontrib/text/widgets/tests/test_owpreprocess.py +++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py @@ -205,7 +205,7 @@ def test_migrate_settings_normalize(self): "udpipe_tokenizer": True}} widget = self.create_widget(OWPreprocess, stored_settings=settings) params = [("preprocess.normalize", - {"method": 2, "snowball_language": "French", + {"method": 2, "snowball_language": "fr", "udpipe_language": "German", "udpipe_tokenizer": True})] 
self.assertEqual(widget.storedsettings["preprocessors"], params) @@ -332,6 +332,32 @@ def test_migrate_lemmagen_language_settings(self): normalize_settings = widget.storedsettings["preprocessors"][0][1] self.assertEqual("en", normalize_settings["lemmagen_language"]) + def test_migrate_snowball_language_settings(self): + """Test migration to iso language codes""" + settings = { + "__version__": 3, + "storedsettings": { + "preprocessors": [ + ("preprocess.normalize", {"snowball_language": "Swedish"}), + ] + }, + } + widget = self.create_widget(OWPreprocess, stored_settings=settings) + normalize_settings = widget.storedsettings["preprocessors"][0][1] + self.assertEqual("sv", normalize_settings["snowball_language"]) + + settings = { + "__version__": 3, + "storedsettings": { + "preprocessors": [ + ("preprocess.normalize", {"snowball_language": "English"}), + ] + }, + } + widget = self.create_widget(OWPreprocess, stored_settings=settings) + normalize_settings = widget.storedsettings["preprocessors"][0][1] + self.assertEqual("en", normalize_settings["snowball_language"]) + class TestTransformationModule(WidgetTest): def setUp(self): @@ -473,7 +499,7 @@ def test_init(self): def test_parameters(self): params = { "method": NormalizationModule.Porter, - "snowball_language": "English", + "snowball_language": "en", "udpipe_language": "English", "lemmagen_language": "en", "udpipe_tokenizer": False, @@ -483,7 +509,7 @@ def test_parameters(self): def test_set_parameters(self): params = { "method": NormalizationModule.UDPipe, - "snowball_language": "Dutch", + "snowball_language": "nl", "udpipe_language": "Slovenian", "lemmagen_language": "bg", "udpipe_tokenizer": True, @@ -504,8 +530,7 @@ def test_createinstance(self): self.assertIsInstance(pp, SnowballStemmer) self.assertIn("", str(pp.normalizer)) - params = {"method": NormalizationModule.Snowball, - "snowball_language": "Dutch"} + params = {"method": NormalizationModule.Snowball, "snowball_language": "nl"} pp = 
self.editor.createinstance(params) self.assertIsInstance(pp, SnowballStemmer) self.assertIn("", str(pp.normalizer))