From 7f68b84ff4273665c1e5e548a2e397da6c2e7c6c Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Wed, 13 Dec 2023 17:21:00 +0100
Subject: [PATCH 1/2] Snowball - Use ISO language codes instead of names
---
orangecontrib/text/preprocess/normalize.py | 15 ++++++++++-----
orangecontrib/text/tests/test_preprocess.py | 9 ++++++++-
2 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/orangecontrib/text/preprocess/normalize.py b/orangecontrib/text/preprocess/normalize.py
index bc8989495..07f85f761 100644
--- a/orangecontrib/text/preprocess/normalize.py
+++ b/orangecontrib/text/preprocess/normalize.py
@@ -10,6 +10,7 @@
from Orange.util import wrap_callback, dummy_callback
from orangecontrib.text import Corpus
+from orangecontrib.text.language import LANG2ISO, ISO2LANG
from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import Preprocessor, TokenizedPreprocessor
@@ -71,12 +72,16 @@ class PorterStemmer(BaseNormalizer):
class SnowballStemmer(BaseNormalizer):
name = 'Snowball Stemmer'
- supported_languages = [l.capitalize() for l in
- stem.SnowballStemmer.languages]
-
- def __init__(self, language='English'):
+ supported_languages = {
+ LANG2ISO[l.capitalize()]
+ for l in stem.SnowballStemmer.languages
+ # skip "porter": it is not a language but the Porter algorithm, which we implement separately
+ if l != "porter"
+ }
+
+ def __init__(self, language='en'):
super().__init__()
- self.normalizer = stem.SnowballStemmer(language.lower()).stem
+ self.normalizer = stem.SnowballStemmer(ISO2LANG[language].lower()).stem
def language_to_name(language):
diff --git a/orangecontrib/text/tests/test_preprocess.py b/orangecontrib/text/tests/test_preprocess.py
index 0b9bdb4c8..516c2627c 100644
--- a/orangecontrib/text/tests/test_preprocess.py
+++ b/orangecontrib/text/tests/test_preprocess.py
@@ -289,12 +289,19 @@ def test_function(self):
self.assertEqual(stemmer._preprocess('token'), 'toke')
def test_snowball(self):
- stemmer = preprocess.SnowballStemmer('french')
+ stemmer = preprocess.SnowballStemmer('fr')
token = 'voudrais'
self.assertEqual(
stemmer._preprocess(token),
nltk.SnowballStemmer(language='french').stem(token))
+ def test_snowball_all_langs(self):
+ for language in preprocess.SnowballStemmer.supported_languages:
+ normalizer = preprocess.SnowballStemmer(language)
+ tokens = normalizer(self.corpus).tokens
+ self.assertEqual(len(self.corpus), len(tokens))
+ self.assertTrue(all(tokens))
+
def test_udpipe(self):
"""Test udpipe token lemmatization"""
normalizer = preprocess.UDPipeLemmatizer("Lithuanian")
From aa3306c27ea178e80ffae7e3a2a4e2683cc5a3cf Mon Sep 17 00:00:00 2001
From: PrimozGodec
Date: Wed, 13 Dec 2023 17:36:31 +0100
Subject: [PATCH 2/2] Preprocess - Use ISO language codes for Snowball
---
orangecontrib/text/widgets/owpreprocess.py | 25 +++++++------
.../text/widgets/tests/test_owpreprocess.py | 35 ++++++++++++++++---
2 files changed, 44 insertions(+), 16 deletions(-)
diff --git a/orangecontrib/text/widgets/owpreprocess.py b/orangecontrib/text/widgets/owpreprocess.py
index eac04df92..7be2eed77 100644
--- a/orangecontrib/text/widgets/owpreprocess.py
+++ b/orangecontrib/text/widgets/owpreprocess.py
@@ -475,21 +475,23 @@ class NormalizationModule(SingleMethodModule):
UDPipe: UDPipeLemmatizer,
Lemmagen: LemmagenLemmatizer}
DEFAULT_METHOD = Porter
- DEFAULT_SNOWBALL_LANG = "English" # todo: remove when snowball use iso
DEFAULT_UDPIPE_LANG = "English" # todo: remove when udpipe use iso
DEFAULT_LANGUAGE = "en"
DEFAULT_USE_TOKE = False
def __init__(self, parent=None, **kwargs):
super().__init__(parent, **kwargs)
- self.__snowball_lang = self.DEFAULT_SNOWBALL_LANG
+ self.__snowball_lang = self.DEFAULT_LANGUAGE
self.__udpipe_lang = self.DEFAULT_UDPIPE_LANG
self.__lemmagen_lang = self.DEFAULT_LANGUAGE
self.__use_tokenizer = self.DEFAULT_USE_TOKE
- self.__combo_sbl = ComboBox(
- self, SnowballStemmer.supported_languages,
- self.__snowball_lang, self.__set_snowball_lang
+ self.__combo_sbl = LanguageComboBox(
+ self,
+ SnowballStemmer.supported_languages,
+ self.__snowball_lang,
+ False,
+ self.__set_snowball_lang
)
self.__combo_udl = UDPipeComboBox(
self, self.__udpipe_lang, self.DEFAULT_UDPIPE_LANG, self.__set_udpipe_lang
@@ -534,7 +536,7 @@ def __enable_udpipe(self):
def setParameters(self, params: Dict):
super().setParameters(params)
- snowball_lang = params.get("snowball_language", self.DEFAULT_SNOWBALL_LANG)
+ snowball_lang = params.get("snowball_language", self.DEFAULT_LANGUAGE)
self.__set_snowball_lang(snowball_lang)
udpipe_lang = params.get("udpipe_language", self.DEFAULT_UDPIPE_LANG)
self.__set_udpipe_lang(udpipe_lang)
@@ -550,7 +552,7 @@ def _set_method(self, method: int):
def __set_snowball_lang(self, language: str):
if self.__snowball_lang != language:
self.__snowball_lang = language
- self.__combo_sbl.setCurrentText(language)
+ self.__combo_sbl.set_current_language(language)
self.changed.emit()
if self.method == self.Snowball:
self.edited.emit()
@@ -591,11 +593,10 @@ def parameters(self) -> Dict:
def createinstance(params: Dict) -> BaseNormalizer:
method = params.get("method", NormalizationModule.DEFAULT_METHOD)
args = {}
- def_snowball = NormalizationModule.DEFAULT_SNOWBALL_LANG
def_udpipe = NormalizationModule.DEFAULT_UDPIPE_LANG
def_lang = NormalizationModule.DEFAULT_LANGUAGE
if method == NormalizationModule.Snowball:
- args = {"language": params.get("snowball_language", def_snowball)}
+ args = {"language": params.get("snowball_language", def_lang)}
elif method == NormalizationModule.UDPipe:
def_use = NormalizationModule.DEFAULT_USE_TOKE
args = {"language": params.get("udpipe_language", def_udpipe),
@@ -1390,8 +1391,10 @@ def str_into_paths(label):
pp["language"] = None
else:
pp["language"] = StopwordsFilter.lang_to_iso(pp["language"])
- if pp_name == "preprocess.normalize" and "lemmagen_language" in pp:
- pp["lemmagen_language"] = LANG2ISO[pp["lemmagen_language"]]
+ if pp_name == "preprocess.normalize":
+ for key in ("lemmagen_language", "snowball_language"):
+ if key in pp:
+ pp[key] = LANG2ISO[pp[key]]
if __name__ == "__main__":
diff --git a/orangecontrib/text/widgets/tests/test_owpreprocess.py b/orangecontrib/text/widgets/tests/test_owpreprocess.py
index ea2abcd39..713d67b43 100644
--- a/orangecontrib/text/widgets/tests/test_owpreprocess.py
+++ b/orangecontrib/text/widgets/tests/test_owpreprocess.py
@@ -205,7 +205,7 @@ def test_migrate_settings_normalize(self):
"udpipe_tokenizer": True}}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
params = [("preprocess.normalize",
- {"method": 2, "snowball_language": "French",
+ {"method": 2, "snowball_language": "fr",
"udpipe_language": "German", "udpipe_tokenizer": True})]
self.assertEqual(widget.storedsettings["preprocessors"], params)
@@ -332,6 +332,32 @@ def test_migrate_lemmagen_language_settings(self):
normalize_settings = widget.storedsettings["preprocessors"][0][1]
self.assertEqual("en", normalize_settings["lemmagen_language"])
+ def test_migrate_snowball_language_settings(self):
+ """Test migration to ISO language codes"""
+ settings = {
+ "__version__": 3,
+ "storedsettings": {
+ "preprocessors": [
+ ("preprocess.normalize", {"snowball_language": "Swedish"}),
+ ]
+ },
+ }
+ widget = self.create_widget(OWPreprocess, stored_settings=settings)
+ normalize_settings = widget.storedsettings["preprocessors"][0][1]
+ self.assertEqual("sv", normalize_settings["snowball_language"])
+
+ settings = {
+ "__version__": 3,
+ "storedsettings": {
+ "preprocessors": [
+ ("preprocess.normalize", {"snowball_language": "English"}),
+ ]
+ },
+ }
+ widget = self.create_widget(OWPreprocess, stored_settings=settings)
+ normalize_settings = widget.storedsettings["preprocessors"][0][1]
+ self.assertEqual("en", normalize_settings["snowball_language"])
+
class TestTransformationModule(WidgetTest):
def setUp(self):
@@ -473,7 +499,7 @@ def test_init(self):
def test_parameters(self):
params = {
"method": NormalizationModule.Porter,
- "snowball_language": "English",
+ "snowball_language": "en",
"udpipe_language": "English",
"lemmagen_language": "en",
"udpipe_tokenizer": False,
@@ -483,7 +509,7 @@ def test_parameters(self):
def test_set_parameters(self):
params = {
"method": NormalizationModule.UDPipe,
- "snowball_language": "Dutch",
+ "snowball_language": "nl",
"udpipe_language": "Slovenian",
"lemmagen_language": "bg",
"udpipe_tokenizer": True,
@@ -504,8 +530,7 @@ def test_createinstance(self):
self.assertIsInstance(pp, SnowballStemmer)
self.assertIn("", str(pp.normalizer))
- params = {"method": NormalizationModule.Snowball,
- "snowball_language": "Dutch"}
+ params = {"method": NormalizationModule.Snowball, "snowball_language": "nl"}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, SnowballStemmer)
self.assertIn("", str(pp.normalizer))