Skip to content

Commit

Permalink
Preprocess - Use ISO language codes for Snowball
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Dec 13, 2023
1 parent 7f68b84 commit aa3306c
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 16 deletions.
25 changes: 14 additions & 11 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,21 +475,23 @@ class NormalizationModule(SingleMethodModule):
UDPipe: UDPipeLemmatizer,
Lemmagen: LemmagenLemmatizer}
DEFAULT_METHOD = Porter
DEFAULT_SNOWBALL_LANG = "English" # todo: remove when snowball use iso
DEFAULT_UDPIPE_LANG = "English" # todo: remove when udpipe use iso
DEFAULT_LANGUAGE = "en"
DEFAULT_USE_TOKE = False

def __init__(self, parent=None, **kwargs):
super().__init__(parent, **kwargs)
self.__snowball_lang = self.DEFAULT_SNOWBALL_LANG
self.__snowball_lang = self.DEFAULT_LANGUAGE
self.__udpipe_lang = self.DEFAULT_UDPIPE_LANG
self.__lemmagen_lang = self.DEFAULT_LANGUAGE
self.__use_tokenizer = self.DEFAULT_USE_TOKE

self.__combo_sbl = ComboBox(
self, SnowballStemmer.supported_languages,
self.__snowball_lang, self.__set_snowball_lang
self.__combo_sbl = LanguageComboBox(
self,
SnowballStemmer.supported_languages,
self.__snowball_lang,
False,
self.__set_snowball_lang
)
self.__combo_udl = UDPipeComboBox(
self, self.__udpipe_lang, self.DEFAULT_UDPIPE_LANG, self.__set_udpipe_lang
Expand Down Expand Up @@ -534,7 +536,7 @@ def __enable_udpipe(self):

def setParameters(self, params: Dict):
super().setParameters(params)
snowball_lang = params.get("snowball_language", self.DEFAULT_SNOWBALL_LANG)
snowball_lang = params.get("snowball_language", self.DEFAULT_LANGUAGE)
self.__set_snowball_lang(snowball_lang)
udpipe_lang = params.get("udpipe_language", self.DEFAULT_UDPIPE_LANG)
self.__set_udpipe_lang(udpipe_lang)
Expand All @@ -550,7 +552,7 @@ def _set_method(self, method: int):
def __set_snowball_lang(self, language: str):
if self.__snowball_lang != language:
self.__snowball_lang = language
self.__combo_sbl.setCurrentText(language)
self.__combo_sbl.set_current_language(language)
self.changed.emit()
if self.method == self.Snowball:
self.edited.emit()
Expand Down Expand Up @@ -591,11 +593,10 @@ def parameters(self) -> Dict:
def createinstance(params: Dict) -> BaseNormalizer:
method = params.get("method", NormalizationModule.DEFAULT_METHOD)
args = {}
def_snowball = NormalizationModule.DEFAULT_SNOWBALL_LANG
def_udpipe = NormalizationModule.DEFAULT_UDPIPE_LANG
def_lang = NormalizationModule.DEFAULT_LANGUAGE
if method == NormalizationModule.Snowball:
args = {"language": params.get("snowball_language", def_snowball)}
args = {"language": params.get("snowball_language", def_lang)}
elif method == NormalizationModule.UDPipe:
def_use = NormalizationModule.DEFAULT_USE_TOKE
args = {"language": params.get("udpipe_language", def_udpipe),
Expand Down Expand Up @@ -1390,8 +1391,10 @@ def str_into_paths(label):
pp["language"] = None
else:
pp["language"] = StopwordsFilter.lang_to_iso(pp["language"])
if pp_name == "preprocess.normalize" and "lemmagen_language" in pp:
pp["lemmagen_language"] = LANG2ISO[pp["lemmagen_language"]]
if pp_name == "preprocess.normalize":
for key in ("lemmagen_language", "snowball_language"):
if key in pp:
pp[key] = LANG2ISO[pp[key]]


if __name__ == "__main__":
Expand Down
35 changes: 30 additions & 5 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def test_migrate_settings_normalize(self):
"udpipe_tokenizer": True}}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
params = [("preprocess.normalize",
{"method": 2, "snowball_language": "French",
{"method": 2, "snowball_language": "fr",
"udpipe_language": "German", "udpipe_tokenizer": True})]
self.assertEqual(widget.storedsettings["preprocessors"], params)

Expand Down Expand Up @@ -332,6 +332,32 @@ def test_migrate_lemmagen_language_settings(self):
normalize_settings = widget.storedsettings["preprocessors"][0][1]
self.assertEqual("en", normalize_settings["lemmagen_language"])

def test_migrate_snowball_language_settings(self):
    """Check that version-3 Snowball language names migrate to ISO codes."""
    # Each pair is (language name stored by the old settings, expected code
    # after the widget has loaded and migrated those settings).
    cases = (("Swedish", "sv"), ("English", "en"))
    for legacy_name, iso_code in cases:
        preprocessors = [
            ("preprocess.normalize", {"snowball_language": legacy_name}),
        ]
        settings = {
            "__version__": 3,
            "storedsettings": {"preprocessors": preprocessors},
        }
        widget = self.create_widget(OWPreprocess, stored_settings=settings)
        # Entry 0 is the normalize preprocessor; index 1 is its params dict.
        normalize_settings = widget.storedsettings["preprocessors"][0][1]
        self.assertEqual(iso_code, normalize_settings["snowball_language"])


class TestTransformationModule(WidgetTest):
def setUp(self):
Expand Down Expand Up @@ -473,7 +499,7 @@ def test_init(self):
def test_parameters(self):
params = {
"method": NormalizationModule.Porter,
"snowball_language": "English",
"snowball_language": "en",
"udpipe_language": "English",
"lemmagen_language": "en",
"udpipe_tokenizer": False,
Expand All @@ -483,7 +509,7 @@ def test_parameters(self):
def test_set_parameters(self):
params = {
"method": NormalizationModule.UDPipe,
"snowball_language": "Dutch",
"snowball_language": "nl",
"udpipe_language": "Slovenian",
"lemmagen_language": "bg",
"udpipe_tokenizer": True,
Expand All @@ -504,8 +530,7 @@ def test_createinstance(self):
self.assertIsInstance(pp, SnowballStemmer)
self.assertIn("<EnglishStemmer>", str(pp.normalizer))

params = {"method": NormalizationModule.Snowball,
"snowball_language": "Dutch"}
params = {"method": NormalizationModule.Snowball, "snowball_language": "nl"}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, SnowballStemmer)
self.assertIn("<DutchStemmer>", str(pp.normalizer))
Expand Down

0 comments on commit aa3306c

Please sign in to comment.