Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Snowball - Use ISO language codes #1029

Merged
merged 2 commits into from
Dec 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions orangecontrib/text/preprocess/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from Orange.util import wrap_callback, dummy_callback

from orangecontrib.text import Corpus
from orangecontrib.text.language import LANG2ISO, ISO2LANG
from orangecontrib.text.misc import wait_nltk_data
from orangecontrib.text.preprocess import Preprocessor, TokenizedPreprocessor

Expand Down Expand Up @@ -71,12 +72,16 @@ class PorterStemmer(BaseNormalizer):

class SnowballStemmer(BaseNormalizer):
name = 'Snowball Stemmer'
supported_languages = [l.capitalize() for l in
stem.SnowballStemmer.languages]

def __init__(self, language='English'):
supported_languages = {
LANG2ISO[l.capitalize()]
for l in stem.SnowballStemmer.languages
# skip "porter": it is not a language but the Porter stemmer, which we implement separately
if l != "porter"
}

def __init__(self, language='en'):
super().__init__()
self.normalizer = stem.SnowballStemmer(language.lower()).stem
self.normalizer = stem.SnowballStemmer(ISO2LANG[language].lower()).stem


def language_to_name(language):
Expand Down
9 changes: 8 additions & 1 deletion orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,12 +289,19 @@ def test_function(self):
self.assertEqual(stemmer._preprocess('token'), 'toke')

def test_snowball(self):
stemmer = preprocess.SnowballStemmer('french')
stemmer = preprocess.SnowballStemmer('fr')
token = 'voudrais'
self.assertEqual(
stemmer._preprocess(token),
nltk.SnowballStemmer(language='french').stem(token))

def test_snowball_all_langs(self):
for language in preprocess.SnowballStemmer.supported_languages:
normalizer = preprocess.SnowballStemmer(language)
tokens = normalizer(self.corpus).tokens
self.assertEqual(len(self.corpus), len(tokens))
self.assertTrue(all(tokens))

def test_udpipe(self):
"""Test udpipe token lemmatization"""
normalizer = preprocess.UDPipeLemmatizer("Lithuanian")
Expand Down
25 changes: 14 additions & 11 deletions orangecontrib/text/widgets/owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,21 +475,23 @@ class NormalizationModule(SingleMethodModule):
UDPipe: UDPipeLemmatizer,
Lemmagen: LemmagenLemmatizer}
DEFAULT_METHOD = Porter
DEFAULT_SNOWBALL_LANG = "English" # todo: remove when snowball use iso
DEFAULT_UDPIPE_LANG = "English" # todo: remove when udpipe use iso
DEFAULT_LANGUAGE = "en"
DEFAULT_USE_TOKE = False

def __init__(self, parent=None, **kwargs):
super().__init__(parent, **kwargs)
self.__snowball_lang = self.DEFAULT_SNOWBALL_LANG
self.__snowball_lang = self.DEFAULT_LANGUAGE
self.__udpipe_lang = self.DEFAULT_UDPIPE_LANG
self.__lemmagen_lang = self.DEFAULT_LANGUAGE
self.__use_tokenizer = self.DEFAULT_USE_TOKE

self.__combo_sbl = ComboBox(
self, SnowballStemmer.supported_languages,
self.__snowball_lang, self.__set_snowball_lang
self.__combo_sbl = LanguageComboBox(
self,
SnowballStemmer.supported_languages,
self.__snowball_lang,
False,
self.__set_snowball_lang
)
self.__combo_udl = UDPipeComboBox(
self, self.__udpipe_lang, self.DEFAULT_UDPIPE_LANG, self.__set_udpipe_lang
Expand Down Expand Up @@ -534,7 +536,7 @@ def __enable_udpipe(self):

def setParameters(self, params: Dict):
super().setParameters(params)
snowball_lang = params.get("snowball_language", self.DEFAULT_SNOWBALL_LANG)
snowball_lang = params.get("snowball_language", self.DEFAULT_LANGUAGE)
self.__set_snowball_lang(snowball_lang)
udpipe_lang = params.get("udpipe_language", self.DEFAULT_UDPIPE_LANG)
self.__set_udpipe_lang(udpipe_lang)
Expand All @@ -550,7 +552,7 @@ def _set_method(self, method: int):
def __set_snowball_lang(self, language: str):
if self.__snowball_lang != language:
self.__snowball_lang = language
self.__combo_sbl.setCurrentText(language)
self.__combo_sbl.set_current_language(language)
self.changed.emit()
if self.method == self.Snowball:
self.edited.emit()
Expand Down Expand Up @@ -591,11 +593,10 @@ def parameters(self) -> Dict:
def createinstance(params: Dict) -> BaseNormalizer:
method = params.get("method", NormalizationModule.DEFAULT_METHOD)
args = {}
def_snowball = NormalizationModule.DEFAULT_SNOWBALL_LANG
def_udpipe = NormalizationModule.DEFAULT_UDPIPE_LANG
def_lang = NormalizationModule.DEFAULT_LANGUAGE
if method == NormalizationModule.Snowball:
args = {"language": params.get("snowball_language", def_snowball)}
args = {"language": params.get("snowball_language", def_lang)}
elif method == NormalizationModule.UDPipe:
def_use = NormalizationModule.DEFAULT_USE_TOKE
args = {"language": params.get("udpipe_language", def_udpipe),
Expand Down Expand Up @@ -1390,8 +1391,10 @@ def str_into_paths(label):
pp["language"] = None
else:
pp["language"] = StopwordsFilter.lang_to_iso(pp["language"])
if pp_name == "preprocess.normalize" and "lemmagen_language" in pp:
pp["lemmagen_language"] = LANG2ISO[pp["lemmagen_language"]]
if pp_name == "preprocess.normalize":
for key in ("lemmagen_language", "snowball_language"):
if key in pp:
pp[key] = LANG2ISO[pp[key]]


if __name__ == "__main__":
Expand Down
35 changes: 30 additions & 5 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def test_migrate_settings_normalize(self):
"udpipe_tokenizer": True}}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
params = [("preprocess.normalize",
{"method": 2, "snowball_language": "French",
{"method": 2, "snowball_language": "fr",
"udpipe_language": "German", "udpipe_tokenizer": True})]
self.assertEqual(widget.storedsettings["preprocessors"], params)

Expand Down Expand Up @@ -332,6 +332,32 @@ def test_migrate_lemmagen_language_settings(self):
normalize_settings = widget.storedsettings["preprocessors"][0][1]
self.assertEqual("en", normalize_settings["lemmagen_language"])

def test_migrate_snowball_language_settings(self):
"""Test migration to ISO language codes"""
settings = {
"__version__": 3,
"storedsettings": {
"preprocessors": [
("preprocess.normalize", {"snowball_language": "Swedish"}),
]
},
}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
normalize_settings = widget.storedsettings["preprocessors"][0][1]
self.assertEqual("sv", normalize_settings["snowball_language"])

settings = {
"__version__": 3,
"storedsettings": {
"preprocessors": [
("preprocess.normalize", {"snowball_language": "English"}),
]
},
}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
normalize_settings = widget.storedsettings["preprocessors"][0][1]
self.assertEqual("en", normalize_settings["snowball_language"])


class TestTransformationModule(WidgetTest):
def setUp(self):
Expand Down Expand Up @@ -473,7 +499,7 @@ def test_init(self):
def test_parameters(self):
params = {
"method": NormalizationModule.Porter,
"snowball_language": "English",
"snowball_language": "en",
"udpipe_language": "English",
"lemmagen_language": "en",
"udpipe_tokenizer": False,
Expand All @@ -483,7 +509,7 @@ def test_parameters(self):
def test_set_parameters(self):
params = {
"method": NormalizationModule.UDPipe,
"snowball_language": "Dutch",
"snowball_language": "nl",
"udpipe_language": "Slovenian",
"lemmagen_language": "bg",
"udpipe_tokenizer": True,
Expand All @@ -504,8 +530,7 @@ def test_createinstance(self):
self.assertIsInstance(pp, SnowballStemmer)
self.assertIn("<EnglishStemmer>", str(pp.normalizer))

params = {"method": NormalizationModule.Snowball,
"snowball_language": "Dutch"}
params = {"method": NormalizationModule.Snowball, "snowball_language": "nl"}
pp = self.editor.createinstance(params)
self.assertIsInstance(pp, SnowballStemmer)
self.assertIn("<DutchStemmer>", str(pp.normalizer))
Expand Down
Loading