[ENH] Lemmagen - Use ISO language codes #1025

Merged: 2 commits, Dec 12, 2023
46 changes: 15 additions & 31 deletions orangecontrib/text/preprocess/normalize.py
@@ -213,43 +213,27 @@ def __setstate__(self, state):

class LemmagenLemmatizer(BaseNormalizer):
name = 'Lemmagen Lemmatizer'
lemmagen_languages = {
"Bulgarian": "bg",
"Croatian": "hr",
"Czech": "cs",
"English": "en",
"Estonian": "et",
"Farsi/Persian": "fa",
"French": "fr",
"German": "de",
"Hungarian": "hu",
"Italian": "it",
"Macedonian": "mk",
"Polish": "pl",
"Romanian": "ro",
"Russian": "ru",
"Serbian": "sr",
"Slovak": "sk",
"Slovenian": "sl",
"Spanish": "es",
"Ukrainian": "uk"
}
supported_languages = set(Lemmatizer.list_supported_languages())

def __init__(self, language='English'):
def __init__(self, language="en"):
super().__init__()
self.language = language
self.lemmatizer = None

def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
# lemmagen3 lemmatizer is not picklable, define it on call and discard it afterward
self.lemmatizer = Lemmatizer(self.lemmagen_languages[self.language])
output_corpus = super().__call__(corpus, callback)
self.lemmatizer = None
return output_corpus
self.language = language # used only for unpickling
self.lemmatizer = Lemmatizer(language)

def normalizer(self, token):
assert self.lemmatizer is not None
t = self.lemmatizer.lemmatize(token)
# Lemmagen sometimes returns an empty string; return the original token
# in that case
return t if t else token

def __getstate__(self):
"""Remove model that cannot be pickled"""
state = super().__getstate__()
state["lemmatizer"] = None
return state

def __setstate__(self, state):
"""Reinstate the model when upickled"""
super().__setstate__(state)
self.lemmatizer = Lemmatizer(self.language)
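
In short, the lemmatizer is now keyed by ISO 639-1 codes and becomes picklable because the lemmagen3 model is dropped in __getstate__ and rebuilt from self.language in __setstate__. A minimal usage sketch, assuming the orange3-text add-on and lemmagen3 are installed (the token is illustrative):

import pickle

from orangecontrib.text.preprocess import LemmagenLemmatizer

# Construct with an ISO 639-1 code instead of a full language name.
lemmatizer = LemmagenLemmatizer("sl")
print(lemmatizer.normalizer("hiše"))  # lemmatize a single token

# The lemmagen3 model itself is not picklable; __getstate__ drops it and
# __setstate__ rebuilds it from self.language, so a round trip works.
restored = pickle.loads(pickle.dumps(lemmatizer))
print(restored.normalizer("hiše"))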
11 changes: 9 additions & 2 deletions orangecontrib/text/tests/test_preprocess.py
@@ -342,15 +342,22 @@ def test_udpipe_deepcopy(self):
)

def test_lemmagen(self):
normalizer = preprocess.LemmagenLemmatizer('Slovenian')
sentence = 'Gori na gori hiša gori'
normalizer = preprocess.LemmagenLemmatizer("sl")
sentence = "Gori na gori hiša gori"
with self.corpus.unlocked():
self.corpus.metas[0, 0] = sentence
self.assertEqual(
[Lemmatizer("sl").lemmatize(t) for t in sentence.split()],
normalizer(self.corpus).tokens[0],
)

def test_lemmagen_all_langs(self):
for language in preprocess.LemmagenLemmatizer.supported_languages:
normalizer = preprocess.LemmagenLemmatizer(language)
tokens = normalizer(self.corpus).tokens
self.assertEqual(len(self.corpus), len(tokens))
self.assertTrue(all(tokens))

def test_normalizers_picklable(self):
""" Normalizers must be picklable, tests if it is true"""
for nm in set(preprocess.normalize.__all__) - {"BaseNormalizer"}:
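
The new test_lemmagen_all_langs iterates over LemmagenLemmatizer.supported_languages, which is no longer a hard-coded name-to-code dict but is taken from lemmagen3 itself. A quick sketch of where that set comes from, assuming lemmagen3 is installed:

from lemmagen3 import Lemmatizer

# The class attribute is built the same way: every ISO code lemmagen3 ships a model for.
supported = set(Lemmatizer.list_supported_languages())
print(sorted(supported))  # e.g. ['bg', 'cs', 'en', 'hr', ...]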
36 changes: 22 additions & 14 deletions orangecontrib/text/widgets/owpreprocess.py
@@ -24,7 +24,7 @@
from Orange.widgets.widget import Input, Output, Msg, Message

from orangecontrib.text import Corpus
from orangecontrib.text.language import ISO2LANG
from orangecontrib.text.language import ISO2LANG, LANG2ISO
from orangecontrib.text.misc import nltk_data_dir
from orangecontrib.text.preprocess import *
from orangecontrib.text.preprocess.normalize import UDPipeStopIteration
@@ -475,13 +475,15 @@ class NormalizationModule(SingleMethodModule):
UDPipe: UDPipeLemmatizer,
Lemmagen: LemmagenLemmatizer}
DEFAULT_METHOD = Porter
DEFAULT_LANGUAGE = "English"
DEFAULT_SNOWBALL_LANG = "English" # todo: remove when snowball uses ISO codes
DEFAULT_UDPIPE_LANG = "English" # todo: remove when udpipe uses ISO codes
DEFAULT_LANGUAGE = "en"
DEFAULT_USE_TOKE = False

def __init__(self, parent=None, **kwargs):
super().__init__(parent, **kwargs)
self.__snowball_lang = self.DEFAULT_LANGUAGE
self.__udpipe_lang = self.DEFAULT_LANGUAGE
self.__snowball_lang = self.DEFAULT_SNOWBALL_LANG
self.__udpipe_lang = self.DEFAULT_UDPIPE_LANG
self.__lemmagen_lang = self.DEFAULT_LANGUAGE
self.__use_tokenizer = self.DEFAULT_USE_TOKE

@@ -490,15 +492,17 @@ def __init__(self, parent=None, **kwargs):
self.__snowball_lang, self.__set_snowball_lang
)
self.__combo_udl = UDPipeComboBox(
self, self.__udpipe_lang, self.DEFAULT_LANGUAGE,
self.__set_udpipe_lang
self, self.__udpipe_lang, self.DEFAULT_UDPIPE_LANG, self.__set_udpipe_lang
)
self.__check_use = QCheckBox("UDPipe tokenizer",
checked=self.DEFAULT_USE_TOKE)
self.__check_use.clicked.connect(self.__set_use_tokenizer)
self.__combo_lemm = ComboBox(
self, LemmagenLemmatizer.lemmagen_languages,
self.__lemmagen_lang, self.__set_lemmagen_lang
self.__combo_lemm = LanguageComboBox(
self,
LemmagenLemmatizer.supported_languages,
self.__lemmagen_lang,
False,
self.__set_lemmagen_lang,
)

label = QLabel("Language:")
@@ -530,9 +534,9 @@ def __enable_udpipe(self):

def setParameters(self, params: Dict):
super().setParameters(params)
snowball_lang = params.get("snowball_language", self.DEFAULT_LANGUAGE)
snowball_lang = params.get("snowball_language", self.DEFAULT_SNOWBALL_LANG)
self.__set_snowball_lang(snowball_lang)
udpipe_lang = params.get("udpipe_language", self.DEFAULT_LANGUAGE)
udpipe_lang = params.get("udpipe_language", self.DEFAULT_UDPIPE_LANG)
self.__set_udpipe_lang(udpipe_lang)
use_tokenizer = params.get("udpipe_tokenizer", self.DEFAULT_USE_TOKE)
self.__set_use_tokenizer(use_tokenizer)
@@ -562,7 +566,7 @@ def __set_udpipe_lang(self, language: str):
def __set_lemmagen_lang(self, language: str):
if self.__lemmagen_lang != language:
self.__lemmagen_lang = language
self.__combo_lemm.setCurrentText(language)
self.__combo_lemm.set_current_language(language)
self.changed.emit()
if self.method == self.Lemmagen:
self.edited.emit()
@@ -587,12 +591,14 @@ def parameters(self) -> Dict:
def createinstance(params: Dict) -> BaseNormalizer:
method = params.get("method", NormalizationModule.DEFAULT_METHOD)
args = {}
def_snowball = NormalizationModule.DEFAULT_SNOWBALL_LANG
def_udpipe = NormalizationModule.DEFAULT_UDPIPE_LANG
def_lang = NormalizationModule.DEFAULT_LANGUAGE
if method == NormalizationModule.Snowball:
args = {"language": params.get("snowball_language", def_lang)}
args = {"language": params.get("snowball_language", def_snowball)}
elif method == NormalizationModule.UDPipe:
def_use = NormalizationModule.DEFAULT_USE_TOKE
args = {"language": params.get("udpipe_language", def_lang),
args = {"language": params.get("udpipe_language", def_udpipe),
"use_tokenizer": params.get("udpipe_tokenizer", def_use)}
elif method == NormalizationModule.Lemmagen:
args = {"language": params.get("lemmagen_language", def_lang)}
@@ -1384,6 +1390,8 @@ def str_into_paths(label):
pp["language"] = None
else:
pp["language"] = StopwordsFilter.lang_to_iso(pp["language"])
if pp_name == "preprocess.normalize" and "lemmagen_language" in pp:
pp["lemmagen_language"] = LANG2ISO[pp["lemmagen_language"]]


if __name__ == "__main__":
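
The migration step above converts a stored full language name to its ISO code via LANG2ISO when old settings are loaded. A rough, self-contained sketch of that logic; the helper name and the dict contents are illustrative, and the real mapping lives in orangecontrib.text.language.LANG2ISO:

# Illustrative subset only; the add-on's LANG2ISO covers all supported languages.
LANG2ISO = {"Slovenian": "sl", "English": "en", "Bulgarian": "bg"}

def migrate_normalize_settings(pp_name, pp):
    # Old settings stored full names ("Slovenian"); the new code expects ISO codes ("sl").
    if pp_name == "preprocess.normalize" and "lemmagen_language" in pp:
        pp["lemmagen_language"] = LANG2ISO[pp["lemmagen_language"]]

settings = {"lemmagen_language": "Slovenian"}
migrate_normalize_settings("preprocess.normalize", settings)
print(settings)  # {'lemmagen_language': 'sl'}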
74 changes: 43 additions & 31 deletions orangecontrib/text/widgets/tests/test_owpreprocess.py
@@ -271,30 +271,16 @@ def test_migrate_settings(self):
}
self.create_widget(OWPreprocess, stored_settings=settings)

def test_migrate_language_settings(self):
def test_migrate_filter_language_settings(self):
"""Test migration to iso langauge codes"""
settings = {
"__version__": 3,
"storedsettings": {
"preprocessors": [
(
"preprocess.normalize",
{
"snowball_language": "French",
"udpipe_language": "German",
"lemmagen_language": "Slovenian",
},
),
("preprocess.filter", {"language": "Finnish"}),
]
"preprocessors": [("preprocess.filter", {"language": "Finnish"})]
},
}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
normalize_settings = widget.storedsettings["preprocessors"][0][1]
filter_settings = widget.storedsettings["preprocessors"][1][1]
self.assertEqual("Slovenian", normalize_settings["lemmagen_language"])
self.assertEqual("French", normalize_settings["snowball_language"])
self.assertEqual("German", normalize_settings["udpipe_language"])
filter_settings = widget.storedsettings["preprocessors"][0][1]
self.assertEqual("fi", filter_settings["language"])

# NLTK uses Slovene instead of Slovenian, this is also the reason
@@ -320,6 +306,32 @@ def test_migrate_language_settings(self):
filter_settings = widget.storedsettings["preprocessors"][0][1]
self.assertIsNone(filter_settings["language"])

def test_migrate_lemmagen_language_settings(self):
"""Test migration to iso langauge codes"""
settings = {
"__version__": 3,
"storedsettings": {
"preprocessors": [
("preprocess.normalize", {"lemmagen_language": "Slovenian"}),
]
},
}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
normalize_settings = widget.storedsettings["preprocessors"][0][1]
self.assertEqual("sl", normalize_settings["lemmagen_language"])

settings = {
"__version__": 3,
"storedsettings": {
"preprocessors": [
("preprocess.normalize", {"lemmagen_language": "English"}),
]
},
}
widget = self.create_widget(OWPreprocess, stored_settings=settings)
normalize_settings = widget.storedsettings["preprocessors"][0][1]
self.assertEqual("en", normalize_settings["lemmagen_language"])


class TestTransformationModule(WidgetTest):
def setUp(self):
@@ -459,19 +471,23 @@ def test_init(self):
self.assertFalse(self.check_use.isChecked())

def test_parameters(self):
params = {"method": NormalizationModule.Porter,
"snowball_language": "English",
"udpipe_language": "English",
"lemmagen_language": "English",
"udpipe_tokenizer": False}
params = {
"method": NormalizationModule.Porter,
"snowball_language": "English",
"udpipe_language": "English",
"lemmagen_language": "en",
"udpipe_tokenizer": False,
}
self.assertDictEqual(self.editor.parameters(), params)

def test_set_parameters(self):
params = {"method": NormalizationModule.UDPipe,
"snowball_language": "Dutch",
"udpipe_language": "Slovenian",
"lemmagen_language": "Bulgarian",
"udpipe_tokenizer": True}
params = {
"method": NormalizationModule.UDPipe,
"snowball_language": "Dutch",
"udpipe_language": "Slovenian",
"lemmagen_language": "bg",
"udpipe_tokenizer": True,
}
self.editor.setParameters(params)
self.assertDictEqual(self.editor.parameters(), params)
self.assertEqual(self.combo_sbl.currentText(), "Dutch")
@@ -738,10 +754,6 @@ def test_createinstance(self):
pp = self.editor.createinstance({"method": POSTaggingModule.MaxEnt})
self.assertIsInstance(pp, MaxEntTagger)

# TODO - implement StanfordPOSTagger
# pp = self.editor.createinstance({"method": POSTaggingModule.Stanford})
# self.assertIsInstance(pp, StanfordPOSTagger)

def test_repr(self):
self.assertEqual(str(self.editor), "Averaged Perceptron Tagger")
