diff --git a/Pipfile b/Pipfile
index 4da47e03b..1cfc256c8 100644
--- a/Pipfile
+++ b/Pipfile
@@ -23,5 +23,6 @@ gensim = "*"
 sklearn = "*"
 fasttextmirror = "*"
 rdflib = "*"
+voikko = "*"
 
 [requires]
diff --git a/annif/analyzer/__init__.py b/annif/analyzer/__init__.py
index 6043ee822..3ef91f32a 100644
--- a/annif/analyzer/__init__.py
+++ b/annif/analyzer/__init__.py
@@ -3,6 +3,7 @@
 import re
 from . import simple
 from . import snowball
+from . import voikko
 
 _analyzers = {}
 
@@ -31,3 +32,4 @@ def get_analyzer(analyzerspec):
 
 register_analyzer(simple.SimpleAnalyzer)
 register_analyzer(snowball.SnowballAnalyzer)
+register_analyzer(voikko.VoikkoAnalyzer)
diff --git a/annif/analyzer/voikko.py b/annif/analyzer/voikko.py
new file mode 100644
index 000000000..30995a7c3
--- /dev/null
+++ b/annif/analyzer/voikko.py
@@ -0,0 +1,25 @@
+"""Voikko analyzer for Annif, based on libvoikko morphological analysis."""
+
+import functools
+import voikko.libvoikko
+from . import analyzer
+
+
+class VoikkoAnalyzer(analyzer.Analyzer):
+    """Analyzer that normalizes words to their Voikko base form."""
+
+    name = "voikko"
+
+    def __init__(self, param):
+        """Create the Voikko instance from the analyzer spec parameter."""
+        self.param = param
+        self.voikko = voikko.libvoikko.Voikko(param)
+
+    @functools.lru_cache(maxsize=500000)
+    def normalize_word(self, word):
+        """Return Voikko's first BASEFORM for word, or word itself."""
+        result = self.voikko.analyze(word)
+        if result and 'BASEFORM' in result[0]:
+            return result[0]['BASEFORM']
+        # No usable analysis: fall back to the word unchanged
+        return word
diff --git a/setup.py b/setup.py
index b632da50a..ea55429a6 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,8 @@ def read(fname):
         'gensim',
         'sklearn',
         'fasttextmirror',
-        'rdflib'],
+        'rdflib',
+        'voikko'],
     entry_points={
         'console_scripts': ['annif=annif.cli:cli']},
     classifiers=[
diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py
index 1e5901594..3b7c3380b 100644
--- a/tests/test_analyzer.py
+++ b/tests/test_analyzer.py
@@ -69,12 +69,18 @@ def test_swedish_analyzer_normalize_word():
     assert analyzer.normalize_word("hundar") == "hund"
 
 
-def test_finnish_analyzer_normalize_word():
+def test_snowball_finnish_analyzer_normalize_word():
     analyzer = annif.analyzer.get_analyzer("snowball(finnish)")
     assert analyzer.normalize_word("vanhat") == "vanh"
     assert analyzer.normalize_word("koirien") == "koir"
 
 
+def test_voikko_finnish_analyzer_normalize_word():
+    analyzer = annif.analyzer.get_analyzer("voikko(fi)")
+    assert analyzer.normalize_word("vanhat") == "vanha"
+    assert analyzer.normalize_word("koirien") == "koira"
+
+
 def test_simple_analyzer():
     analyzer = annif.analyzer.get_analyzer("simple")
     assert analyzer.normalize_word("Big") == "big"