Skip to content

Commit

Permalink
Add support for voikko analyzer. Fixes #37
Browse files Browse the repository at this point in the history
  • Loading branch information
osma committed Oct 8, 2018
1 parent 59ae5b8 commit e2b73ab
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 2 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,6 @@ gensim = "*"
sklearn = "*"
fasttextmirror = "*"
rdflib = "*"
voikko = "*"

[requires]
2 changes: 2 additions & 0 deletions annif/analyzer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re
from . import simple
from . import snowball
from . import voikko

_analyzers = {}

Expand Down Expand Up @@ -31,3 +32,4 @@ def get_analyzer(analyzerspec):

# Hand each analyzer class imported above to register_analyzer, in this
# order. NOTE(review): register_analyzer is not visible in this view;
# presumably it records the class in _analyzers so that get_analyzer can
# look it up -- confirm against its definition.
register_analyzer(simple.SimpleAnalyzer)
register_analyzer(snowball.SnowballAnalyzer)
register_analyzer(voikko.VoikkoAnalyzer)
20 changes: 20 additions & 0 deletions annif/analyzer/voikko.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""Voikko analyzer for Annif, based on the libvoikko Python bindings."""

import functools
import voikko.libvoikko
from . import analyzer


class VoikkoAnalyzer(analyzer.Analyzer):
    """Analyzer that replaces words with the base form reported by Voikko.

    Lookups are memoized per instance, because every call goes through
    the Voikko morphological analysis and the same words tend to recur.
    """

    name = "voikko"

    # Upper bound on the number of memoized words kept by each instance.
    _CACHE_SIZE = 500000

    def __init__(self, param):
        """Create the Voikko handle and the per-instance lookup cache.

        Args:
            param: passed unchanged to ``voikko.libvoikko.Voikko``;
                presumably a language code such as ``"fi"`` -- confirm
                against the analyzer spec parser.
        """
        self.param = param
        self.voikko = voikko.libvoikko.Voikko(param)
        # The cache is bound to this instance instead of decorating the
        # method at class level. A class-level lru_cache is keyed on
        # ``self`` and therefore keeps every analyzer (and its Voikko
        # handle) alive until its entries are evicted, and it makes all
        # instances compete for the same cache slots.
        self._cached_baseform = functools.lru_cache(
            maxsize=self._CACHE_SIZE)(self._lookup_baseform)

    def _lookup_baseform(self, word):
        """Return the BASEFORM of the first Voikko analysis, else *word*."""
        analyses = self.voikko.analyze(word)
        # Only the first analysis is consulted; words without any analysis
        # or without a BASEFORM entry are passed through unchanged.
        if analyses and 'BASEFORM' in analyses[0]:
            return analyses[0]['BASEFORM']
        return word

    def normalize_word(self, word):
        """Return the normalized (base) form of *word*.

        Args:
            word: the word to normalize; it must be hashable because it
                is used as the cache key.

        Returns:
            The ``BASEFORM`` value of the first analysis Voikko returns
            for *word*, or *word* itself when there is none.
        """
        return self._cached_baseform(word)
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def read(fname):
'gensim',
'sklearn',
'fasttextmirror',
'rdflib'],
'rdflib',
'voikko'],
entry_points={
'console_scripts': ['annif=annif.cli:cli']})
8 changes: 7 additions & 1 deletion tests/test_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,18 @@ def test_swedish_analyzer_normalize_word():
assert analyzer.normalize_word("hundar") == "hund"


def test_finnish_analyzer_normalize_word():
def test_snowball_finnish_analyzer_normalize_word():
    """The Finnish Snowball analyzer reduces inflected words to stems."""
    stemmer = annif.analyzer.get_analyzer("snowball(finnish)")
    expectations = (
        ("vanhat", "vanh"),
        ("koirien", "koir"),
    )
    # Checked in the same order as the original assertions.
    for inflected, stem in expectations:
        assert stemmer.normalize_word(inflected) == stem


def test_voikko_finnish_analyzer_normalize_word():
    """The Voikko analyzer maps inflected Finnish words to base forms."""
    lemmatizer = annif.analyzer.get_analyzer("voikko(fi)")
    expectations = (
        ("vanhat", "vanha"),
        ("koirien", "koira"),
    )
    # Checked in the same order as the original assertions.
    for inflected, baseform in expectations:
        assert lemmatizer.normalize_word(inflected) == baseform


def test_simple_analyzer():
analyzer = annif.analyzer.get_analyzer("simple")
assert analyzer.normalize_word("Big") == "big"

0 comments on commit e2b73ab

Please sign in to comment.