Skip to content

Commit

Permalink
Merge pull request #231 from NatLibFi/issue37-voikko
Browse files Browse the repository at this point in the history
Voikko analyzer
  • Loading branch information
osma authored Jan 15, 2019
2 parents 97af492 + 3e3bf30 commit 9a63b09
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 3 deletions.
7 changes: 7 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,20 @@ python:
- '3.5'
- '3.6'
- '3.7'
addons:
apt:
packages:
- libvoikko1
- voikko-fi
cache: pip
before_install:
- export BOTO_CONFIG=/dev/null
install:
- pip install pipenv
- pip install --upgrade pytest
- pipenv install --dev --skip-lock
# install optional dependencies that were not specified in Pipfile
- pip install voikko
- travis_wait 30 python -m nltk.downloader punkt
script:
- pytest --cov=./
Expand Down
9 changes: 9 additions & 0 deletions annif/analyzer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re
from . import simple
from . import snowball
import annif

# Module-level dict, initially empty.
# NOTE(review): presumably the analyzer registry filled by register_analyzer()
# and read by get_analyzer(); their bodies are not visible here -- confirm.
_analyzers = {}

Expand Down Expand Up @@ -31,3 +32,11 @@ def get_analyzer(analyzerspec):

# Analyzers that are registered unconditionally.
register_analyzer(simple.SimpleAnalyzer)
register_analyzer(snowball.SnowballAnalyzer)

# Optional analyzers
try:
    # Probe for the third-party "voikko" package first. If it is missing, the
    # ImportError handler below skips registration instead of letting the
    # import of this package fail.
    import voikko as _voikko
    # Import the local wrapper module; after this line the name "voikko" in
    # this namespace refers to the sibling module, not the third-party package.
    from . import voikko
    register_analyzer(voikko.VoikkoAnalyzer)
except ImportError:
    # NOTE(review): this also catches an ImportError raised while importing
    # the local wrapper module itself, not only a missing "voikko" package.
    annif.logger.debug("voikko not available, not enabling voikko analyzer")
29 changes: 29 additions & 0 deletions annif/analyzer/voikko.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Voikko analyzer for Annif, based on libvoikko library."""

import functools
import voikko.libvoikko
from . import analyzer


class VoikkoAnalyzer(analyzer.Analyzer):
    """Analyzer that normalizes words to the base form reported by Voikko."""

    name = "voikko"

    def __init__(self, param):
        """Remember the analyzer parameter and defer creating Voikko.

        ``param`` is handed unchanged to ``voikko.libvoikko.Voikko`` the
        first time a word is normalized (the tests in this change use
        ``"fi"``).
        """
        self.voikko = None  # created on first use in normalize_word()
        self.param = param

    def __getstate__(self):
        """Return the picklable state of this analyzer.

        The Voikko handle is a ctypes object and cannot be pickled, so the
        state always carries ``None`` in its place; the handle is created
        again on demand by normalize_word().
        """
        picklable_state = {'param': self.param, 'voikko': None}
        return picklable_state

    # NOTE(review): lru_cache on a method builds one cache shared by all
    # instances and keyed on (self, word); it keeps strong references to every
    # instance that has been used for a lookup until its entries are evicted.
    @functools.lru_cache(maxsize=500000)
    def normalize_word(self, word):
        """Return the base form of ``word``, or ``word`` itself as fallback.

        The first analysis returned by Voikko is used. When Voikko returns
        no analyses, or the first one has no 'BASEFORM' entry, the word is
        returned unchanged.
        """
        if self.voikko is None:
            # Lazy initialization: also covers instances restored from a
            # pickle, whose state has the handle set to None.
            self.voikko = voikko.libvoikko.Voikko(self.param)

        analyses = self.voikko.analyze(word)
        if len(analyses) == 0:
            return word

        first_analysis = analyses[0]
        if 'BASEFORM' not in first_analysis:
            return word
        return first_analysis['BASEFORM']
4 changes: 2 additions & 2 deletions projects.cfg.dist
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
name=TF-IDF Finnish
language=fi
backends=tfidf
analyzer=snowball(finnish)
analyzer=voikko(fi)
limit=100
vocab=yso-fi

Expand All @@ -28,7 +28,7 @@ vocab=yso-en
name=fastText Finnish
language=fi
backends=fasttext
analyzer=snowball(finnish)
analyzer=voikko(fi)
dim=500
lr=0.25
epoch=30
Expand Down
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ def read(fname):
'sklearn',
'fasttextmirror',
'rdflib'],
extras_require={
'voikko': ['voikko'],
},
entry_points={
'console_scripts': ['annif=annif.cli:cli']},
classifiers=[
Expand Down
2 changes: 1 addition & 1 deletion tests/test_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def test_swedish_analyzer_normalize_word():
assert analyzer.normalize_word("hundar") == "hund"


def test_finnish_analyzer_normalize_word():
def test_snowball_finnish_analyzer_normalize_word():
analyzer = annif.analyzer.get_analyzer("snowball(finnish)")
assert analyzer.normalize_word("vanhat") == "vanh"
assert analyzer.normalize_word("koirien") == "koir"
Expand Down
19 changes: 19 additions & 0 deletions tests/test_analyzer_voikko.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""Unit tests for voikko analyzer in Annif"""

import pytest
import annif.analyzer

voikko = pytest.importorskip("voikko")


def test_voikko_getstate():
    """The pickling state holds the parameter and no live Voikko handle."""
    expected_state = {'param': 'fi', 'voikko': None}
    voikko_analyzer = annif.analyzer.get_analyzer("voikko(fi)")
    assert voikko_analyzer.__getstate__() == expected_state


def test_voikko_finnish_analyzer_normalize_word():
    """Check base forms produced by the Finnish Voikko analyzer.

    The nonsense word "xyzzy" is expected to come back unchanged, while the
    inflected Finnish words are expected to be reduced to their base forms.
    """
    normalize = annif.analyzer.get_analyzer("voikko(fi)").normalize_word
    expected_baseforms = (
        ("xyzzy", "xyzzy"),
        ("vanhat", "vanha"),
        ("koirien", "koira"),
    )
    # Checked in the same order as listed; the first mismatch fails the test.
    for inflected, baseform in expected_baseforms:
        assert normalize(inflected) == baseform

0 comments on commit 9a63b09

Please sign in to comment.