From 1fd965730e330c8d2bae795e6cd8ebe10cdbbadf Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Mon, 8 Oct 2018 13:18:22 +0300 Subject: [PATCH 01/12] Add support for voikko analyzer. Fixes #37 --- Pipfile | 1 + annif/analyzer/__init__.py | 2 ++ annif/analyzer/voikko.py | 20 ++++++++++++++++++++ setup.py | 3 ++- tests/test_analyzer.py | 8 +++++++- 5 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 annif/analyzer/voikko.py diff --git a/Pipfile b/Pipfile index 4da47e03b..1cfc256c8 100644 --- a/Pipfile +++ b/Pipfile @@ -23,5 +23,6 @@ gensim = "*" sklearn = "*" fasttextmirror = "*" rdflib = "*" +voikko = "*" [requires] diff --git a/annif/analyzer/__init__.py b/annif/analyzer/__init__.py index 6043ee822..3ef91f32a 100644 --- a/annif/analyzer/__init__.py +++ b/annif/analyzer/__init__.py @@ -3,6 +3,7 @@ import re from . import simple from . import snowball +from . import voikko _analyzers = {} @@ -31,3 +32,4 @@ def get_analyzer(analyzerspec): register_analyzer(simple.SimpleAnalyzer) register_analyzer(snowball.SnowballAnalyzer) +register_analyzer(voikko.VoikkoAnalyzer) diff --git a/annif/analyzer/voikko.py b/annif/analyzer/voikko.py new file mode 100644 index 000000000..30995a7c3 --- /dev/null +++ b/annif/analyzer/voikko.py @@ -0,0 +1,20 @@ +"""Snowball analyzer for Annif, based on nltk Snowball stemmer.""" + +import functools +import voikko.libvoikko +from . import analyzer + + +class VoikkoAnalyzer(analyzer.Analyzer): + name = "voikko" + + def __init__(self, param): + self.param = param + self.voikko = voikko.libvoikko.Voikko(param) + + @functools.lru_cache(maxsize=500000) + def normalize_word(self, word): + result = self.voikko.analyze(word) + if len(result) > 0 and 'BASEFORM' in result[0]: + return result[0]['BASEFORM'] + return word diff --git a/setup.py b/setup.py index b632da50a..ea55429a6 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,8 @@ def read(fname): 'gensim', 'sklearn', 'fasttextmirror', - 'rdflib'], + 'rdflib', + 'voikko'], entry_points={ 'console_scripts': ['annif=annif.cli:cli']}, classifiers=[ diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py index 1e5901594..3b7c3380b 100644 --- a/tests/test_analyzer.py +++ b/tests/test_analyzer.py @@ -69,12 +69,18 @@ def test_swedish_analyzer_normalize_word(): assert analyzer.normalize_word("hundar") == "hund" -def test_finnish_analyzer_normalize_word(): +def test_snowball_finnish_analyzer_normalize_word(): analyzer = annif.analyzer.get_analyzer("snowball(finnish)") assert analyzer.normalize_word("vanhat") == "vanh" assert analyzer.normalize_word("koirien") == "koir" +def test_voikko_finnish_analyzer_normalize_word(): + analyzer = annif.analyzer.get_analyzer("voikko(fi)") + assert analyzer.normalize_word("vanhat") == "vanha" + assert analyzer.normalize_word("koirien") == "koira" + + def test_simple_analyzer(): analyzer = annif.analyzer.get_analyzer("simple") assert analyzer.normalize_word("Big") == "big" From 4d1070dfe2d0f52c86e145a4189a0cee0ae52ecc Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Mon, 8 Oct 2018 13:40:29 +0300 Subject: [PATCH 02/12] Switch test and example configs to use Voikko analyzer for Finnish. Part of #37 --- projects.cfg.dist | 4 ++-- tests/projects.cfg | 6 +++--- tests/test_project.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/projects.cfg.dist b/projects.cfg.dist index c38ba903c..c8513719e 100644 --- a/projects.cfg.dist +++ b/projects.cfg.dist @@ -4,7 +4,7 @@ name=TF-IDF Finnish language=fi backends=tfidf -analyzer=snowball(finnish) +analyzer=voikko(fi) limit=100 vocab=yso-fi @@ -28,7 +28,7 @@ vocab=yso-en name=fastText Finnish language=fi backends=fasttext -analyzer=snowball(finnish) +analyzer=voikko(fi) dim=500 lr=0.25 epoch=30 diff --git a/tests/projects.cfg b/tests/projects.cfg index c86de7881..818f4ab2c 100644 --- a/tests/projects.cfg +++ b/tests/projects.cfg @@ -4,7 +4,7 @@ name=Dummy Finnish language=fi backends=dummy -analyzer=snowball(finnish) +analyzer=voikko(fi) key=value vocab=dummy @@ -52,7 +52,7 @@ vocab=yso-fi name=TF-IDF Finnish language=fi backends=tfidf -analyzer=snowball(finnish) +analyzer=voikko(fi) limit=10 vocab=yso-fi @@ -81,7 +81,7 @@ vocab=yso-en name=fastText Finnish language=fi backends=fasttext -analyzer=snowball(finnish) +analyzer=voikko(fi) dim=100 lr=0.25 epoch=5 diff --git a/tests/test_project.py b/tests/test_project.py index 1d9f54acd..4bcedce73 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -24,8 +24,8 @@ def test_get_project_fi(app): project = annif.project.get_project('dummy-fi') assert project.project_id == 'dummy-fi' assert project.language == 'fi' - assert project.analyzer.name == 'snowball' - assert project.analyzer.param == 'finnish' + assert project.analyzer.name == 'voikko' + assert project.analyzer.param == 'fi' assert len(project.backends) == 1 assert isinstance(project.backends[0][0], annif.backend.dummy.DummyBackend) assert project.backends[0][1] == 1.0 From 3061ba29a1c1bc8f3741c0bf9641d2703d428763 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Mon, 8 Oct 2018 13:48:46 +0300 Subject: [PATCH 03/12] fix docstring --- annif/analyzer/voikko.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/analyzer/voikko.py b/annif/analyzer/voikko.py index 30995a7c3..69f229539 100644 --- a/annif/analyzer/voikko.py +++ b/annif/analyzer/voikko.py @@ -1,4 +1,4 @@ -"""Snowball analyzer for Annif, based on nltk Snowball stemmer.""" +"""Voikko analyzer for Annif, based on libvoikko library.""" import functools import voikko.libvoikko From 4f77397668c4b154a9d25462d2e6565b2f6db1dc Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 15 Jan 2019 12:02:55 +0200 Subject: [PATCH 04/12] Fix pickling issue with Voikko object --- annif/analyzer/voikko.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/annif/analyzer/voikko.py b/annif/analyzer/voikko.py index 69f229539..42cb8db8a 100644 --- a/annif/analyzer/voikko.py +++ b/annif/analyzer/voikko.py @@ -10,10 +10,19 @@ class VoikkoAnalyzer(analyzer.Analyzer): def __init__(self, param): self.param = param - self.voikko = voikko.libvoikko.Voikko(param) + self.voikko = None + + def __getstate__(self): + """Return the state of the object for pickling purposes. The Voikko + instance is set to None because as a ctypes object it cannot be + pickled.""" + + return {'param': self.param, 'voikko': None} @functools.lru_cache(maxsize=500000) def normalize_word(self, word): + if self.voikko is None: + self.voikko = voikko.libvoikko.Voikko(self.param) result = self.voikko.analyze(word) if len(result) > 0 and 'BASEFORM' in result[0]: return result[0]['BASEFORM'] From 8273f3fa62ae01a99f4dd79c160d7792f6d4101e Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 15 Jan 2019 12:10:13 +0200 Subject: [PATCH 05/12] Install libvoikko1 package to Travis, needed by voikko analyzer --- .travis.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.travis.yml b/.travis.yml index c3f4d7885..76f5c258f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,10 @@ python: - '3.5' - '3.6' - '3.7' +addons: + apt: + packages: + - libvoikko1 cache: pip before_install: - export BOTO_CONFIG=/dev/null From 1857dd2077e400e9c4dcd461c3692dd6b9623544 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 15 Jan 2019 12:17:19 +0200 Subject: [PATCH 06/12] add voikko-fi package dependency, needed by voikko analyzer --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 76f5c258f..7a0ab7419 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,6 +8,7 @@ addons: apt: packages: - libvoikko1 + - voikko-fi cache: pip before_install: - export BOTO_CONFIG=/dev/null From 73c6ba2d185da604a9e1282bac1bd7751b7fe977 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 15 Jan 2019 13:19:51 +0200 Subject: [PATCH 07/12] Make the voikko dependency optional --- Pipfile | 1 - setup.py | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Pipfile b/Pipfile index 1cfc256c8..4da47e03b 100644 --- a/Pipfile +++ b/Pipfile @@ -23,6 +23,5 @@ gensim = "*" sklearn = "*" fasttextmirror = "*" rdflib = "*" -voikko = "*" [requires] diff --git a/setup.py b/setup.py index ea55429a6..6f9b4b46e 100644 --- a/setup.py +++ b/setup.py @@ -27,8 +27,10 @@ def read(fname): 'gensim', 'sklearn', 'fasttextmirror', - 'rdflib', - 'voikko'], + 'rdflib'], + extras_require={ + 'voikko': ['voikko'], + }, entry_points={ 'console_scripts': ['annif=annif.cli:cli']}, classifiers=[ From d55d0f032cae7b3eae3f2165b47c099757277281 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 15 Jan 2019 13:31:13 +0200 Subject: [PATCH 08/12] make voikko dependency optional and just disable the functionality and tests if it's not installed --- annif/analyzer/__init__.py | 11 +++++++++-- tests/projects.cfg | 6 +++--- tests/test_analyzer.py | 6 ------ tests/test_project.py | 4 ++-- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/annif/analyzer/__init__.py b/annif/analyzer/__init__.py index 3ef91f32a..b4c66a95c 100644 --- a/annif/analyzer/__init__.py +++ b/annif/analyzer/__init__.py @@ -3,7 +3,7 @@ import re from . import simple from . import snowball -from . import voikko +import annif _analyzers = {} @@ -32,4 +32,11 @@ def get_analyzer(analyzerspec): register_analyzer(simple.SimpleAnalyzer) register_analyzer(snowball.SnowballAnalyzer) -register_analyzer(voikko.VoikkoAnalyzer) + +# Optional analyzers +try: + import voikko + from . import voikko + register_analyzer(voikko.VoikkoAnalyzer) +except ImportError: + annif.logger.debug("voikko not available, not enabling voikko analyzer") diff --git a/tests/projects.cfg b/tests/projects.cfg index 818f4ab2c..c86de7881 100644 --- a/tests/projects.cfg +++ b/tests/projects.cfg @@ -4,7 +4,7 @@ name=Dummy Finnish language=fi backends=dummy -analyzer=voikko(fi) +analyzer=snowball(finnish) key=value vocab=dummy @@ -52,7 +52,7 @@ vocab=yso-fi name=TF-IDF Finnish language=fi backends=tfidf -analyzer=voikko(fi) +analyzer=snowball(finnish) limit=10 vocab=yso-fi @@ -81,7 +81,7 @@ vocab=yso-en name=fastText Finnish language=fi backends=fasttext -analyzer=voikko(fi) +analyzer=snowball(finnish) dim=100 lr=0.25 epoch=5 diff --git a/tests/test_analyzer.py b/tests/test_analyzer.py index 3b7c3380b..c943a7798 100644 --- a/tests/test_analyzer.py +++ b/tests/test_analyzer.py @@ -75,12 +75,6 @@ def test_snowball_finnish_analyzer_normalize_word(): assert analyzer.normalize_word("koirien") == "koir" -def test_voikko_finnish_analyzer_normalize_word(): - analyzer = annif.analyzer.get_analyzer("voikko(fi)") - assert analyzer.normalize_word("vanhat") == "vanha" - assert analyzer.normalize_word("koirien") == "koira" - - def test_simple_analyzer(): analyzer = annif.analyzer.get_analyzer("simple") assert analyzer.normalize_word("Big") == "big" diff --git a/tests/test_project.py b/tests/test_project.py index 4bcedce73..1d9f54acd 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -24,8 +24,8 @@ def test_get_project_fi(app): project = annif.project.get_project('dummy-fi') assert project.project_id == 'dummy-fi' assert project.language == 'fi' - assert project.analyzer.name == 'voikko' - assert project.analyzer.param == 'fi' + assert project.analyzer.name == 'snowball' + assert project.analyzer.param == 'finnish' assert len(project.backends) == 1 assert isinstance(project.backends[0][0], annif.backend.dummy.DummyBackend) assert project.backends[0][1] == 1.0 From c903be4ede3b366821a83684b2a0840923e7f35c Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 15 Jan 2019 13:36:46 +0200 Subject: [PATCH 09/12] add tests for voikko; skipped if voikko is not installed --- tests/test_analyzer_voikko.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 tests/test_analyzer_voikko.py diff --git a/tests/test_analyzer_voikko.py b/tests/test_analyzer_voikko.py new file mode 100644 index 000000000..8bd818dcb --- /dev/null +++ b/tests/test_analyzer_voikko.py @@ -0,0 +1,12 @@ +"""Unit tests for voikko analyzer in Annif""" + +import pytest +import annif.analyzer + +voikko = pytest.importorskip("voikko") + + +def test_voikko_finnish_analyzer_normalize_word(): + analyzer = annif.analyzer.get_analyzer("voikko(fi)") + assert analyzer.normalize_word("vanhat") == "vanha" + assert analyzer.normalize_word("koirien") == "koira" From 78b0facf31bab4b11d6ac8e0c16a1e81106fd69f Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 15 Jan 2019 13:46:22 +0200 Subject: [PATCH 10/12] install voikko python package in Travis environment, so voikko-specific tests will run --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 7a0ab7419..6dc237815 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,6 +16,8 @@ install: - pip install pipenv - pip install --upgrade pytest - pipenv install --dev --skip-lock +# install optional dependencies that were not specified in Pipfile +- pip install voikko - travis_wait 30 python -m nltk.downloader punkt script: - pytest --cov=./ From 3067841a38d7cb10c2f2dcb0ff8901a966bf9341 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 15 Jan 2019 13:54:38 +0200 Subject: [PATCH 11/12] fix import name clash --- annif/analyzer/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/analyzer/__init__.py b/annif/analyzer/__init__.py index b4c66a95c..289c36936 100644 --- a/annif/analyzer/__init__.py +++ b/annif/analyzer/__init__.py @@ -35,7 +35,7 @@ def get_analyzer(analyzerspec): # Optional analyzers try: - import voikko + import voikko as _voikko from . import voikko register_analyzer(voikko.VoikkoAnalyzer) except ImportError: From 3e3bf3077a637c6e1892d96211f52358be37560e Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 15 Jan 2019 14:16:04 +0200 Subject: [PATCH 12/12] expand voikko unit tests for full coverage --- tests/test_analyzer_voikko.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_analyzer_voikko.py b/tests/test_analyzer_voikko.py index 8bd818dcb..f30b22387 100644 --- a/tests/test_analyzer_voikko.py +++ b/tests/test_analyzer_voikko.py @@ -6,7 +6,14 @@ voikko = pytest.importorskip("voikko") +def test_voikko_getstate(): + analyzer = annif.analyzer.get_analyzer("voikko(fi)") + state = analyzer.__getstate__() + assert state == {'param': 'fi', 'voikko': None} + + def test_voikko_finnish_analyzer_normalize_word(): analyzer = annif.analyzer.get_analyzer("voikko(fi)") + assert analyzer.normalize_word("xyzzy") == "xyzzy" assert analyzer.normalize_word("vanhat") == "vanha" assert analyzer.normalize_word("koirien") == "koira"