Skip to content

Commit

Permalink
Merge branch 'master' into issue273-default-values-for-configuration-…
Browse files Browse the repository at this point in the history
…settings
  • Loading branch information
juhoinkinen committed Sep 26, 2019
2 parents 420d3d1 + 81994c4 commit 0df9735
Show file tree
Hide file tree
Showing 15 changed files with 231 additions and 32 deletions.
5 changes: 4 additions & 1 deletion annif/backend/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os.path
import annif.util
from annif.suggestion import SubjectSuggestion, ListSuggestionResult
from annif.exception import NotInitializedException
from annif.exception import NotInitializedException, NotSupportedException
import fastText
from . import backend
from . import mixins
Expand Down Expand Up @@ -117,6 +117,9 @@ def _create_model(self):
self._model.save_model(modelpath)

def train(self, corpus, project):
if corpus.is_empty():
raise NotSupportedException('training backend {} with no documents'
.format(self.backend_id))
self._create_train_file(corpus, project)
self._create_model()

Expand Down
5 changes: 4 additions & 1 deletion annif/backend/pav.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import annif.suggestion
import annif.project
import annif.util
from annif.exception import NotInitializedException
from annif.exception import NotInitializedException, NotSupportedException
from . import ensemble


Expand Down Expand Up @@ -98,6 +98,9 @@ def _create_pav_model(self, source_project_id, min_docs, corpus):
method=joblib.dump)

def train(self, corpus, project):
if corpus.is_empty():
raise NotSupportedException('training backend {} with no documents'
.format(self.backend_id))
self.info("creating PAV models")
sources = annif.util.parse_sources(self.params['sources'])
min_docs = int(self.params['min-docs'])
Expand Down
22 changes: 13 additions & 9 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,14 @@ def open_doc_path(path):
return annif.corpus.DocumentDirectory(path, require_subjects=True)
return annif.corpus.DocumentFile(path)

if len(paths) > 1:
if len(paths) == 0:
logger.warning('Reading empty file')
docs = open_doc_path(os.path.devnull)
elif len(paths) == 1:
docs = open_doc_path(paths[0])
else:
corpora = [open_doc_path(path) for path in paths]
docs = annif.corpus.CombinedCorpus(corpora)
else:
docs = open_doc_path(paths[0])
return docs


Expand Down Expand Up @@ -86,6 +89,7 @@ def common_options(f):
"""Decorator to add common options for all CLI commands"""
f = click.option(
'-p', '--projects', help='Set path to projects.cfg',
type=click.Path(dir_okay=False, exists=True),
callback=set_project_config_file_path, expose_value=False,
is_eager=True)(f)
f = click_log.simple_verbosity_option(logger)(f)
Expand Down Expand Up @@ -136,7 +140,7 @@ def run_clear_project(project_id):

@cli.command('loadvoc')
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(dir_okay=False))
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@common_options
def run_loadvoc(project_id, subjectfile):
"""
Expand All @@ -154,7 +158,7 @@ def run_loadvoc(project_id, subjectfile):

@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@common_options
def run_train(project_id, paths):
"""
Expand All @@ -167,7 +171,7 @@ def run_train(project_id, paths):

@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@common_options
def run_learn(project_id, paths):
"""
Expand Down Expand Up @@ -200,7 +204,7 @@ def run_suggest(project_id, limit, threshold, backend_param):

@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(file_okay=False))
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option(
'--suffix',
default='.annif',
Expand Down Expand Up @@ -241,7 +245,7 @@ def run_index(project_id, directory, suffix, force,

@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
Expand Down Expand Up @@ -275,7 +279,7 @@ def run_eval(project_id, paths, limit, threshold, backend_param):

@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--backend-param', '-b', multiple=True,
help='Backend parameters to override')
@common_options
Expand Down
30 changes: 18 additions & 12 deletions annif/corpus/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
import re
import gzip
import annif.util
from .types import Document, DocumentCorpus
from .types import DocumentCorpus
from .convert import DocumentToSubjectCorpusMixin
from .subject import SubjectSet

logger = annif.logger


class DocumentDirectory(DocumentCorpus, DocumentToSubjectCorpusMixin):
"""A directory of files as a full text document corpus"""
Expand Down Expand Up @@ -56,20 +58,24 @@ def __init__(self, path):
@property
def documents(self):
if self.path.endswith('.gz'):
def opener(path):
"""open a gzip compressed file in text mode"""
return gzip.open(path, mode='rt')
opener = gzip.open
else:
opener = open

with opener(self.path) as tsvfile:
with opener(self.path, mode='rt', encoding='utf-8') as tsvfile:
for line in tsvfile:
text, uris = line.split('\t', maxsplit=1)
subjects = [annif.util.cleanup_uri(uri)
for uri in uris.split()]
yield self._create_document(text=text,
uris=subjects,
labels=[])
yield from self._parse_tsv_line(line)

def _parse_tsv_line(self, line):
    """Yield the document described by one line of a TSV corpus file.

    A valid line holds the document text, a tab, and then
    whitespace-separated subject URIs. A line without any tab is
    reported with a warning and produces no document.
    """
    text, tab, uris = line.partition('\t')
    if not tab:
        # No tab separator found: skip this line instead of failing.
        logger.warning('Skipping invalid line (missing tab): "%s"',
                       line.rstrip())
        return
    subjects = []
    for uri in uris.split():
        subjects.append(annif.util.cleanup_uri(uri))
    yield self._create_document(text=text, uris=subjects, labels=[])


class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin):
Expand Down
8 changes: 8 additions & 0 deletions annif/corpus/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ def _create_document(self, text, uris, labels):

return Document(text=text, uris=uris, labels=labels)

def is_empty(self):
    """Check if there are no documents to iterate.

    Returns True when ``self.documents`` yields nothing and False as
    soon as a first document is available. At most one document is
    pulled from ``self.documents``.
    """
    # iter() makes this work for both iterators/generators and plain
    # iterables such as lists; calling next() directly on a list would
    # raise TypeError.
    docs = iter(self.documents)
    try:
        next(docs)
    except StopIteration:
        return True
    return False


Subject = collections.namedtuple('Subject', 'uri label text')

Expand Down
3 changes: 3 additions & 0 deletions annif/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,9 @@ def _create_vectorizer(self, subjectcorpus):
if not self.backend.needs_subject_vectorizer:
logger.debug('not creating vectorizer: not needed by backend')
return
if subjectcorpus.is_empty():
raise NotSupportedException(
'using TfidfVectorizer with no documents')
logger.info('creating vectorizer')
self._vectorizer = TfidfVectorizer(
tokenizer=self.analyzer.tokenize_words)
Expand Down
1 change: 1 addition & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
[pytest]
addopts = --pep8
markers = pep8
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.41.0
current_version = 0.42.0
commit = True
tag = True

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def read(fname):

setup(
name='annif',
version='0.41.0',
version='0.42.0',
url='https://github.com/NatLibFi/Annif',
author='Osma Suominen',
author_email='osma.suominen@helsinki.fi',
Expand Down
21 changes: 21 additions & 0 deletions tests/test_backend_fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pytest
import annif.backend
import annif.corpus
from annif.exception import NotSupportedException

fasttext = pytest.importorskip("annif.backend.fasttext")

Expand Down Expand Up @@ -68,6 +69,26 @@ def test_fasttext_train_unknown_subject(tmpdir, datadir, project):
assert datadir.join('fasttext-model').size() > 0


def test_fasttext_train_nodocuments(tmpdir, datadir, project):
    """Training the fastText backend on an empty corpus must be refused."""
    backend_class = annif.backend.get_backend("fasttext")
    params = {
        'limit': 50,
        'dim': 100,
        'lr': 0.25,
        'epoch': 20,
        'loss': 'hs',
    }
    backend = backend_class(
        backend_id='fasttext',
        params=params,
        datadir=str(datadir))

    # Corpus backed by a TSV file that contains no lines
    empty_path = str(tmpdir.ensure('empty.tsv'))
    empty_corpus = annif.corpus.DocumentFile(empty_path)

    with pytest.raises(NotSupportedException) as excinfo:
        backend.train(empty_corpus, project)
    expected = 'training backend fasttext with no documents'
    assert expected in str(excinfo.value)


def test_fasttext_suggest(datadir, project):
fasttext_type = annif.backend.get_backend("fasttext")
fasttext = fasttext_type(
Expand Down
17 changes: 17 additions & 0 deletions tests/test_backend_pav.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""Unit tests for the PAV backend in Annif"""

import pytest
import annif.backend
import annif.corpus
from annif.exception import NotSupportedException


def test_pav_default_params(datadir, document_corpus, project):
Expand Down Expand Up @@ -38,6 +40,21 @@ def test_pav_train(app, datadir, tmpdir, project):
assert datadir.join('pav-model-dummy-fi').size() > 0


def test_pav_train_nodocuments(tmpdir, datadir, project):
    """Training the PAV backend on an empty corpus must be refused."""
    backend_class = annif.backend.get_backend("pav")
    params = {'limit': 50, 'min-docs': 2, 'sources': 'dummy-fi'}
    backend = backend_class(
        backend_id='pav',
        params=params,
        datadir=str(datadir))

    # Corpus backed by a TSV file that contains no lines
    empty_path = str(tmpdir.ensure('empty.tsv'))
    empty_corpus = annif.corpus.DocumentFile(empty_path)

    with pytest.raises(NotSupportedException) as excinfo:
        backend.train(empty_corpus, project)
    expected = 'training backend pav with no documents'
    assert expected in str(excinfo.value)


def test_pav_initialize(app, datadir):
pav_type = annif.backend.get_backend("pav")
pav = pav_type(
Expand Down
28 changes: 28 additions & 0 deletions tests/test_backend_vw_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,34 @@ def test_vw_multi_train_and_learn(datadir, document_corpus, project):
assert modelfile.size() != old_size or modelfile.mtime() != old_mtime


def test_vw_multi_train_and_learn_nodocuments(datadir, tmpdir, project):
    """Train, then learn, with an empty corpus on the vw_multi backend.

    The training file must exist but stay empty, and online learning
    must leave the model file size unchanged.
    """
    backend_class = annif.backend.get_backend('vw_multi')
    params = {
        'chunksize': 4,
        'learning_rate': 0.5,
        'loss_function': 'hinge',
    }
    backend = backend_class(
        backend_id='vw_multi',
        params=params,
        datadir=str(datadir))

    # Corpus backed by a TSV file that contains no lines
    empty_path = str(tmpdir.ensure('empty.tsv'))
    empty_corpus = annif.corpus.DocumentFile(empty_path)

    backend.train(empty_corpus, project)
    trainfile = datadir.join('vw-train.txt')
    assert trainfile.exists()
    assert trainfile.size() == 0

    # test online learning
    modelfile = datadir.join('vw-model')
    size_before = modelfile.size()

    backend.learn(empty_corpus, project)

    assert modelfile.size() == size_before
    assert datadir.join('vw-train.txt').size() == 0


def test_vw_multi_train_from_project(app, datadir, document_corpus, project):
vw_type = annif.backend.get_backend('vw_multi')
vw = vw_type(
Expand Down
Loading

0 comments on commit 0df9735

Please sign in to comment.