Skip to content

Commit

Permalink
Merge pull request #322 from NatLibFi/issue318-handle-missing-or-inva…
Browse files Browse the repository at this point in the history
…lid-path-input-for-commands

Issue318 handle missing or invalid path input for commands
  • Loading branch information
juhoinkinen authored Sep 23, 2019
2 parents a218bf8 + af8c54d commit 81994c4
Show file tree
Hide file tree
Showing 11 changed files with 193 additions and 18 deletions.
5 changes: 4 additions & 1 deletion annif/backend/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os.path
import annif.util
from annif.suggestion import SubjectSuggestion, ListSuggestionResult
from annif.exception import NotInitializedException
from annif.exception import NotInitializedException, NotSupportedException
import fastText
from . import backend
from . import mixins
Expand Down Expand Up @@ -104,6 +104,9 @@ def _create_model(self):
self._model.save_model(modelpath)

def train(self, corpus, project):
if corpus.is_empty():
raise NotSupportedException('training backend {} with no documents'
.format(self.backend_id))
self._create_train_file(corpus, project)
self._create_model()

Expand Down
5 changes: 4 additions & 1 deletion annif/backend/pav.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import annif.suggestion
import annif.project
import annif.util
from annif.exception import NotInitializedException
from annif.exception import NotInitializedException, NotSupportedException
from . import ensemble


Expand Down Expand Up @@ -96,6 +96,9 @@ def _create_pav_model(self, source_project_id, min_docs, corpus):
method=joblib.dump)

def train(self, corpus, project):
if corpus.is_empty():
raise NotSupportedException('training backend {} with no documents'
.format(self.backend_id))
self.info("creating PAV models")
sources = annif.util.parse_sources(self.params['sources'])
min_docs = int(self.params['min-docs'])
Expand Down
22 changes: 13 additions & 9 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,14 @@ def open_doc_path(path):
return annif.corpus.DocumentDirectory(path, require_subjects=True)
return annif.corpus.DocumentFile(path)

if len(paths) > 1:
if len(paths) == 0:
logger.warning('Reading empty file')
docs = open_doc_path(os.path.devnull)
elif len(paths) == 1:
docs = open_doc_path(paths[0])
else:
corpora = [open_doc_path(path) for path in paths]
docs = annif.corpus.CombinedCorpus(corpora)
else:
docs = open_doc_path(paths[0])
return docs


Expand Down Expand Up @@ -86,6 +89,7 @@ def common_options(f):
"""Decorator to add common options for all CLI commands"""
f = click.option(
'-p', '--projects', help='Set path to projects.cfg',
type=click.Path(dir_okay=False, exists=True),
callback=set_project_config_file_path, expose_value=False,
is_eager=True)(f)
f = click_log.simple_verbosity_option(logger)(f)
Expand Down Expand Up @@ -136,7 +140,7 @@ def run_clear_project(project_id):

@cli.command('loadvoc')
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(dir_okay=False))
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@common_options
def run_loadvoc(project_id, subjectfile):
"""
Expand All @@ -154,7 +158,7 @@ def run_loadvoc(project_id, subjectfile):

@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@common_options
def run_train(project_id, paths):
"""
Expand All @@ -167,7 +171,7 @@ def run_train(project_id, paths):

@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@common_options
def run_learn(project_id, paths):
"""
Expand Down Expand Up @@ -200,7 +204,7 @@ def run_suggest(project_id, limit, threshold, backend_param):

@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(file_okay=False))
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option(
'--suffix',
default='.annif',
Expand Down Expand Up @@ -241,7 +245,7 @@ def run_index(project_id, directory, suffix, force,

@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
Expand Down Expand Up @@ -275,7 +279,7 @@ def run_eval(project_id, paths, limit, threshold, backend_param):

@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--backend-param', '-b', multiple=True,
help='Backend parameters to override')
@common_options
Expand Down
8 changes: 8 additions & 0 deletions annif/corpus/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ def _create_document(self, text, uris, labels):

return Document(text=text, uris=uris, labels=labels)

def is_empty(self):
"""Check if there are no documents to iterate."""
try:
next(self.documents)
return False
except StopIteration:
return True


Subject = collections.namedtuple('Subject', 'uri label text')

Expand Down
3 changes: 3 additions & 0 deletions annif/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,9 @@ def _create_vectorizer(self, subjectcorpus):
if not self.backend.needs_subject_vectorizer:
logger.debug('not creating vectorizer: not needed by backend')
return
if subjectcorpus.is_empty():
raise NotSupportedException(
'using TfidfVectorizer with no documents')
logger.info('creating vectorizer')
self._vectorizer = TfidfVectorizer(
tokenizer=self.analyzer.tokenize_words)
Expand Down
21 changes: 21 additions & 0 deletions tests/test_backend_fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pytest
import annif.backend
import annif.corpus
from annif.exception import NotSupportedException

fasttext = pytest.importorskip("annif.backend.fasttext")

Expand Down Expand Up @@ -48,6 +49,26 @@ def test_fasttext_train_unknown_subject(tmpdir, datadir, project):
assert datadir.join('fasttext-model').size() > 0


def test_fasttext_train_nodocuments(tmpdir, datadir, project):
fasttext_type = annif.backend.get_backend("fasttext")
fasttext = fasttext_type(
backend_id='fasttext',
params={
'limit': 50,
'dim': 100,
'lr': 0.25,
'epoch': 20,
'loss': 'hs'},
datadir=str(datadir))

empty_file = tmpdir.ensure('empty.tsv')
empty_document_corpus = annif.corpus.DocumentFile(str(empty_file))

with pytest.raises(NotSupportedException) as excinfo:
fasttext.train(empty_document_corpus, project)
assert 'training backend fasttext with no documents' in str(excinfo.value)


def test_fasttext_suggest(datadir, project):
fasttext_type = annif.backend.get_backend("fasttext")
fasttext = fasttext_type(
Expand Down
17 changes: 17 additions & 0 deletions tests/test_backend_pav.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""Unit tests for the PAV backend in Annif"""

import pytest
import annif.backend
import annif.corpus
from annif.exception import NotSupportedException


def test_pav_train(app, datadir, tmpdir, project):
Expand All @@ -23,6 +25,21 @@ def test_pav_train(app, datadir, tmpdir, project):
assert datadir.join('pav-model-dummy-fi').size() > 0


def test_pav_train_nodocuments(tmpdir, datadir, project):
pav_type = annif.backend.get_backend("pav")
pav = pav_type(
backend_id='pav',
params={'limit': 50, 'min-docs': 2, 'sources': 'dummy-fi'},
datadir=str(datadir))

empty_file = tmpdir.ensure('empty.tsv')
empty_document_corpus = annif.corpus.DocumentFile(str(empty_file))

with pytest.raises(NotSupportedException) as excinfo:
pav.train(empty_document_corpus, project)
assert 'training backend pav with no documents' in str(excinfo.value)


def test_pav_initialize(app, datadir):
pav_type = annif.backend.get_backend("pav")
pav = pav_type(
Expand Down
28 changes: 28 additions & 0 deletions tests/test_backend_vw_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,34 @@ def test_vw_multi_train_and_learn(datadir, document_corpus, project):
assert modelfile.size() != old_size or modelfile.mtime() != old_mtime


def test_vw_multi_train_and_learn_nodocuments(datadir, tmpdir, project):
vw_type = annif.backend.get_backend('vw_multi')
vw = vw_type(
backend_id='vw_multi',
params={
'chunksize': 4,
'learning_rate': 0.5,
'loss_function': 'hinge'},
datadir=str(datadir))

empty_file = tmpdir.ensure('empty.tsv')
empty_document_corpus = annif.corpus.DocumentFile(str(empty_file))

vw.train(empty_document_corpus, project)
assert datadir.join('vw-train.txt').exists()
assert datadir.join('vw-train.txt').size() == 0

# test online learning
modelfile = datadir.join('vw-model')

old_size = modelfile.size()

vw.learn(empty_document_corpus, project)

assert modelfile.size() == old_size
assert datadir.join('vw-train.txt').size() == 0


def test_vw_multi_train_from_project(app, datadir, document_corpus, project):
vw_type = annif.backend.get_backend('vw_multi')
vw = vw_type(
Expand Down
86 changes: 79 additions & 7 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import random
import re
import os.path
import pytest
from click.testing import CliRunner
import annif.cli

Expand Down Expand Up @@ -50,13 +51,12 @@ def test_list_projects_config_path_option():


def test_list_projects_config_path_option_nonexistent():
nonexistent_file = "nonexistent.cfg"
result = runner.invoke(
annif.cli.cli, ["list-projects", "--projects", nonexistent_file])
assert not result.exception
assert result.exit_code == 0
assert 'Project configuration file "{}" is missing.'.format(
nonexistent_file) in result.output
failed_result = runner.invoke(
annif.cli.cli, ["list-projects", "--projects", "nonexistent.cfg"])
assert failed_result.exception
assert failed_result.exit_code != 0
assert 'Error: Invalid value for "-p" / "--projects": ' \
'File "nonexistent.cfg" does not exist.' in failed_result.output


def test_show_project():
Expand Down Expand Up @@ -137,6 +137,16 @@ def test_loadvoc_rdf(testdatadir):
assert testdatadir.join('vocabs/yso-fi/subjects').size() > 0


def test_loadvoc_nonexistent_path():
failed_result = runner.invoke(
annif.cli.cli, [
'loadvoc', 'dummy-fi', 'nonexistent_path'])
assert failed_result.exception
assert failed_result.exit_code != 0
assert 'Invalid value for "SUBJECTFILE": ' \
'File "nonexistent_path" does not exist.' in failed_result.output


def test_train(testdatadir):
docfile = os.path.join(
os.path.dirname(__file__),
Expand Down Expand Up @@ -168,6 +178,27 @@ def test_train_multiple(testdatadir):
assert testdatadir.join('projects/tfidf-fi/tfidf-index').size() > 0


def test_train_nonexistent_path():
failed_result = runner.invoke(
annif.cli.cli, [
'train', 'dummy-fi', 'nonexistent_path'])
assert failed_result.exception
assert failed_result.exit_code != 0
assert 'Invalid value for "[PATHS]...": ' \
'Path "nonexistent_path" does not exist.' in failed_result.output


def test_train_no_path(caplog):
logger = annif.logger
logger.propagate = True
result = runner.invoke(
annif.cli.cli, [
'train', 'dummy-fi'])
assert not result.exception
assert result.exit_code == 0
assert 'Reading empty file' == caplog.records[0].message


def test_learn(testdatadir):
docfile = os.path.join(
os.path.dirname(__file__),
Expand All @@ -190,6 +221,16 @@ def test_learn_notsupported(testdatadir):
assert 'Learning not supported' in result.output


def test_learn_nonexistent_path():
failed_result = runner.invoke(
annif.cli.cli, [
'learn', 'dummy-fi', 'nonexistent_path'])
assert failed_result.exception
assert failed_result.exit_code != 0
assert 'Invalid value for "[PATHS]...": ' \
'Path "nonexistent_path" does not exist.' in failed_result.output


def test_suggest():
result = runner.invoke(
annif.cli.cli,
Expand Down Expand Up @@ -259,6 +300,17 @@ def test_index(tmpdir):
'utf-8') == "<http://example.org/dummy>\tdummy\t1.0\n"


def test_index_nonexistent_path():
failed_result = runner.invoke(
annif.cli.cli, [
'index', 'dummy-fi', 'nonexistent_path'])
assert failed_result.exception
assert failed_result.exit_code != 0
assert 'Invalid value for "DIRECTORY": ' \
'Directory "nonexistent_path" does not exist.' \
in failed_result.output


def test_eval_label(tmpdir):
tmpdir.join('doc1.txt').write('doc1')
tmpdir.join('doc1.key').write('dummy')
Expand Down Expand Up @@ -362,6 +414,16 @@ def test_eval_docfile():
assert result.exit_code == 0


def test_eval_nonexistent_path():
failed_result = runner.invoke(
annif.cli.cli, [
'eval', 'dummy-fi', 'nonexistent_path'])
assert failed_result.exception
assert failed_result.exit_code != 0
assert 'Invalid value for "[PATHS]...": ' \
'Path "nonexistent_path" does not exist.' in failed_result.output


def test_optimize_dir(tmpdir):
tmpdir.join('doc1.txt').write('doc1')
tmpdir.join('doc1.key').write('dummy')
Expand Down Expand Up @@ -398,3 +460,13 @@ def test_optimize_docfile(tmpdir):
'optimize', 'dummy-fi', str(docfile)])
assert not result.exception
assert result.exit_code == 0


def test_optimize_nonexistent_path():
failed_result = runner.invoke(
annif.cli.cli, [
'optimize', 'dummy-fi', 'nonexistent_path'])
assert failed_result.exception
assert failed_result.exit_code != 0
assert 'Invalid value for "[PATHS]...": ' \
'Path "nonexistent_path" does not exist.' in failed_result.output
6 changes: 6 additions & 0 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,3 +244,9 @@ def test_docfile_gzipped(tmpdir):

docs = annif.corpus.DocumentFile(str(docfile))
assert len(list(docs.documents)) == 3


def test_docfile_is_empty(tmpdir):
empty_file = tmpdir.ensure('empty.tsv')
docs = annif.corpus.DocumentFile(str(empty_file))
assert docs.is_empty()
Loading

0 comments on commit 81994c4

Please sign in to comment.