Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue318 handle missing or invalid path input for commands #322

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion annif/backend/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os.path
import annif.util
from annif.suggestion import SubjectSuggestion, ListSuggestionResult
from annif.exception import NotInitializedException
from annif.exception import NotInitializedException, NotSupportedException
import fastText
from . import backend
from . import mixins
Expand Down Expand Up @@ -104,6 +104,9 @@ def _create_model(self):
self._model.save_model(modelpath)

def train(self, corpus, project):
if corpus.is_empty():
raise NotSupportedException('training backend {} with no documents'
.format(self.backend_id))
self._create_train_file(corpus, project)
self._create_model()

Expand Down
5 changes: 4 additions & 1 deletion annif/backend/pav.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import annif.suggestion
import annif.project
import annif.util
from annif.exception import NotInitializedException
from annif.exception import NotInitializedException, NotSupportedException
from . import ensemble


Expand Down Expand Up @@ -96,6 +96,9 @@ def _create_pav_model(self, source_project_id, min_docs, corpus):
method=joblib.dump)

def train(self, corpus, project):
if corpus.is_empty():
raise NotSupportedException('training backend {} with no documents'
.format(self.backend_id))
self.info("creating PAV models")
sources = annif.util.parse_sources(self.params['sources'])
min_docs = int(self.params['min-docs'])
Expand Down
22 changes: 13 additions & 9 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,14 @@ def open_doc_path(path):
return annif.corpus.DocumentDirectory(path, require_subjects=True)
return annif.corpus.DocumentFile(path)

if len(paths) > 1:
if len(paths) == 0:
logger.warning('Reading empty file')
docs = open_doc_path(os.path.devnull)
elif len(paths) == 1:
docs = open_doc_path(paths[0])
else:
corpora = [open_doc_path(path) for path in paths]
docs = annif.corpus.CombinedCorpus(corpora)
else:
docs = open_doc_path(paths[0])
return docs


Expand Down Expand Up @@ -86,6 +89,7 @@ def common_options(f):
"""Decorator to add common options for all CLI commands"""
f = click.option(
'-p', '--projects', help='Set path to projects.cfg',
type=click.Path(dir_okay=False, exists=True),
callback=set_project_config_file_path, expose_value=False,
is_eager=True)(f)
f = click_log.simple_verbosity_option(logger)(f)
Expand Down Expand Up @@ -136,7 +140,7 @@ def run_clear_project(project_id):

@cli.command('loadvoc')
@click.argument('project_id')
@click.argument('subjectfile', type=click.Path(dir_okay=False))
@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False))
@common_options
def run_loadvoc(project_id, subjectfile):
"""
Expand All @@ -154,7 +158,7 @@ def run_loadvoc(project_id, subjectfile):

@cli.command('train')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@common_options
def run_train(project_id, paths):
"""
Expand All @@ -167,7 +171,7 @@ def run_train(project_id, paths):

@cli.command('learn')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@common_options
def run_learn(project_id, paths):
"""
Expand Down Expand Up @@ -200,7 +204,7 @@ def run_suggest(project_id, limit, threshold, backend_param):

@cli.command('index')
@click.argument('project_id')
@click.argument('directory', type=click.Path(file_okay=False))
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option(
'--suffix',
default='.annif',
Expand Down Expand Up @@ -241,7 +245,7 @@ def run_index(project_id, directory, suffix, force,

@cli.command('eval')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--limit', default=10, help='Maximum number of subjects')
@click.option('--threshold', default=0.0, help='Minimum score threshold')
@click.option('--backend-param', '-b', multiple=True,
Expand Down Expand Up @@ -275,7 +279,7 @@ def run_eval(project_id, paths, limit, threshold, backend_param):

@cli.command('optimize')
@click.argument('project_id')
@click.argument('paths', type=click.Path(), nargs=-1)
@click.argument('paths', type=click.Path(exists=True), nargs=-1)
@click.option('--backend-param', '-b', multiple=True,
help='Backend parameters to override')
@common_options
Expand Down
8 changes: 8 additions & 0 deletions annif/corpus/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ def _create_document(self, text, uris, labels):

return Document(text=text, uris=uris, labels=labels)

def is_empty(self):
"""Check if there are no documents to iterate."""
try:
next(self.documents)
return False
except StopIteration:
return True


Subject = collections.namedtuple('Subject', 'uri label text')

Expand Down
3 changes: 3 additions & 0 deletions annif/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,9 @@ def _create_vectorizer(self, subjectcorpus):
if not self.backend.needs_subject_vectorizer:
logger.debug('not creating vectorizer: not needed by backend')
return
if subjectcorpus.is_empty():
raise NotSupportedException(
'using TfidfVectorizer with no documents')
logger.info('creating vectorizer')
self._vectorizer = TfidfVectorizer(
tokenizer=self.analyzer.tokenize_words)
Expand Down
21 changes: 21 additions & 0 deletions tests/test_backend_fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pytest
import annif.backend
import annif.corpus
from annif.exception import NotSupportedException

fasttext = pytest.importorskip("annif.backend.fasttext")

Expand Down Expand Up @@ -48,6 +49,26 @@ def test_fasttext_train_unknown_subject(tmpdir, datadir, project):
assert datadir.join('fasttext-model').size() > 0


def test_fasttext_train_nodocuments(tmpdir, datadir, project):
fasttext_type = annif.backend.get_backend("fasttext")
fasttext = fasttext_type(
backend_id='fasttext',
params={
'limit': 50,
'dim': 100,
'lr': 0.25,
'epoch': 20,
'loss': 'hs'},
datadir=str(datadir))

empty_file = tmpdir.ensure('empty.tsv')
empty_document_corpus = annif.corpus.DocumentFile(str(empty_file))

with pytest.raises(NotSupportedException) as excinfo:
fasttext.train(empty_document_corpus, project)
assert 'training backend fasttext with no documents' in str(excinfo.value)


def test_fasttext_suggest(datadir, project):
fasttext_type = annif.backend.get_backend("fasttext")
fasttext = fasttext_type(
Expand Down
17 changes: 17 additions & 0 deletions tests/test_backend_pav.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""Unit tests for the PAV backend in Annif"""

import pytest
import annif.backend
import annif.corpus
from annif.exception import NotSupportedException


def test_pav_train(app, datadir, tmpdir, project):
Expand All @@ -23,6 +25,21 @@ def test_pav_train(app, datadir, tmpdir, project):
assert datadir.join('pav-model-dummy-fi').size() > 0


def test_pav_train_nodocuments(tmpdir, datadir, project):
pav_type = annif.backend.get_backend("pav")
pav = pav_type(
backend_id='pav',
params={'limit': 50, 'min-docs': 2, 'sources': 'dummy-fi'},
datadir=str(datadir))

empty_file = tmpdir.ensure('empty.tsv')
empty_document_corpus = annif.corpus.DocumentFile(str(empty_file))

with pytest.raises(NotSupportedException) as excinfo:
pav.train(empty_document_corpus, project)
assert 'training backend pav with no documents' in str(excinfo.value)


def test_pav_initialize(app, datadir):
pav_type = annif.backend.get_backend("pav")
pav = pav_type(
Expand Down
28 changes: 28 additions & 0 deletions tests/test_backend_vw_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,34 @@ def test_vw_multi_train_and_learn(datadir, document_corpus, project):
assert modelfile.size() != old_size or modelfile.mtime() != old_mtime


def test_vw_multi_train_and_learn_nodocuments(datadir, tmpdir, project):
vw_type = annif.backend.get_backend('vw_multi')
vw = vw_type(
backend_id='vw_multi',
params={
'chunksize': 4,
'learning_rate': 0.5,
'loss_function': 'hinge'},
datadir=str(datadir))

empty_file = tmpdir.ensure('empty.tsv')
empty_document_corpus = annif.corpus.DocumentFile(str(empty_file))

vw.train(empty_document_corpus, project)
assert datadir.join('vw-train.txt').exists()
assert datadir.join('vw-train.txt').size() == 0

# test online learning
modelfile = datadir.join('vw-model')

old_size = modelfile.size()

vw.learn(empty_document_corpus, project)

assert modelfile.size() == old_size
assert datadir.join('vw-train.txt').size() == 0


def test_vw_multi_train_from_project(app, datadir, document_corpus, project):
vw_type = annif.backend.get_backend('vw_multi')
vw = vw_type(
Expand Down
86 changes: 79 additions & 7 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import random
import re
import os.path
import pytest
from click.testing import CliRunner
import annif.cli

Expand Down Expand Up @@ -50,13 +51,12 @@ def test_list_projects_config_path_option():


def test_list_projects_config_path_option_nonexistent():
nonexistent_file = "nonexistent.cfg"
result = runner.invoke(
annif.cli.cli, ["list-projects", "--projects", nonexistent_file])
assert not result.exception
assert result.exit_code == 0
assert 'Project configuration file "{}" is missing.'.format(
nonexistent_file) in result.output
failed_result = runner.invoke(
annif.cli.cli, ["list-projects", "--projects", "nonexistent.cfg"])
assert failed_result.exception
assert failed_result.exit_code != 0
assert 'Error: Invalid value for "-p" / "--projects": ' \
'File "nonexistent.cfg" does not exist.' in failed_result.output


def test_show_project():
Expand Down Expand Up @@ -137,6 +137,16 @@ def test_loadvoc_rdf(testdatadir):
assert testdatadir.join('vocabs/yso-fi/subjects').size() > 0


def test_loadvoc_nonexistent_path():
failed_result = runner.invoke(
annif.cli.cli, [
'loadvoc', 'dummy-fi', 'nonexistent_path'])
assert failed_result.exception
assert failed_result.exit_code != 0
assert 'Invalid value for "SUBJECTFILE": ' \
'File "nonexistent_path" does not exist.' in failed_result.output


def test_train(testdatadir):
docfile = os.path.join(
os.path.dirname(__file__),
Expand Down Expand Up @@ -168,6 +178,27 @@ def test_train_multiple(testdatadir):
assert testdatadir.join('projects/tfidf-fi/tfidf-index').size() > 0


def test_train_nonexistent_path():
failed_result = runner.invoke(
annif.cli.cli, [
'train', 'dummy-fi', 'nonexistent_path'])
assert failed_result.exception
assert failed_result.exit_code != 0
assert 'Invalid value for "[PATHS]...": ' \
'Path "nonexistent_path" does not exist.' in failed_result.output


def test_train_no_path(caplog):
logger = annif.logger
logger.propagate = True
result = runner.invoke(
annif.cli.cli, [
'train', 'dummy-fi'])
assert not result.exception
assert result.exit_code == 0
assert 'Reading empty file' == caplog.records[0].message


def test_learn(testdatadir):
docfile = os.path.join(
os.path.dirname(__file__),
Expand All @@ -190,6 +221,16 @@ def test_learn_notsupported(testdatadir):
assert 'Learning not supported' in result.output


def test_learn_nonexistent_path():
failed_result = runner.invoke(
annif.cli.cli, [
'learn', 'dummy-fi', 'nonexistent_path'])
assert failed_result.exception
assert failed_result.exit_code != 0
assert 'Invalid value for "[PATHS]...": ' \
'Path "nonexistent_path" does not exist.' in failed_result.output


def test_suggest():
result = runner.invoke(
annif.cli.cli,
Expand Down Expand Up @@ -259,6 +300,17 @@ def test_index(tmpdir):
'utf-8') == "<http://example.org/dummy>\tdummy\t1.0\n"


def test_index_nonexistent_path():
failed_result = runner.invoke(
annif.cli.cli, [
'index', 'dummy-fi', 'nonexistent_path'])
assert failed_result.exception
assert failed_result.exit_code != 0
assert 'Invalid value for "DIRECTORY": ' \
'Directory "nonexistent_path" does not exist.' \
in failed_result.output


def test_eval_label(tmpdir):
tmpdir.join('doc1.txt').write('doc1')
tmpdir.join('doc1.key').write('dummy')
Expand Down Expand Up @@ -362,6 +414,16 @@ def test_eval_docfile():
assert result.exit_code == 0


def test_eval_nonexistent_path():
failed_result = runner.invoke(
annif.cli.cli, [
'eval', 'dummy-fi', 'nonexistent_path'])
assert failed_result.exception
assert failed_result.exit_code != 0
assert 'Invalid value for "[PATHS]...": ' \
'Path "nonexistent_path" does not exist.' in failed_result.output


def test_optimize_dir(tmpdir):
tmpdir.join('doc1.txt').write('doc1')
tmpdir.join('doc1.key').write('dummy')
Expand Down Expand Up @@ -398,3 +460,13 @@ def test_optimize_docfile(tmpdir):
'optimize', 'dummy-fi', str(docfile)])
assert not result.exception
assert result.exit_code == 0


def test_optimize_nonexistent_path():
failed_result = runner.invoke(
annif.cli.cli, [
'optimize', 'dummy-fi', 'nonexistent_path'])
assert failed_result.exception
assert failed_result.exit_code != 0
assert 'Invalid value for "[PATHS]...": ' \
'Path "nonexistent_path" does not exist.' in failed_result.output
6 changes: 6 additions & 0 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,3 +244,9 @@ def test_docfile_gzipped(tmpdir):

docs = annif.corpus.DocumentFile(str(docfile))
assert len(list(docs.documents)) == 3


def test_docfile_is_empty(tmpdir):
empty_file = tmpdir.ensure('empty.tsv')
docs = annif.corpus.DocumentFile(str(empty_file))
assert docs.is_empty()
Loading