From c7e56ab88f2f4d0e687f47c42c1c26185e43c1a3 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Mon, 26 Aug 2019 14:04:32 +0300 Subject: [PATCH 01/16] Click existence checks for path arguments --- annif/cli.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index 1e2c96b49..b15631627 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -136,7 +136,7 @@ def run_clear_project(project_id): @cli.command('loadvoc') @click.argument('project_id') -@click.argument('subjectfile', type=click.Path(dir_okay=False)) +@click.argument('subjectfile', type=click.Path(exists=True, dir_okay=False)) @common_options def run_loadvoc(project_id, subjectfile): """ @@ -154,7 +154,7 @@ def run_loadvoc(project_id, subjectfile): @cli.command('train') @click.argument('project_id') -@click.argument('paths', type=click.Path(), nargs=-1) +@click.argument('paths', type=click.Path(exists=True), nargs=-1) @common_options def run_train(project_id, paths): """ @@ -167,7 +167,7 @@ def run_train(project_id, paths): @cli.command('learn') @click.argument('project_id') -@click.argument('paths', type=click.Path(), nargs=-1) +@click.argument('paths', type=click.Path(exists=True), nargs=-1) @common_options def run_learn(project_id, paths): """ @@ -200,7 +200,7 @@ def run_suggest(project_id, limit, threshold, backend_param): @cli.command('index') @click.argument('project_id') -@click.argument('directory', type=click.Path(file_okay=False)) +@click.argument('directory', type=click.Path(exists=True, file_okay=False)) @click.option( '--suffix', default='.annif', @@ -241,7 +241,7 @@ def run_index(project_id, directory, suffix, force, @cli.command('eval') @click.argument('project_id') -@click.argument('paths', type=click.Path(), nargs=-1) +@click.argument('paths', type=click.Path(exists=True), nargs=-1) @click.option('--limit', default=10, help='Maximum number of subjects') @click.option('--threshold', default=0.0, help='Minimum score threshold') @click.option('--backend-param', '-b', multiple=True, @@ -275,7 +275,7 @@ def run_eval(project_id, paths, limit, threshold, backend_param): @cli.command('optimize') @click.argument('project_id') -@click.argument('paths', type=click.Path(), nargs=-1) +@click.argument('paths', type=click.Path(exists=True), nargs=-1) @click.option('--backend-param', '-b', multiple=True, help='Backend parameters to override') @common_options From 67840a944dcae55dc0ba23949c300e7b10dd0947 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Mon, 26 Aug 2019 15:20:43 +0300 Subject: [PATCH 02/16] Tests for Click existence checks for path arguments --- tests/test_cli.py | 61 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index c9e8a1fc1..189c32e11 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -137,6 +137,16 @@ def test_loadvoc_rdf(testdatadir): assert testdatadir.join('vocabs/yso-fi/subjects').size() > 0 +def test_loadvoc_nonexistent_path(): + failed_result = runner.invoke( + annif.cli.cli, [ + 'loadvoc', 'dummy-fi', 'nonexistent_path']) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert 'Invalid value for "SUBJECTFILE": ' \ + 'File "nonexistent_path" does not exist.' in failed_result.output + + def test_train(testdatadir): docfile = os.path.join( os.path.dirname(__file__), @@ -168,6 +178,16 @@ def test_train_multiple(testdatadir): assert testdatadir.join('projects/tfidf-fi/tfidf-index').size() > 0 +def test_train_nonexistent_path(): + failed_result = runner.invoke( + annif.cli.cli, [ + 'train', 'dummy-fi', 'nonexistent_path']) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert 'Invalid value for "[PATHS]...": ' \ + 'Path "nonexistent_path" does not exist.' in failed_result.output + + def test_learn(testdatadir): docfile = os.path.join( os.path.dirname(__file__), @@ -190,6 +210,16 @@ def test_learn_notsupported(testdatadir): assert 'Learning not supported' in result.output +def test_learn_nonexistent_path(): + failed_result = runner.invoke( + annif.cli.cli, [ + 'learn', 'dummy-fi', 'nonexistent_path']) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert 'Invalid value for "[PATHS]...": ' \ + 'Path "nonexistent_path" does not exist.' in failed_result.output + + def test_suggest(): result = runner.invoke( annif.cli.cli, @@ -259,6 +289,17 @@ def test_index(tmpdir): 'utf-8') == "\tdummy\t1.0\n" +def test_index_nonexistent_path(): + failed_result = runner.invoke( + annif.cli.cli, [ + 'index', 'dummy-fi', 'nonexistent_path']) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert 'Invalid value for "DIRECTORY": ' \ + 'Directory "nonexistent_path" does not exist.' \ + in failed_result.output + + def test_eval_label(tmpdir): tmpdir.join('doc1.txt').write('doc1') tmpdir.join('doc1.key').write('dummy') @@ -362,6 +403,16 @@ def test_eval_docfile(): assert result.exit_code == 0 +def test_eval_nonexistent_path(): + failed_result = runner.invoke( + annif.cli.cli, [ + 'eval', 'dummy-fi', 'nonexistent_path']) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert 'Invalid value for "[PATHS]...": ' \ + 'Path "nonexistent_path" does not exist.' in failed_result.output + + def test_optimize_dir(tmpdir): tmpdir.join('doc1.txt').write('doc1') tmpdir.join('doc1.key').write('dummy') @@ -398,3 +449,13 @@ def test_optimize_docfile(tmpdir): 'optimize', 'dummy-fi', str(docfile)]) assert not result.exception assert result.exit_code == 0 + + +def test_optimize_nonexistent_path(): + failed_result = runner.invoke( + annif.cli.cli, [ + 'optimize', 'dummy-fi', 'nonexistent_path']) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert 'Invalid value for "[PATHS]...": ' \ + 'Path "nonexistent_path" does not exist.' in failed_result.output From 0c61e3349ecdbecdce9538c9e7063af7909c79a3 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Mon, 26 Aug 2019 15:34:48 +0300 Subject: [PATCH 03/16] Set type; corrects output from --help flag for the option: TEXT->PATH --- annif/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/cli.py b/annif/cli.py index b15631627..18c4a1477 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -85,7 +85,7 @@ def set_project_config_file_path(ctx, param, value): def common_options(f): """Decorator to add common options for all CLI commands""" f = click.option( - '-p', '--projects', help='Set path to projects.cfg', + '-p', '--projects', help='Set path to projects.cfg', type=click.Path(), callback=set_project_config_file_path, expose_value=False, is_eager=True)(f) f = click_log.simple_verbosity_option(logger)(f) From a78fafc9a70e17a399bf6369225902ba313c4091 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Tue, 27 Aug 2019 09:36:40 +0300 Subject: [PATCH 04/16] Use /dev/null in case of missing training file; allows training an empty model --- annif/cli.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index 18c4a1477..b305d1bf7 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -46,11 +46,14 @@ def open_doc_path(path): return annif.corpus.DocumentDirectory(path, require_subjects=True) return annif.corpus.DocumentFile(path) - if len(paths) > 1: + if len(paths) == 0: + logger.warning('Creating empty model') + docs = open_doc_path('/dev/null') + elif len(paths) == 1: + docs = open_doc_path(paths[0]) + elif len(paths) > 1: corpora = [open_doc_path(path) for path in paths] docs = annif.corpus.CombinedCorpus(corpora) - else: - docs = open_doc_path(paths[0]) return docs From f45683abc72da045c403da0ed49b9b83abd91755 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Tue, 27 Aug 2019 10:20:55 +0300 Subject: [PATCH 05/16] Click existence check for projects.cfg path option (--projects) --- annif/cli.py | 3 ++- tests/test_cli.py | 13 ++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index b305d1bf7..d10a47716 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -88,7 +88,8 @@ def set_project_config_file_path(ctx, param, value): def common_options(f): """Decorator to add common options for all CLI commands""" f = click.option( - '-p', '--projects', help='Set path to projects.cfg', type=click.Path(), + '-p', '--projects', help='Set path to projects.cfg', + type=click.Path(dir_okay=False, exists=True), callback=set_project_config_file_path, expose_value=False, is_eager=True)(f) f = click_log.simple_verbosity_option(logger)(f) diff --git a/tests/test_cli.py b/tests/test_cli.py index 189c32e11..4710749bf 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -50,13 +50,12 @@ def test_list_projects_config_path_option(): def test_list_projects_config_path_option_nonexistent(): - nonexistent_file = "nonexistent.cfg" - result = runner.invoke( - annif.cli.cli, ["list-projects", "--projects", nonexistent_file]) - assert not result.exception - assert result.exit_code == 0 - assert 'Project configuration file "{}" is missing.'.format( - nonexistent_file) in result.output + failed_result = runner.invoke( + annif.cli.cli, ["list-projects", "--projects", "nonexistent.cfg"]) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert 'Error: Invalid value for "-p" / "--projects": ' \ + 'File "nonexistent.cfg" does not exist.' in failed_result.output def test_show_project(): From 7a3c53abb336200228a2ffaaba0d043e2ddd89f5 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Mon, 2 Sep 2019 13:56:55 +0300 Subject: [PATCH 06/16] Use cross-platform null device file --- annif/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/cli.py b/annif/cli.py index d10a47716..962412cff 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -48,7 +48,7 @@ def open_doc_path(path): if len(paths) == 0: logger.warning('Creating empty model') - docs = open_doc_path('/dev/null') + docs = open_doc_path(os.path.devnull) elif len(paths) == 1: docs = open_doc_path(paths[0]) elif len(paths) > 1: From 8939d6b120586b1282def71779f5476fe9defaa0 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 4 Sep 2019 13:39:17 +0300 Subject: [PATCH 07/16] Emptiness check for documents generator raising necessary NotSupportedExcs --- annif/backend/fasttext.py | 5 ++++- annif/corpus/types.py | 8 ++++++++ annif/project.py | 3 +++ tests/projects.cfg | 10 ++++++++++ tests/test_cli.py | 37 +++++++++++++++++++++++++++++++++++++ 5 files changed, 62 insertions(+), 1 deletion(-) diff --git a/annif/backend/fasttext.py b/annif/backend/fasttext.py index d64e70916..5028f57c5 100644 --- a/annif/backend/fasttext.py +++ b/annif/backend/fasttext.py @@ -4,7 +4,7 @@ import os.path import annif.util from annif.suggestion import SubjectSuggestion, ListSuggestionResult -from annif.exception import NotInitializedException +from annif.exception import NotInitializedException, NotSupportedException import fastText from . import backend from . import mixins @@ -104,6 +104,9 @@ def _create_model(self): self._model.save_model(modelpath) def train(self, corpus, project): + if corpus.are_documents_empty(): + raise NotSupportedException('training backend {} with no documents' + .format(self.backend_id)) self._create_train_file(corpus, project) self._create_model() diff --git a/annif/corpus/types.py b/annif/corpus/types.py index d79e51947..e3b3c4f25 100644 --- a/annif/corpus/types.py +++ b/annif/corpus/types.py @@ -37,6 +37,14 @@ def _create_document(self, text, uris, labels): return Document(text=text, uris=uris, labels=labels) + def are_documents_empty(self): + """Check if there are no documents to iterate.""" + try: + next(self.documents) + return False + except StopIteration: + return True + Subject = collections.namedtuple('Subject', 'uri label text') diff --git a/annif/project.py b/annif/project.py index ae99fd741..1e5984e01 100644 --- a/annif/project.py +++ b/annif/project.py @@ -193,6 +193,9 @@ def _create_vectorizer(self, subjectcorpus): if not self.backend.needs_subject_vectorizer: logger.debug('not creating vectorizer: not needed by backend') return + if subjectcorpus.are_documents_empty(): + raise NotSupportedException( + 'using TfidfVectorizer with no documents') logger.info('creating vectorizer') self._vectorizer = TfidfVectorizer( tokenizer=self.analyzer.tokenize_words) diff --git a/tests/projects.cfg b/tests/projects.cfg index eaccea36f..c39152e12 100644 --- a/tests/projects.cfg +++ b/tests/projects.cfg @@ -98,3 +98,13 @@ loss=hs limit=100 chunksize=24 vocab=yso-fi + +[vw-multi-fi] +name=vw_multi Finnish +language=fi +backend=vw_multi +analyzer=snowball(finnish) +bit_precision=1 +limit=100 +chunksize=2 +vocab=yso-fi diff --git a/tests/test_cli.py b/tests/test_cli.py index 4710749bf..561d9203d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -187,6 +187,43 @@ def test_train_nonexistent_path(): 'Path "nonexistent_path" does not exist.' in failed_result.output +def test_train_no_path_vw(caplog): + logger = annif.logger + logger.propagate = True + result = runner.invoke( + annif.cli.cli, [ + 'train', 'vw-multi-fi']) + assert not result.exception + assert result.exit_code == 0 + assert 'Creating empty model' == caplog.records[0].message + + +def test_train_no_path_tfidf(caplog): + logger = annif.logger + logger.propagate = True + failed_result = runner.invoke( + annif.cli.cli, [ + 'train', 'tfidf-fi']) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert 'Creating empty model' == caplog.records[0].message + assert 'Not supported: using TfidfVectorizer with no documents' \ + in failed_result.output + + +def test_train_no_path_fasttext(caplog): + logger = annif.logger + logger.propagate = True + failed_result = runner.invoke( + annif.cli.cli, [ + 'train', 'fasttext-fi']) + assert failed_result.exception + assert failed_result.exit_code != 0 + assert 'Creating empty model' == caplog.records[0].message + assert 'Not supported: training backend fasttext with no documents' \ + in failed_result.output + + def test_learn(testdatadir): docfile = os.path.join( os.path.dirname(__file__), From c2b3c28b9c9f3e011f4dd1fa8ccefc74d33135f5 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 4 Sep 2019 13:47:27 +0300 Subject: [PATCH 08/16] Test for emptiness check for documents generator --- tests/test_corpus.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 4f7928aff..aeb67fa76 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -244,3 +244,11 @@ def test_docfile_gzipped(tmpdir): docs = annif.corpus.DocumentFile(str(docfile)) assert len(list(docs.documents)) == 3 + + +def test_docfile_are_documents_empty(): + import os.path + docfile = os.path.devnull + + docs = annif.corpus.DocumentFile(str(docfile)) + assert docs.are_documents_empty() From f3fdf88197cdcfb7b56fff1b0b7cd81977a02f60 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 4 Sep 2019 14:20:51 +0300 Subject: [PATCH 09/16] Skip tests that use vw and fasttext if those packages are not available --- tests/test_cli.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index 561d9203d..1a4ba3056 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -4,6 +4,7 @@ import random import re import os.path +import pytest from click.testing import CliRunner import annif.cli @@ -188,6 +189,7 @@ def test_train_nonexistent_path(): def test_train_no_path_vw(caplog): + pytest.importorskip('annif.backend.vw_multi') logger = annif.logger logger.propagate = True result = runner.invoke( @@ -212,6 +214,7 @@ def test_train_no_path_tfidf(caplog): def test_train_no_path_fasttext(caplog): + pytest.importorskip('annif.backend.fasttext') logger = annif.logger logger.propagate = True failed_result = runner.invoke( From a7f367ebeb1158bbfb8555f94a812e9655741b00 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Thu, 5 Sep 2019 15:26:07 +0300 Subject: [PATCH 10/16] More general warning: applies also to learn method --- annif/cli.py | 2 +- tests/test_cli.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index 962412cff..2c13dbc6d 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -47,7 +47,7 @@ def open_doc_path(path): return annif.corpus.DocumentFile(path) if len(paths) == 0: - logger.warning('Creating empty model') + logger.warning('Reading empty file') docs = open_doc_path(os.path.devnull) elif len(paths) == 1: docs = open_doc_path(paths[0]) diff --git a/tests/test_cli.py b/tests/test_cli.py index 1a4ba3056..c10cfc179 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -197,7 +197,7 @@ def test_train_no_path_vw(caplog): 'train', 'vw-multi-fi']) assert not result.exception assert result.exit_code == 0 - assert 'Creating empty model' == caplog.records[0].message + assert 'Reading empty file' == caplog.records[0].message def test_train_no_path_tfidf(caplog): @@ -208,7 +208,7 @@ def test_train_no_path_tfidf(caplog): 'train', 'tfidf-fi']) assert failed_result.exception assert failed_result.exit_code != 0 - assert 'Creating empty model' == caplog.records[0].message + assert 'Reading empty file' == caplog.records[0].message assert 'Not supported: using TfidfVectorizer with no documents' \ in failed_result.output @@ -222,7 +222,7 @@ def test_train_no_path_fasttext(caplog): 'train', 'fasttext-fi']) assert failed_result.exception assert failed_result.exit_code != 0 - assert 'Creating empty model' == caplog.records[0].message + assert 'Reading empty file' == caplog.records[0].message assert 'Not supported: training backend fasttext with no documents' \ in failed_result.output From 46cb52999c084738c6d13a4dc54b5272d6780c74 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Thu, 5 Sep 2019 17:34:08 +0300 Subject: [PATCH 11/16] Refactor and add tests --- tests/test_backend_fasttext.py | 21 +++++++++++++++++++++ tests/test_backend_vw_multi.py | 28 ++++++++++++++++++++++++++++ tests/test_cli.py | 32 ++++++++------------------------ tests/test_corpus.py | 8 +++----- tests/test_project.py | 10 ++++++++++ 5 files changed, 70 insertions(+), 29 deletions(-) diff --git a/tests/test_backend_fasttext.py b/tests/test_backend_fasttext.py index 6f4007cd3..8afe14b25 100644 --- a/tests/test_backend_fasttext.py +++ b/tests/test_backend_fasttext.py @@ -3,6 +3,7 @@ import pytest import annif.backend import annif.corpus +from annif.exception import NotSupportedException fasttext = pytest.importorskip("annif.backend.fasttext") @@ -48,6 +49,26 @@ def test_fasttext_train_unknown_subject(tmpdir, datadir, project): assert datadir.join('fasttext-model').size() > 0 +def test_fasttext_train_nodocuments(tmpdir, datadir, project): + fasttext_type = annif.backend.get_backend("fasttext") + fasttext = fasttext_type( + backend_id='fasttext', + params={ + 'limit': 50, + 'dim': 100, + 'lr': 0.25, + 'epoch': 20, + 'loss': 'hs'}, + datadir=str(datadir)) + + empty_file = tmpdir.ensure('empty.tsv') + empty_document_corpus = annif.corpus.DocumentFile(str(empty_file)) + + with pytest.raises(NotSupportedException) as excinfo: + fasttext.train(empty_document_corpus, project) + assert 'training backend fasttext with no documents' in str(excinfo.value) + + def test_fasttext_suggest(datadir, project): fasttext_type = annif.backend.get_backend("fasttext") fasttext = fasttext_type( diff --git a/tests/test_backend_vw_multi.py b/tests/test_backend_vw_multi.py index 56b007cf8..88e6bbcf7 100644 --- a/tests/test_backend_vw_multi.py +++ b/tests/test_backend_vw_multi.py @@ -55,6 +55,34 @@ def test_vw_multi_train_and_learn(datadir, document_corpus, project): assert modelfile.size() != old_size or modelfile.mtime() != old_mtime +def test_vw_multi_train_and_learn_nodocuments(datadir, tmpdir, project): + vw_type = annif.backend.get_backend('vw_multi') + vw = vw_type( + backend_id='vw_multi', + params={ + 'chunksize': 4, + 'learning_rate': 0.5, + 'loss_function': 'hinge'}, + datadir=str(datadir)) + + empty_file = tmpdir.ensure('empty.tsv') + empty_document_corpus = annif.corpus.DocumentFile(str(empty_file)) + + vw.train(empty_document_corpus, project) + assert datadir.join('vw-train.txt').exists() + assert datadir.join('vw-train.txt').size() == 0 + + # test online learning + modelfile = datadir.join('vw-model') + + old_size = modelfile.size() + + vw.learn(empty_document_corpus, project) + + assert modelfile.size() == old_size + assert datadir.join('vw-train.txt').size() == 0 + + def test_vw_multi_train_from_project(app, datadir, document_corpus, project): vw_type = annif.backend.get_backend('vw_multi') vw = vw_type( diff --git a/tests/test_cli.py b/tests/test_cli.py index c10cfc179..b85272a4e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -188,43 +188,27 @@ def test_train_nonexistent_path(): 'Path "nonexistent_path" does not exist.' in failed_result.output -def test_train_no_path_vw(caplog): - pytest.importorskip('annif.backend.vw_multi') +def test_train_no_path(caplog): logger = annif.logger logger.propagate = True result = runner.invoke( annif.cli.cli, [ - 'train', 'vw-multi-fi']) + 'train', 'dummy-fi']) assert not result.exception assert result.exit_code == 0 assert 'Reading empty file' == caplog.records[0].message -def test_train_no_path_tfidf(caplog): - logger = annif.logger - logger.propagate = True - failed_result = runner.invoke( - annif.cli.cli, [ - 'train', 'tfidf-fi']) - assert failed_result.exception - assert failed_result.exit_code != 0 - assert 'Reading empty file' == caplog.records[0].message - assert 'Not supported: using TfidfVectorizer with no documents' \ - in failed_result.output - - -def test_train_no_path_fasttext(caplog): - pytest.importorskip('annif.backend.fasttext') +def test_train_no_path_vw(caplog): + pytest.importorskip('annif.backend.vw_multi') logger = annif.logger logger.propagate = True - failed_result = runner.invoke( + result = runner.invoke( annif.cli.cli, [ - 'train', 'fasttext-fi']) - assert failed_result.exception - assert failed_result.exit_code != 0 + 'train', 'vw-multi-fi']) + assert not result.exception + assert result.exit_code == 0 assert 'Reading empty file' == caplog.records[0].message - assert 'Not supported: training backend fasttext with no documents' \ - in failed_result.output def test_learn(testdatadir): diff --git a/tests/test_corpus.py b/tests/test_corpus.py index aeb67fa76..260c51402 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -246,9 +246,7 @@ def test_docfile_gzipped(tmpdir): assert len(list(docs.documents)) == 3 -def test_docfile_are_documents_empty(): - import os.path - docfile = os.path.devnull - - docs = annif.corpus.DocumentFile(str(docfile)) +def test_docfile_are_documents_empty(tmpdir): + empty_file = tmpdir.ensure('empty.tsv') + docs = annif.corpus.DocumentFile(str(empty_file)) assert docs.are_documents_empty() diff --git a/tests/test_project.py b/tests/test_project.py index 331aedf7f..787aa940a 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -113,6 +113,16 @@ def test_project_train_tfidf(app, document_corpus, testdatadir): assert testdatadir.join('projects/tfidf-fi/tfidf-index').size() > 0 +def test_project_train_tfidf_nodocuments(app, tmpdir): + with app.app_context(): + project = annif.project.get_project('tfidf-fi') + empty_file = tmpdir.ensure('empty.tsv') + empty_document_corpus = annif.corpus.DocumentFile(str(empty_file)) + with pytest.raises(NotSupportedException) as excinfo: + project.train(empty_document_corpus) + assert 'using TfidfVectorizer with no documents' in str(excinfo.value) + + def test_project_learn(app, tmpdir): tmpdir.join('doc1.txt').write('doc1') tmpdir.join('doc1.tsv').write('\tkey1') From dc27476e26146b1f65da697332971dbe0c85dd6b Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Thu, 5 Sep 2019 17:41:12 +0300 Subject: [PATCH 12/16] Forgot to remove the VW test from CLI tests --- tests/projects.cfg | 10 ---------- tests/test_cli.py | 12 ------------ 2 files changed, 22 deletions(-) diff --git a/tests/projects.cfg b/tests/projects.cfg index c39152e12..eaccea36f 100644 --- a/tests/projects.cfg +++ b/tests/projects.cfg @@ -98,13 +98,3 @@ loss=hs limit=100 chunksize=24 vocab=yso-fi - -[vw-multi-fi] -name=vw_multi Finnish -language=fi -backend=vw_multi -analyzer=snowball(finnish) -bit_precision=1 -limit=100 -chunksize=2 -vocab=yso-fi diff --git a/tests/test_cli.py b/tests/test_cli.py index b85272a4e..7c36378fa 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -199,18 +199,6 @@ def test_train_no_path(caplog): assert 'Reading empty file' == caplog.records[0].message -def test_train_no_path_vw(caplog): - pytest.importorskip('annif.backend.vw_multi') - logger = annif.logger - logger.propagate = True - result = runner.invoke( - annif.cli.cli, [ - 'train', 'vw-multi-fi']) - assert not result.exception - assert result.exit_code == 0 - assert 'Reading empty file' == caplog.records[0].message - - def test_learn(testdatadir): docfile = os.path.join( os.path.dirname(__file__), From 237532e8b59442ce2ccf8252062b7becdf4f57fa Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 18 Sep 2019 13:21:59 +0300 Subject: [PATCH 13/16] Raise NotSupportedException on training PAV without documents --- annif/backend/pav.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/annif/backend/pav.py b/annif/backend/pav.py index 191719a38..3dd5ae953 100644 --- a/annif/backend/pav.py +++ b/annif/backend/pav.py @@ -11,7 +11,7 @@ import annif.suggestion import annif.project import annif.util -from annif.exception import NotInitializedException +from annif.exception import NotInitializedException, NotSupportedException from . import ensemble @@ -96,6 +96,9 @@ def _create_pav_model(self, source_project_id, min_docs, corpus): method=joblib.dump) def train(self, corpus, project): + if corpus.are_documents_empty(): + raise NotSupportedException('training backend {} with no documents' + .format(self.backend_id)) self.info("creating PAV models") sources = annif.util.parse_sources(self.params['sources']) min_docs = int(self.params['min-docs']) From 6887fbd98acdd68522d517233ea107369240d33f Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 18 Sep 2019 13:22:35 +0300 Subject: [PATCH 14/16] Test raise NotSupportedException on training PAV without documents --- tests/test_backend_pav.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_backend_pav.py b/tests/test_backend_pav.py index eb90fe256..36aaed11b 100644 --- a/tests/test_backend_pav.py +++ b/tests/test_backend_pav.py @@ -1,7 +1,9 @@ """Unit tests for the PAV backend in Annif""" +import pytest import annif.backend import annif.corpus +from annif.exception import NotSupportedException def test_pav_train(app, datadir, tmpdir, project): @@ -23,6 +25,21 @@ def test_pav_train(app, datadir, tmpdir, project): assert datadir.join('pav-model-dummy-fi').size() > 0 +def test_pav_train_nodocuments(tmpdir, datadir, project): + pav_type = annif.backend.get_backend("pav") + pav = pav_type( + backend_id='pav', + params={'limit': 50, 'min-docs': 2, 'sources': 'dummy-fi'}, + datadir=str(datadir)) + + empty_file = tmpdir.ensure('empty.tsv') + empty_document_corpus = annif.corpus.DocumentFile(str(empty_file)) + + with pytest.raises(NotSupportedException) as excinfo: + pav.train(empty_document_corpus, project) + assert 'training backend pav with no documents' in str(excinfo.value) + + def test_pav_initialize(app, datadir): pav_type = annif.backend.get_backend("pav") pav = pav_type( From 08463a1b527c277a7af51a91433944e6d35399e3 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Mon, 23 Sep 2019 16:28:41 +0300 Subject: [PATCH 15/16] Rename corpus emptiness check --- annif/backend/fasttext.py | 2 +- annif/backend/pav.py | 2 +- annif/corpus/types.py | 2 +- annif/project.py | 2 +- tests/test_corpus.py | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/annif/backend/fasttext.py b/annif/backend/fasttext.py index 5028f57c5..9cd52d544 100644 --- a/annif/backend/fasttext.py +++ b/annif/backend/fasttext.py @@ -104,7 +104,7 @@ def _create_model(self): self._model.save_model(modelpath) def train(self, corpus, project): - if corpus.are_documents_empty(): + if corpus.is_empty(): raise NotSupportedException('training backend {} with no documents' .format(self.backend_id)) self._create_train_file(corpus, project) diff --git a/annif/backend/pav.py b/annif/backend/pav.py index 3dd5ae953..407f430a3 100644 --- a/annif/backend/pav.py +++ b/annif/backend/pav.py @@ -96,7 +96,7 @@ def _create_pav_model(self, source_project_id, min_docs, corpus): method=joblib.dump) def train(self, corpus, project): - if corpus.are_documents_empty(): + if corpus.is_empty(): raise NotSupportedException('training backend {} with no documents' .format(self.backend_id)) self.info("creating PAV models") diff --git a/annif/corpus/types.py b/annif/corpus/types.py index e3b3c4f25..9e7dcfb2d 100644 --- a/annif/corpus/types.py +++ b/annif/corpus/types.py @@ -37,7 +37,7 @@ def _create_document(self, text, uris, labels): return Document(text=text, uris=uris, labels=labels) - def are_documents_empty(self): + def is_empty(self): """Check if there are no documents to iterate.""" try: next(self.documents) diff --git a/annif/project.py b/annif/project.py index 1e5984e01..34af903dd 100644 --- a/annif/project.py +++ b/annif/project.py @@ -193,7 +193,7 @@ def _create_vectorizer(self, subjectcorpus): if not self.backend.needs_subject_vectorizer: logger.debug('not creating vectorizer: not needed by backend') return - if subjectcorpus.are_documents_empty(): + if subjectcorpus.is_empty(): raise NotSupportedException( 'using TfidfVectorizer with no documents') logger.info('creating vectorizer') diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 260c51402..4fecf04b2 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -246,7 +246,7 @@ def test_docfile_gzipped(tmpdir): assert len(list(docs.documents)) == 3 -def test_docfile_are_documents_empty(tmpdir): +def test_docfile_is_empty(tmpdir): empty_file = tmpdir.ensure('empty.tsv') docs = annif.corpus.DocumentFile(str(empty_file)) - assert docs.are_documents_empty() + assert docs.is_empty() From af8c54d63bd0d08ea59150cf4588a524bf93ab51 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Mon, 23 Sep 2019 16:33:00 +0300 Subject: [PATCH 16/16] Avoid Scrutinizer issue about undef var for execution paths --- annif/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/cli.py b/annif/cli.py index 2c13dbc6d..9156d617a 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -51,7 +51,7 @@ def open_doc_path(path): docs = open_doc_path(os.path.devnull) elif len(paths) == 1: docs = open_doc_path(paths[0]) - elif len(paths) > 1: + else: corpora = [open_doc_path(path) for path in paths] docs = annif.corpus.CombinedCorpus(corpora) return docs