From a60c62f064749c44082d6e2dfb2e7917ce64dc52 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 6 Feb 2019 16:31:26 +0200 Subject: [PATCH 01/12] Initial support for online learning in vw_multi backend. Part of #225 and #230 --- annif/backend/backend.py | 4 ++++ annif/backend/vw_multi.py | 14 ++++++++++++-- tests/test_backend_vw_multi.py | 12 +++++++++++- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/annif/backend/backend.py b/annif/backend/backend.py index 60fbb966b..b959275fe 100644 --- a/annif/backend/backend.py +++ b/annif/backend/backend.py @@ -33,6 +33,10 @@ def train(self, corpus, project): """train the model on the given document or subject corpus""" pass # default is to do nothing, subclasses may override + def learn(self, corpus, project): + """further train the model on the given document or subject corpus""" + pass # default is to do nothing, subclasses may override + def initialize(self): """This method can be overridden by backends. It should cause the backend to pre-load all data it needs during operation.""" diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index 1f6d47961..8b66d137f 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -129,8 +129,7 @@ def _inputs_to_exampletext(self, project, text): return ' '.join(['|{} {}'.format(namespace, featurestr) for namespace, featurestr in namespaces.items()]) - def _create_train_file(self, corpus, project): - self.info('creating VW train file') + def _create_examples(self, corpus, project): examples = [] for doc in corpus.documents: text = self._inputs_to_exampletext(project, doc.text) @@ -138,6 +137,11 @@ def _create_train_file(self, corpus, project): continue examples.extend(self._format_examples(project, text, doc.uris)) random.shuffle(examples) + return examples + + def _create_train_file(self, corpus, project): + self.info('creating VW train file') + examples = self._create_examples(corpus, project) annif.util.atomic_save(examples, self._get_datadir(), self.TRAIN_FILE, @@ -184,6 +188,12 @@ def train(self, corpus, project): self._create_train_file(corpus, project) self._create_model(project) + def learn(self, corpus, project): + for example in self._create_examples(corpus, project): + self._model.learn(example) + modelpath = os.path.join(self._get_datadir(), self.MODEL_FILE) + self._model.save(modelpath) + def _convert_result(self, result, project): if self.algorithm == 'multilabel_oaa': # result is a list of subject IDs - need to vectorize diff --git a/tests/test_backend_vw_multi.py b/tests/test_backend_vw_multi.py index 359fcbdf7..5812b36ac 100644 --- a/tests/test_backend_vw_multi.py +++ b/tests/test_backend_vw_multi.py @@ -29,7 +29,7 @@ def test_vw_multi_analyze_no_model(datadir, project): results = vw.analyze("example text", project) -def test_vw_multi_train(datadir, document_corpus, project): +def test_vw_multi_train_and_learn(datadir, document_corpus, project): vw_type = annif.backend.get_backend('vw_multi') vw = vw_type( backend_id='vw_multi', @@ -44,6 +44,16 @@ def test_vw_multi_train(datadir, document_corpus, project): assert datadir.join('vw-model').exists() assert datadir.join('vw-model').size() > 0 + # test online learning + modelfile = datadir.join('vw-model') + + old_size = modelfile.size() + old_mtime = modelfile.mtime() + + vw.learn(document_corpus, project) + + assert modelfile.size() != old_size or modelfile.mtime() != old_mtime + def test_vw_multi_train_from_project(app, datadir, document_corpus, project): vw_type = 
annif.backend.get_backend('vw_multi') From 84c959946367264535c97be9a3d6e5c14a07cfeb Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 8 Feb 2019 16:50:01 +0200 Subject: [PATCH 02/12] Add learn method to AnnifProject, with test --- annif/project.py | 6 ++++++ tests/test_project.py | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/annif/project.py b/annif/project.py index 893e37932..eae3e1be6 100644 --- a/annif/project.py +++ b/annif/project.py @@ -202,6 +202,12 @@ def train(self, corpus): self._create_vectorizer(corpus) self.backend.train(corpus, project=self) + def learn(self, corpus): + """further train the project using documents from a metadata source""" + + corpus.set_subject_index(self.subjects) + self.backend.train(corpus, project=self) + def dump(self): """return this project as a dict""" return {'project_id': self.project_id, diff --git a/tests/test_project.py b/tests/test_project.py index 08c473d84..5e6dfe531 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -92,6 +92,15 @@ def test_project_train_tfidf(app, document_corpus, testdatadir): assert testdatadir.join('projects/tfidf-fi/tfidf-index').size() > 0 +def test_project_learn_tfidf(app, document_corpus, testdatadir): + with app.app_context(): + project = annif.project.get_project('tfidf-fi') + + project.learn(document_corpus) + # Should assert that the index file changed, but this is not really + # implemented in the tfidf backend yet + + def test_project_load_vocabulary_fasttext(app, vocabulary, testdatadir): pytest.importorskip("annif.backend.fasttext") with app.app_context(): From c8266cd697b1e5a71ff1625f4b37e7943d716294 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 8 Feb 2019 16:52:59 +0200 Subject: [PATCH 03/12] Add learn CLI command --- annif/cli.py | 13 +++++++++++++ tests/test_cli.py | 11 +++++++++++ 2 files changed, 24 insertions(+) diff --git a/annif/cli.py b/annif/cli.py index 3a590904b..d97775a57 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -134,6 +134,19 @@ def run_train(project_id, paths): proj.train(documents) +@cli.command('learn') +@click_log.simple_verbosity_option(logger) +@click.argument('project_id') +@click.argument('paths', type=click.Path(), nargs=-1) +def run_learn(project_id, paths): + """ + Further train an existing project on a collection of documents. + """ + proj = get_project(project_id) + documents = open_documents(paths) + proj.learn(documents) + + @cli.command('analyze') @click_log.simple_verbosity_option(logger) @click.argument('project_id') diff --git a/tests/test_cli.py b/tests/test_cli.py index 5385bb9b4..e7db6d3f7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -126,6 +126,17 @@ def test_train_multiple(testdatadir): assert testdatadir.join('projects/tfidf-fi/tfidf-index').size() > 0 +def test_learn(testdatadir): + docfile = os.path.join( + os.path.dirname(__file__), + 'corpora', + 'archaeology', + 'documents.tsv') + result = runner.invoke(annif.cli.cli, ['learn', 'tfidf-fi', docfile]) + assert not result.exception + assert result.exit_code == 0 + + def test_analyze(): result = runner.invoke( annif.cli.cli, From 5b9d7df451343c32c33151dabc36680701e4f61e Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 8 Feb 2019 17:26:57 +0200 Subject: [PATCH 04/12] Define REST API learn method. 
Implementation still missing conversion of input --- annif/rest.py | 18 ++++++++++++ annif/swagger/annif.yaml | 63 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 76 insertions(+), 5 deletions(-) diff --git a/annif/rest.py b/annif/rest.py index ce13cea38..da8904942 100644 --- a/annif/rest.py +++ b/annif/rest.py @@ -64,3 +64,21 @@ def analyze(project_id, text, limit, threshold): return server_error(err) hits = hit_filter(result) return {'results': [hit._asdict() for hit in hits]} + + +def learn(project_id, documents): + """learn from documents and return an empty 204 response if succesful""" + + try: + project = annif.project.get_project( + project_id, min_access=Access.hidden) + except ValueError: + return project_not_found_error(project_id) + +# TODO should convert the documents to a corpus object +# try: +# project.learn(documents) +# except AnnifException as err: +# return server_error(err) + + return None, 204 diff --git a/annif/swagger/annif.yaml b/annif/swagger/annif.yaml index a53cb9d83..4738c4a40 100644 --- a/annif/swagger/annif.yaml +++ b/annif/swagger/annif.yaml @@ -90,6 +90,38 @@ paths: $ref: '#/definitions/Problem' tags: - Automatic subject indexing + '/projects/{project_id}/learn': + post: + summary: learn from manually indexed documents + operationId: annif.rest.learn + consumes: + - application/json + produces: + - application/json + - application/problem+json + parameters: + - $ref: '#/parameters/project_id' + - name: documents + in: body + description: documents to learn from + required: true + schema: + type: array + items: + $ref: '#/definitions/IndexedDocument' + responses: + '204': + description: successful operation + '404': + description: Project not found + schema: + $ref: '#/definitions/Problem' + '503': + description: Service Unavailable + schema: + $ref: '#/definitions/Problem' + tags: + - Learning from feedback definitions: ProjectBackend: description: A backend of a project @@ -133,7 +165,7 @@ definitions: example: 'http://example.org/subject1' label: type: string - example: 'Archaeology' + example: Archaeology score: type: number example: 0.85 @@ -148,6 +180,23 @@ definitions: type: array items: $ref: '#/definitions/AnalysisResult' + IndexedDocument: + description: A document with attached, known good subjects + properties: + text: + type: string + example: "A quick brown fox jumped over the lazy dog." + subjects: + type: array + items: + type: object + properties: + uri: + type: string + example: 'http://example.org/subject1' + label: + type: string + example: 'Vulpes vulpes' Problem: type: object properties: @@ -169,8 +218,10 @@ definitions: status: type: integer format: int32 - description: | - The HTTP status code generated by the origin server for this occurrence + description: > + The HTTP status code generated by the origin server for this + occurrence + of the problem. minimum: 100 maximum: 600 @@ -185,6 +236,8 @@ definitions: instance: type: string format: uri - description: | - An absolute URI that identifies the specific occurrence of the problem. + description: > + An absolute URI that identifies the specific occurrence of the + problem. + It may or may not yield further information if dereferenced. 
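
For reference, the learn endpoint defined in this patch can be exercised with a short client snippet along these lines. This is only a sketch: it assumes a local Annif instance serving this API at http://localhost:5000/v1 with an existing 'dummy-en' project; neither the host nor the base path is part of this patch, and the payload simply follows the IndexedDocument schema above.

import requests

# Array of IndexedDocument objects, matching the schema examples in annif.yaml
documents = [{'text': 'A quick brown fox jumped over the lazy dog.',
              'subjects': [{'uri': 'http://example.org/subject1',
                            'label': 'Vulpes vulpes'}]}]

# POST to the learn operation; per the responses defined above, a successful
# call returns 204 with no body.
resp = requests.post('http://localhost:5000/v1/projects/dummy-en/learn',
                     json=documents)
assert resp.status_code == 204
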
From e6e2d1775a1f8ed04c701f73c41a21def3ca4251 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Thu, 14 Feb 2019 13:24:05 +0200 Subject: [PATCH 05/12] adapt to datadir changes on master branch --- annif/backend/vw_multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index 842f611f7..ec77e4c4d 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -191,7 +191,7 @@ def train(self, corpus, project): def learn(self, corpus, project): for example in self._create_examples(corpus, project): self._model.learn(example) - modelpath = os.path.join(self._get_datadir(), self.MODEL_FILE) + modelpath = os.path.join(self.datadir, self.MODEL_FILE) self._model.save(modelpath) def _convert_result(self, result, project): From bdef18f783624569179b9e40983c1a63004ba57a Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Thu, 14 Feb 2019 13:32:50 +0200 Subject: [PATCH 06/12] Split off initializing SubjectSet from string into a separate classmethod --- annif/corpus/document.py | 2 +- annif/corpus/subject.py | 26 ++++++++++++-------------- tests/test_corpus.py | 4 ++-- tests/test_eval.py | 2 +- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/annif/corpus/document.py b/annif/corpus/document.py index 44e6f822d..4c884fde3 100644 --- a/annif/corpus/document.py +++ b/annif/corpus/document.py @@ -40,7 +40,7 @@ def documents(self): with open(docfilename, errors='replace') as docfile: text = docfile.read() with open(keyfilename) as keyfile: - subjects = SubjectSet(keyfile.read()) + subjects = SubjectSet.from_string(keyfile.read()) yield Document(text=text, uris=subjects.subject_uris, labels=subjects.subject_labels) diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index d91e4d0df..5bb68105f 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -98,22 +98,20 @@ def load(cls, path): class SubjectSet: """Represents a set of subjects for a document.""" - def __init__(self, subj_data): - """initialize a SubjectSet from either a string representation or a - tuple (URIs, labels)""" - - if isinstance(subj_data, str): - self.subject_uris = set() - self.subject_labels = set() - self._parse(subj_data) - else: - uris, labels = subj_data - self.subject_uris = set(uris) - self.subject_labels = set(labels) + def __init__(self, subj_data=None): + """Create a SubjectSet and optionally initialize it from a tuple + (URIs, labels)""" + + uris, labels = subj_data or ([], []) + self.subject_uris = set(uris) + self.subject_labels = set(labels) - def _parse(self, subj_data): + @classmethod + def from_string(cls, subj_data): + sset = cls() for line in subj_data.splitlines(): - self._parse_line(line) + sset._parse_line(line) + return sset def _parse_line(self, line): vals = line.split("\t") diff --git a/tests/test_corpus.py b/tests/test_corpus.py index b2331e922..ac7f0c755 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -9,7 +9,7 @@ def test_subjectset_uris(): \tanother """ - sset = annif.corpus.SubjectSet(data) + sset = annif.corpus.SubjectSet.from_string(data) assert sset.has_uris() assert len(sset.subject_uris) == 2 assert "http://example.org/dummy" in sset.subject_uris @@ -21,7 +21,7 @@ def test_subjectset_labels(): another """ - sset = annif.corpus.SubjectSet(data) + sset = annif.corpus.SubjectSet.from_string(data) assert not sset.has_uris() assert len(sset.subject_labels) == 2 assert "dummy" in sset.subject_labels diff --git a/tests/test_eval.py b/tests/test_eval.py index 
411dc7fcf..0de8524b0 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -90,7 +90,7 @@ def test_ndcg_empty2(): def test_evaluation_batch(subject_index): batch = annif.eval.EvaluationBatch(subject_index) - gold_set = annif.corpus.SubjectSet( + gold_set = annif.corpus.SubjectSet.from_string( '\tarkeologit') hits1 = annif.hit.ListAnalysisResult([ annif.hit.AnalysisHit( From 3ce498b3ed92fdcff1d4fa8f1353a5a7f7e99eef Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Thu, 14 Feb 2019 15:12:58 +0200 Subject: [PATCH 07/12] Implement learning in REST API by converting the input to a DocumentCorpus --- annif/corpus/__init__.py | 3 ++- annif/corpus/document.py | 12 ++++++++++++ annif/rest.py | 15 ++++++++++----- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/annif/corpus/__init__.py b/annif/corpus/__init__.py index 68fa3ced9..9f749f6cf 100644 --- a/annif/corpus/__init__.py +++ b/annif/corpus/__init__.py @@ -1,8 +1,9 @@ """Annif corpus operations""" -from .document import DocumentDirectory, DocumentFile +from .document import DocumentDirectory, DocumentFile, DocumentList from .subject import Subject, SubjectDirectory, SubjectFileTSV from .subject import SubjectIndex, SubjectSet from .skos import SubjectFileSKOS +from .types import Document from .combine import CombinedCorpus diff --git a/annif/corpus/document.py b/annif/corpus/document.py index 4c884fde3..dc8facb65 100644 --- a/annif/corpus/document.py +++ b/annif/corpus/document.py @@ -66,3 +66,15 @@ def opener(path): subjects = [annif.util.cleanup_uri(uri) for uri in uris.split()] yield Document(text=text, uris=subjects, labels=[]) + + +class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin): + """A document corpus based on a list of other iterable of Document + objects""" + + def __init__(self, documents): + self._documents = documents + + @property + def documents(self): + yield from self._documents diff --git a/annif/rest.py b/annif/rest.py index da8904942..8672fdca0 100644 --- a/annif/rest.py +++ b/annif/rest.py @@ -3,6 +3,7 @@ import connexion import annif.project +from annif.corpus import Document, DocumentList from annif.hit import HitFilter from annif.exception import AnnifException from annif.project import Access @@ -75,10 +76,14 @@ def learn(project_id, documents): except ValueError: return project_not_found_error(project_id) -# TODO should convert the documents to a corpus object -# try: -# project.learn(documents) -# except AnnifException as err: -# return server_error(err) + corpus = [Document(text=d['text'], + uris=[subj['uri'] for subj in d['subjects']], + labels=[subj['label'] for subj in d['subjects']]) + for d in documents + if 'text' in d and 'subjects' in d] + try: + project.learn(DocumentList(corpus)) + except AnnifException as err: + return server_error(err) return None, 204 From 972a285262e677eec36146bfc7b21ab678533d86 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 27 Feb 2019 10:51:06 +0200 Subject: [PATCH 08/12] Split AnnifLearningBackend into separate abstract class + add some learning tests --- annif/backend/backend.py | 13 +++++++++---- annif/backend/dummy.py | 19 ++++++++++++++++--- annif/backend/vw_multi.py | 2 +- annif/project.py | 2 +- tests/test_backend.py | 21 +++++++++++++++++++++ tests/test_cli.py | 2 +- tests/test_project.py | 20 ++++++++++++++------ tests/test_rest.py | 32 +++++++++++++++++++++++++++++++- 8 files changed, 94 insertions(+), 17 deletions(-) diff --git a/annif/backend/backend.py b/annif/backend/backend.py index b3e2e520b..5dec8708f 100644 --- 
a/annif/backend/backend.py +++ b/annif/backend/backend.py @@ -24,10 +24,6 @@ def train(self, corpus, project): """train the model on the given document or subject corpus""" pass # default is to do nothing, subclasses may override - def learn(self, corpus, project): - """further train the model on the given document or subject corpus""" - pass # default is to do nothing, subclasses may override - def initialize(self): """This method can be overridden by backends. It should cause the backend to pre-load all data it needs during operation.""" @@ -58,3 +54,12 @@ def info(self, message): def warning(self, message): """Log a warning message from this backend""" logger.warning("Backend {}: {}".format(self.backend_id, message)) + + +class AnnifLearningBackend(AnnifBackend): + """Base class for Annif backends that can perform online learning""" + + @abc.abstractmethod + def learn(self, corpus, project): + """further train the model on the given document or subject corpus""" + pass # pragma: no cover diff --git a/annif/backend/dummy.py b/annif/backend/dummy.py index 6f748c8fa..2f3cee660 100644 --- a/annif/backend/dummy.py +++ b/annif/backend/dummy.py @@ -5,15 +5,28 @@ from . import backend -class DummyBackend(backend.AnnifBackend): +class DummyBackend(backend.AnnifLearningBackend): name = "dummy" initialized = False + uri = 'http://example.org/dummy' + label = 'dummy' def initialize(self): self.initialized = True def _analyze(self, text, project, params): score = float(params.get('score', 1.0)) - return ListAnalysisResult([AnalysisHit(uri='http://example.org/dummy', - label='dummy', score=score)], + return ListAnalysisResult([AnalysisHit(uri=self.uri, + label=self.label, + score=score)], project.subjects) + + def learn(self, corpus, project): + # in this dummy backend we "learn" by picking up the URI and label + # of the first subject of the first document in the learning set + # and using that in subsequent analysis results + for doc in corpus.documents: + if doc.uris and doc.labels: + self.uri = list(doc.uris)[0] + self.label = list(doc.labels)[0] + break diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index ec77e4c4d..310ac609d 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -12,7 +12,7 @@ from . 
import mixins -class VWMultiBackend(mixins.ChunkingBackend, backend.AnnifBackend): +class VWMultiBackend(mixins.ChunkingBackend, backend.AnnifLearningBackend): """Vorpal Wabbit multiclass/multilabel backend for Annif""" name = "vw_multi" diff --git a/annif/project.py b/annif/project.py index 630492f3d..4c6685536 100644 --- a/annif/project.py +++ b/annif/project.py @@ -200,7 +200,7 @@ def learn(self, corpus): """further train the project using documents from a metadata source""" corpus.set_subject_index(self.subjects) - self.backend.train(corpus, project=self) + self.backend.learn(corpus, project=self) def dump(self): """return this project as a dict""" diff --git a/tests/test_backend.py b/tests/test_backend.py index 68496d025..0b1073a9b 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -4,6 +4,7 @@ import pytest import annif import annif.backend +import annif.corpus def test_get_backend_nonexistent(): @@ -20,3 +21,23 @@ def test_get_backend_dummy(app, project): assert result[0].uri == 'http://example.org/dummy' assert result[0].label == 'dummy' assert result[0].score == 1.0 + + +def test_learn_dummy(app, project, tmpdir): + dummy_type = annif.backend.get_backend("dummy") + dummy = dummy_type(backend_id='dummy', params={}, + datadir=app.config['DATADIR']) + + tmpdir.join('doc1.txt').write('doc1') + tmpdir.join('doc1.tsv').write('\tkey1') + tmpdir.join('doc2.txt').write('doc2') + tmpdir.join('doc2.tsv').write('\tkey2') + docdir = annif.corpus.DocumentDirectory(str(tmpdir)) + + dummy.learn(docdir, project) + + result = dummy.analyze(text='this is some text', project=project) + assert len(result) == 1 + assert result[0].uri == 'http://example.org/key1' + assert result[0].label == 'key1' + assert result[0].score == 1.0 diff --git a/tests/test_cli.py b/tests/test_cli.py index e7db6d3f7..5b9916d37 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -132,7 +132,7 @@ def test_learn(testdatadir): 'corpora', 'archaeology', 'documents.tsv') - result = runner.invoke(annif.cli.cli, ['learn', 'tfidf-fi', docfile]) + result = runner.invoke(annif.cli.cli, ['learn', 'dummy-fi', docfile]) assert not result.exception assert result.exit_code == 0 diff --git a/tests/test_project.py b/tests/test_project.py index 5e6dfe531..ad4dae86d 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -92,13 +92,21 @@ def test_project_train_tfidf(app, document_corpus, testdatadir): assert testdatadir.join('projects/tfidf-fi/tfidf-index').size() > 0 -def test_project_learn_tfidf(app, document_corpus, testdatadir): - with app.app_context(): - project = annif.project.get_project('tfidf-fi') +def test_project_learn(app, tmpdir): + tmpdir.join('doc1.txt').write('doc1') + tmpdir.join('doc1.tsv').write('\tkey1') + tmpdir.join('doc2.txt').write('doc2') + tmpdir.join('doc2.tsv').write('\tkey2') + docdir = annif.corpus.DocumentDirectory(str(tmpdir)) - project.learn(document_corpus) - # Should assert that the index file changed, but this is not really - # implemented in the tfidf backend yet + with app.app_context(): + project = annif.project.get_project('dummy-fi') + project.learn(docdir) + result = project.analyze('this is some text') + assert len(result) == 1 + assert result[0].uri == 'http://example.org/key1' + assert result[0].label == 'key1' + assert result[0].score == 1.0 def test_project_load_vocabulary_fasttext(app, vocabulary, testdatadir): diff --git a/tests/test_rest.py b/tests/test_rest.py index 0aa657774..135125738 100644 --- a/tests/test_rest.py +++ b/tests/test_rest.py @@ -88,7 +88,7 @@ 
def test_rest_analyze_nonexistent(app): assert result.status_code == 404 -def test_rest_novocab(app): +def test_rest_analyze_novocab(app): with app.app_context(): result = annif.rest.analyze( 'novocab', @@ -96,3 +96,33 @@ def test_rest_novocab(app): limit=10, threshold=0.0) assert result.status_code == 503 + + +def test_rest_learn_empty(app): + with app.app_context(): + response = annif.rest.learn('dummy-en', []) + assert response == (None, 204) # success, no output + + +def test_rest_learn(app): + documents = [{'text': 'the quick brown fox', + 'subjects': [{'uri': 'http://example.org/fox', + 'label': 'fox'}]}] + with app.app_context(): + response = annif.rest.learn('dummy-en', documents) + assert response == (None, 204) # success, no output + + result = annif.rest.analyze( + 'dummy-en', + text='example text', + limit=10, + threshold=0.0) + assert 'results' in result + assert result['results'][0]['uri'] == 'http://example.org/fox' + assert result['results'][0]['label'] == 'fox' + + +def test_rest_learn_novocab(app): + with app.app_context(): + result = annif.rest.learn('novocab', []) + assert result.status_code == 503 From 01fa0b55270a1627fa7e4d7af2996c7e313a5fdd Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 27 Feb 2019 11:00:59 +0200 Subject: [PATCH 09/12] add unit test for learn on nonexistent vocabulary --- tests/test_rest.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_rest.py b/tests/test_rest.py index 135125738..633867860 100644 --- a/tests/test_rest.py +++ b/tests/test_rest.py @@ -126,3 +126,9 @@ def test_rest_learn_novocab(app): with app.app_context(): result = annif.rest.learn('novocab', []) assert result.status_code == 503 + + +def test_rest_learn_nonexistent(app): + with app.app_context(): + result = annif.rest.learn('nonexistent', []) + assert result.status_code == 404 From 16c0959f608cb42a2ebcde3f23410f15119449af Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 27 Feb 2019 13:24:11 +0200 Subject: [PATCH 10/12] REST handling of backends that don't support learning --- annif/exception.py | 7 +++++++ annif/project.py | 10 ++++++++-- tests/test_project.py | 15 ++++++++++++++- tests/test_rest.py | 6 ++++++ 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/annif/exception.py b/annif/exception.py index 10d92a73e..867eb2beb 100644 --- a/annif/exception.py +++ b/annif/exception.py @@ -46,3 +46,10 @@ class ConfigurationException(AnnifException): """Exception raised when a project or backend is misconfigured.""" prefix = "Misconfigured" + + +class NotSupportedException(AnnifException): + """Exception raised when an operation is not supported by a project or + backend.""" + + prefix = "Not supported" diff --git a/annif/project.py b/annif/project.py index 4c6685536..4136ed8a7 100644 --- a/annif/project.py +++ b/annif/project.py @@ -16,7 +16,7 @@ import annif.vocab from annif.datadir import DatadirMixin from annif.exception import AnnifException, ConfigurationException, \ - NotInitializedException + NotInitializedException, NotSupportedException logger = annif.logger @@ -200,7 +200,13 @@ def learn(self, corpus): """further train the project using documents from a metadata source""" corpus.set_subject_index(self.subjects) - self.backend.learn(corpus, project=self) + if isinstance( + self.backend, + annif.backend.backend.AnnifLearningBackend): + self.backend.learn(corpus, project=self) + else: + raise NotSupportedException("Learning not supported by backend", + project_id=self.project_id) def dump(self): """return this project as a dict""" 
diff --git a/tests/test_project.py b/tests/test_project.py index ad4dae86d..879118228 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -4,7 +4,7 @@ import pytest import annif.project import annif.backend.dummy -from annif.exception import ConfigurationException +from annif.exception import ConfigurationException, NotSupportedException from annif.project import Access @@ -109,6 +109,19 @@ def test_project_learn(app, tmpdir): assert result[0].score == 1.0 +def test_project_learn_not_supported(app, tmpdir): + tmpdir.join('doc1.txt').write('doc1') + tmpdir.join('doc1.tsv').write('\tkey1') + tmpdir.join('doc2.txt').write('doc2') + tmpdir.join('doc2.tsv').write('\tkey2') + docdir = annif.corpus.DocumentDirectory(str(tmpdir)) + + with app.app_context(): + project = annif.project.get_project('tfidf-fi') + with pytest.raises(NotSupportedException): + project.learn(docdir) + + def test_project_load_vocabulary_fasttext(app, vocabulary, testdatadir): pytest.importorskip("annif.backend.fasttext") with app.app_context(): diff --git a/tests/test_rest.py b/tests/test_rest.py index 633867860..4ecc57cc4 100644 --- a/tests/test_rest.py +++ b/tests/test_rest.py @@ -132,3 +132,9 @@ def test_rest_learn_nonexistent(app): with app.app_context(): result = annif.rest.learn('nonexistent', []) assert result.status_code == 404 + + +def test_rest_learn_not_supported(app): + with app.app_context(): + result = annif.rest.learn('tfidf-fi', []) + assert result.status_code == 503 From 31577837c8113ec05bc38b8e64f16750b2c68f21 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 27 Feb 2019 13:26:27 +0200 Subject: [PATCH 11/12] CLI unit test for trying to learn when backend doesn't support it --- tests/test_cli.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index 5b9916d37..4d55a0edb 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -137,6 +137,17 @@ def test_learn(testdatadir): assert result.exit_code == 0 +def test_learn_notsupported(testdatadir): + docfile = os.path.join( + os.path.dirname(__file__), + 'corpora', + 'archaeology', + 'documents.tsv') + result = runner.invoke(annif.cli.cli, ['learn', 'tfidf-fi', docfile]) + assert result.exit_code != 0 + assert 'Learning not supported' in result.output + + def test_analyze(): result = runner.invoke( annif.cli.cli, From 7624d21f4a8223ac5aac6f09cbee59e1ec4a3d9b Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Wed, 27 Feb 2019 13:40:32 +0200 Subject: [PATCH 12/12] refactor: split off JSON input to document corpus conversion in rest module --- annif/rest.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/annif/rest.py b/annif/rest.py index 8672fdca0..8a038bdb0 100644 --- a/annif/rest.py +++ b/annif/rest.py @@ -67,6 +67,15 @@ def analyze(project_id, text, limit, threshold): return {'results': [hit._asdict() for hit in hits]} +def _documents_to_corpus(documents): + corpus = [Document(text=d['text'], + uris=[subj['uri'] for subj in d['subjects']], + labels=[subj['label'] for subj in d['subjects']]) + for d in documents + if 'text' in d and 'subjects' in d] + return DocumentList(corpus) + + def learn(project_id, documents): """learn from documents and return an empty 204 response if succesful""" @@ -76,13 +85,10 @@ def learn(project_id, documents): except ValueError: return project_not_found_error(project_id) - corpus = [Document(text=d['text'], - uris=[subj['uri'] for subj in d['subjects']], - labels=[subj['label'] for subj in d['subjects']]) - for d in 
documents - if 'text' in d and 'subjects' in d] + corpus = _documents_to_corpus(documents) + try: - project.learn(DocumentList(corpus)) + project.learn(corpus) except AnnifException as err: return server_error(err)