From 794bfb568533ce4f3023e17a6db20aa41b6777be Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Mon, 4 Feb 2019 14:19:54 +0200
Subject: [PATCH 1/9] Move text normalization outside chunking method

---
 annif/backend/fasttext.py | 11 ++++++++++-
 annif/backend/mixins.py   |  5 +----
 annif/backend/vw_multi.py |  5 ++++-
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/annif/backend/fasttext.py b/annif/backend/fasttext.py
index bf3abe719..609664a49 100644
--- a/annif/backend/fasttext.py
+++ b/annif/backend/fasttext.py
@@ -107,9 +107,18 @@ def train(self, corpus, project):
         self._create_train_file(corpus, project)
         self._create_model()
 
+    def _predict_chunks(self, chunktexts, project, limit):
+        normalized_chunks = []
+        for chunktext in chunktexts:
+            normalized = self._normalize_text(project, chunktext)
+            if normalized != '':
+                normalized_chunks.append(normalized)
+        return self._model.predict(normalized_chunks, limit)
+
     def _analyze_chunks(self, chunktexts, project):
         limit = int(self.params['limit'])
-        chunklabels, chunkscores = self._model.predict(chunktexts, limit)
+        chunklabels, chunkscores = self._predict_chunks(
+            chunktexts, project, limit)
         label_scores = collections.defaultdict(float)
         for labels, scores in zip(chunklabels, chunkscores):
             for label, score in zip(labels, scores):
diff --git a/annif/backend/mixins.py b/annif/backend/mixins.py
index f109ac9b5..f89aa5f1c 100644
--- a/annif/backend/mixins.py
+++ b/annif/backend/mixins.py
@@ -24,10 +24,7 @@ def _analyze(self, text, project, params):
         chunksize = int(params['chunksize'])
         chunktexts = []
         for i in range(0, len(sentences), chunksize):
-            chunktext = ' '.join(sentences[i:i + chunksize])
-            normalized = self._normalize_text(project, chunktext)
-            if normalized != '':
-                chunktexts.append(normalized)
+            chunktexts.append(' '.join(sentences[i:i + chunksize]))
         self.debug('Split sentences into {} chunks'.format(len(chunktexts)))
         if len(chunktexts) == 0:  # nothing to analyze, empty result
             return ListAnalysisResult(hits=[], subject_index=project.subjects)
diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index ed97b32e3..738571956 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -152,7 +152,10 @@ def train(self, corpus, project):
     def _analyze_chunks(self, chunktexts, project):
         results = []
        for chunktext in chunktexts:
-            example = ' | {}'.format(chunktext)
+            normalized = self._normalize_text(project, chunktext)
+            if normalized == '':
+                continue
+            example = ' | {}'.format(normalized)
             result = self._model.predict(example)
             if self.algorithm == 'multilabel_oaa':
                 # result is a list of subject IDs - need to vectorize
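
PATCH 1/9 reorders the pipeline so that ChunkingBackend._analyze only splits
sentences into chunks, while each backend normalizes the chunk text itself
right before prediction (fastText in the new _predict_chunks, VW in
_analyze_chunks). A minimal sketch of the resulting chunk-then-normalize
order, using stand-in functions rather than Annif's real classes
(tokenize_words below approximates project.analyzer.tokenize_words):

    # Hypothetical sketch of the split introduced by PATCH 1/9.
    def make_chunks(sentences, chunksize):
        # mixins.py now only joins sentences; no normalization here
        return [' '.join(sentences[i:i + chunksize])
                for i in range(0, len(sentences), chunksize)]

    def normalize(text):
        # each backend normalizes just before predicting
        tokenize_words = str.split  # stand-in for the project analyzer
        return ' '.join(tokenize_words(text))

    chunks = make_chunks(['First sentence.', 'Second one.', 'Third.'], 2)
    normalized = [n for n in map(normalize, chunks) if n != '']

This keeps empty chunks away from the models while letting each backend
apply its own normalization rules.
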
From f77c4a98cb8a6c0b76b0fde602b65a856ff48e28 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Mon, 4 Feb 2019 16:13:49 +0200
Subject: [PATCH 2/9] Make it possible to use input from other
 projects/backends in vw_multi

---
 annif/backend/vw_multi.py | 56 ++++++++++++++++++++++++++++++++-------
 1 file changed, 46 insertions(+), 10 deletions(-)

diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index 738571956..9f6b286b6 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -6,7 +6,7 @@
 import annif.util
 from vowpalwabbit import pyvw
 import numpy as np
-from annif.hit import VectorAnalysisResult
+from annif.hit import ListAnalysisResult, VectorAnalysisResult
 from annif.exception import ConfigurationException, NotInitializedException
 from . import backend
 from . import mixins
@@ -23,7 +23,7 @@ class VWMultiBackend(mixins.ChunkingBackend, backend.AnnifBackend):
         # where allowed_values is either a type or a list of allowed values
         # and default_value may be None, to let VW decide by itself
         'bit_precision': (int, None),
-        'ngram': (int, None),
+        'ngram': (lambda x: '_{}'.format(int(x)), None),
         'learning_rate': (float, None),
         'loss_function': (['squared', 'logistic', 'hinge'], 'logistic'),
         'l1': (float, None),
@@ -35,6 +35,8 @@ class VWMultiBackend(mixins.ChunkingBackend, backend.AnnifBackend):
 
     DEFAULT_ALGORITHM = 'oaa'
     SUPPORTED_ALGORITHMS = ('oaa', 'ect', 'log_multi', 'multilabel_oaa')
 
+    DEFAULT_INPUTS = '_text_'
+
     MODEL_FILE = 'vw-model'
     TRAIN_FILE = 'vw-train.txt'
@@ -67,11 +69,20 @@ def algorithm(self):
             backend_id=self.backend_id)
         return algorithm
 
+    @property
+    def inputs(self):
+        inputs = self.params.get('inputs', self.DEFAULT_INPUTS)
+        return inputs.split(',')
+
+    @staticmethod
+    def _cleanup_text(text):
+        # colon and pipe chars have special meaning in VW and must be avoided
+        return text.replace(':', '').replace('|', '')
+
     @staticmethod
     def _normalize_text(project, text):
         ntext = ' '.join(project.analyzer.tokenize_words(text))
-        # colon and pipe chars have special meaning in VW and must be avoided
-        return ntext.replace(':', '').replace('|', '')
+        return VWMultiBackend._cleanup_text(ntext)
 
     @staticmethod
     def _write_train_file(examples, filename):
@@ -91,16 +102,39 @@ def _uris_to_subject_ids(project, uris):
     def _format_examples(self, project, text, uris):
         subject_ids = self._uris_to_subject_ids(project, uris)
         if self.algorithm == 'multilabel_oaa':
-            yield '{} | {}'.format(','.join(map(str, subject_ids)), text)
+            yield '{} {}'.format(','.join(map(str, subject_ids)), text)
         else:
             for subject_id in subject_ids:
-                yield '{} | {}'.format(subject_id + 1, text)
+                yield '{} {}'.format(subject_id + 1, text)
+
+    def _inputs_to_exampletext(self, project, text):
+        namespaces = {}
+        for input in self.inputs:
+            if input == '_text_':
+                normalized = self._normalize_text(project, text)
+                if normalized != '':
+                    namespaces['_text_'] = normalized
+            else:
+                proj = annif.project.get_project(input)
+                result = proj.analyze(text)
+                features = [
+                    '{}:{}'.format(
+                        self._cleanup_text(
+                            hit.uri),
+                        hit.score) for hit in result.hits]
+                namespaces[input] = ' '.join(features)
+        if not namespaces:
+            return None
+        return ' '.join(['|{} {}'.format(namespace, featurestr)
+                         for namespace, featurestr in namespaces.items()])
 
     def _create_train_file(self, corpus, project):
         self.info('creating VW train file')
         examples = []
         for doc in corpus.documents:
-            text = self._normalize_text(project, doc.text)
+            text = self._inputs_to_exampletext(project, doc.text)
+            if not text:
+                continue
             examples.extend(self._format_examples(project, text, doc.uris))
         random.shuffle(examples)
         annif.util.atomic_save(examples,
@@ -152,10 +186,10 @@ def train(self, corpus, project):
     def _analyze_chunks(self, chunktexts, project):
         results = []
         for chunktext in chunktexts:
-            normalized = self._normalize_text(project, chunktext)
-            if normalized == '':
+            exampletext = self._inputs_to_exampletext(project, chunktext)
+            if not exampletext:
                 continue
-            example = ' | {}'.format(normalized)
+            example = ' {}'.format(exampletext)
             result = self._model.predict(example)
             if self.algorithm == 'multilabel_oaa':
                 # result is a list of subject IDs - need to vectorize
@@ -170,5 +204,7 @@ def _analyze_chunks(self, chunktexts, project):
             else:
                 result = np.array(result)
             results.append(result)
+        if len(results) == 0:  # empty result
+            return ListAnalysisResult(hits=[], subject_index=project.subjects)
         return VectorAnalysisResult(
             np.array(results).mean(axis=0), project.subjects)
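
With PATCH 2/9 applied, a vw_multi backend can combine its own text
namespace with analysis results from other Annif projects via the new
inputs parameter. The hard-coded ' | ' separators disappear from
_format_examples because each namespace now carries its own |name prefix
inside the example text built by _inputs_to_exampletext. A rough
illustration of the strings this produces (the dummy-en project id is
borrowed from the test added in PATCH 3/9; the URI and score values are
invented for illustration):

    # Hypothetical _inputs_to_exampletext output for
    # inputs='_text_,dummy-en'. _cleanup_text has stripped ':' and '|'
    # from the URI, since those characters delimit feature values and
    # namespaces in VW input format; the remaining ':0.5' is the
    # intentional VW feature:value syntax.
    exampletext = ('|_text_ arkeologia kaivaukset '
                   '|dummy-en httpexample.orgdummy:0.5')
    # a one-vs-all training line for 0-based subject ID 41 would then be:
    train_example = '{} {}'.format(41 + 1, exampletext)
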
From c4d5ea4a9819a0efdc757f30fba1e8f0e9fa1433 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 5 Feb 2019 11:10:10 +0200
Subject: [PATCH 3/9] Add unit test for training vw_multi using input from
 another project

---
 tests/test_backend_vw_multi.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/test_backend_vw_multi.py b/tests/test_backend_vw_multi.py
index a0594ae2b..e204490f3 100644
--- a/tests/test_backend_vw_multi.py
+++ b/tests/test_backend_vw_multi.py
@@ -44,6 +44,22 @@ def test_vw_multi_train(datadir, document_corpus, project):
     assert datadir.join('vw-model').size() > 0
 
 
+def test_vw_multi_train_from_project(app, datadir, document_corpus, project):
+    vw_type = annif.backend.get_backend('vw_multi')
+    vw = vw_type(
+        backend_id='vw_multi',
+        params={
+            'chunksize': 4,
+            'inputs': '_text_,dummy-en'},
+        datadir=str(datadir))
+
+    with app.app_context():
+        vw.train(document_corpus, project)
+    assert vw._model is not None
+    assert datadir.join('vw-model').exists()
+    assert datadir.join('vw-model').size() > 0
+
+
 def test_vw_multi_train_multiple_passes(datadir, document_corpus, project):
     vw_type = annif.backend.get_backend('vw_multi')
     vw = vw_type(

From 99201e5664b023dda95695c52168fb1d25b02ac9 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 5 Feb 2019 12:27:39 +0200
Subject: [PATCH 4/9] Add empty vw_multi training doc to ensure 100% test
 coverage

---
 tests/test_backend_vw_multi.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_backend_vw_multi.py b/tests/test_backend_vw_multi.py
index e204490f3..359fcbdf7 100644
--- a/tests/test_backend_vw_multi.py
+++ b/tests/test_backend_vw_multi.py
@@ -13,7 +13,8 @@ def vw_corpus(tmpdir):
     """return a small document corpus for testing VW training"""
     tmpfile = tmpdir.join('document.tsv')
     tmpfile.write("nonexistent\thttp://example.com/nonexistent\n" +
-                  "arkeologia\thttp://www.yso.fi/onto/yso/p1265")
+                  "arkeologia\thttp://www.yso.fi/onto/yso/p1265\n" +
+                  "...\thttp://example.com/none")
     return annif.corpus.DocumentFile(str(tmpfile))
 
 

From 69906fcc998ba07e50e25bfc6196b11cd12c1af4 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 5 Feb 2019 12:32:43 +0200
Subject: [PATCH 5/9] refactor: simplify _predict_chunks using a filter
 expression

---
 annif/backend/fasttext.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/annif/backend/fasttext.py b/annif/backend/fasttext.py
index 609664a49..5c8b6627c 100644
--- a/annif/backend/fasttext.py
+++ b/annif/backend/fasttext.py
@@ -108,12 +108,9 @@ def train(self, corpus, project):
         self._create_model()
 
     def _predict_chunks(self, chunktexts, project, limit):
-        normalized_chunks = []
-        for chunktext in chunktexts:
-            normalized = self._normalize_text(project, chunktext)
-            if normalized != '':
-                normalized_chunks.append(normalized)
-        return self._model.predict(normalized_chunks, limit)
+        return self._model.predict(list(
+            filter(None, [self._normalize_text(project, chunktext)
+                          for chunktext in chunktexts])), limit)
 
     def _analyze_chunks(self, chunktexts, project):
         limit = int(self.params['limit'])
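
PATCH 5/9 leans on the filter(None, ...) idiom to drop chunks whose
normalization came out empty: filter with None as the predicate keeps only
truthy items, and the empty string is falsy. A quick standalone check of
that idiom, with invented chunk texts:

    normalized = ['some normalized text', '', 'another chunk', '']
    assert list(filter(None, normalized)) == ['some normalized text',
                                              'another chunk']
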
From b69034007e3dfd722fdbebd3f5bb775eee58e41b Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 5 Feb 2019 12:36:03 +0200
Subject: [PATCH 6/9] style fixes

---
 annif/backend/vw_multi.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index 9f6b286b6..9f0a2dfaf 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -118,11 +118,10 @@ def _inputs_to_exampletext(self, project, text):
                 proj = annif.project.get_project(input)
                 result = proj.analyze(text)
                 features = [
-                    '{}:{}'.format(
-                        self._cleanup_text(
-                            hit.uri),
-                        hit.score) for hit in result.hits]
-                namespaces[input] = ' '.join(features)
+                    '{}:{}'.format(self._cleanup_text(hit.uri), hit.score)
+                    for hit in result.hits]
+                if features:
+                    namespaces[input] = ' '.join(features)
         if not namespaces:
             return None
         return ' '.join(['|{} {}'.format(namespace, featurestr)
@@ -204,7 +203,7 @@ def _analyze_chunks(self, chunktexts, project):
             else:
                 result = np.array(result)
             results.append(result)
-        if len(results) == 0:  # empty result
+        if not results:  # empty result
             return ListAnalysisResult(hits=[], subject_index=project.subjects)
         return VectorAnalysisResult(
             np.array(results).mean(axis=0), project.subjects)

From 899e8871ad8564b085bbad1c87bd2a485387da5e Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 5 Feb 2019 12:38:44 +0200
Subject: [PATCH 7/9] refactor: split off result conversion from
 _analyze_chunks

---
 annif/backend/vw_multi.py | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index 9f0a2dfaf..df7cd9d87 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -182,6 +182,21 @@ def train(self, corpus, project):
         self._create_train_file(corpus, project)
         self._create_model(project)
 
+    def _convert_result(self, result, project):
+        if self.algorithm == 'multilabel_oaa':
+            # result is a list of subject IDs - need to vectorize
+            mask = np.zeros(len(project.subjects))
+            mask[result] = 1.0
+            return mask
+        elif isinstance(result, int):
+            # result is a single integer - need to one-hot-encode
+            mask = np.zeros(len(project.subjects))
+            mask[result - 1] = 1.0
+            return mask
+        else:
+            # result is a list of scores (probabilities or binary 1/0)
+            return np.array(result)
+
     def _analyze_chunks(self, chunktexts, project):
         results = []
         for chunktext in chunktexts:
@@ -190,19 +205,7 @@ def _analyze_chunks(self, chunktexts, project):
             exampletext = self._inputs_to_exampletext(project, chunktext)
             if not exampletext:
                 continue
             example = ' {}'.format(exampletext)
             result = self._model.predict(example)
-            if self.algorithm == 'multilabel_oaa':
-                # result is a list of subject IDs - need to vectorize
-                mask = np.zeros(len(project.subjects))
-                mask[result] = 1.0
-                result = mask
-            elif isinstance(result, int):
-                # result is a single integer - need to one-hot-encode
-                mask = np.zeros(len(project.subjects))
-                mask[result - 1] = 1.0
-                result = mask
-            else:
-                result = np.array(result)
-            results.append(result)
+            results.append(self._convert_result(result, project))
         if not results:  # empty result
             return ListAnalysisResult(hits=[], subject_index=project.subjects)
         return VectorAnalysisResult(
             np.array(results).mean(axis=0), project.subjects)
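
The _convert_result method extracted in PATCH 7/9 maps the three shapes a
VW prediction can take onto one dense score vector per chunk, so
_analyze_chunks can simply average them. A self-contained numpy sketch of
those same three conversions (the subject count and values are invented):

    import numpy as np

    n_subjects = 5  # stand-in for len(project.subjects)

    # multilabel_oaa: a list of 0-based subject IDs becomes a binary mask
    mask = np.zeros(n_subjects)
    mask[[0, 3]] = 1.0                 # -> [1., 0., 0., 1., 0.]

    # oaa/ect/log_multi: a single 1-based class ID is one-hot encoded
    onehot = np.zeros(n_subjects)
    onehot[2 - 1] = 1.0                # -> [0., 1., 0., 0., 0.]

    # probability outputs are already per-subject score lists
    scores = np.array([0.1, 0.2, 0.3, 0.25, 0.15])

    # the per-chunk vectors are then averaged into the final result
    final = np.array([mask, onehot, scores]).mean(axis=0)
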
From 7edf0f67cccc4f8389cc86ca65e2954c4870c9c7 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 5 Feb 2019 12:42:07 +0200
Subject: [PATCH 8/9] refactor _inputs_to_exampletext: split off _get_input

---
 annif/backend/vw_multi.py | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index df7cd9d87..8b6cd637e 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -107,21 +107,27 @@ def _format_examples(self, project, text, uris):
         for subject_id in subject_ids:
             yield '{} {}'.format(subject_id + 1, text)
 
+    def _get_input(self, input, project, text):
+        if input == '_text_':
+            normalized = self._normalize_text(project, text)
+            if normalized != '':
+                return normalized
+        else:
+            proj = annif.project.get_project(input)
+            result = proj.analyze(text)
+            features = [
+                '{}:{}'.format(self._cleanup_text(hit.uri), hit.score)
+                for hit in result.hits]
+            if features:
+                return ' '.join(features)
+        return None
+
     def _inputs_to_exampletext(self, project, text):
         namespaces = {}
         for input in self.inputs:
-            if input == '_text_':
-                normalized = self._normalize_text(project, text)
-                if normalized != '':
-                    namespaces['_text_'] = normalized
-            else:
-                proj = annif.project.get_project(input)
-                result = proj.analyze(text)
-                features = [
-                    '{}:{}'.format(self._cleanup_text(hit.uri), hit.score)
-                    for hit in result.hits]
-                if features:
-                    namespaces[input] = ' '.join(features)
+            inputtext = self._get_input(input, project, text)
+            if inputtext:
+                namespaces[input] = inputtext
         if not namespaces:
             return None
         return ' '.join(['|{} {}'.format(namespace, featurestr)

From 11943c7c6becef5691e8a394e3962137045135a5 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Tue, 5 Feb 2019 12:45:26 +0200
Subject: [PATCH 9/9] further simplify _get_input

---
 annif/backend/vw_multi.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index 8b6cd637e..1f6d47961 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -109,18 +109,14 @@ def _format_examples(self, project, text, uris):
 
     def _get_input(self, input, project, text):
         if input == '_text_':
-            normalized = self._normalize_text(project, text)
-            if normalized != '':
-                return normalized
+            return self._normalize_text(project, text)
         else:
             proj = annif.project.get_project(input)
             result = proj.analyze(text)
             features = [
                 '{}:{}'.format(self._cleanup_text(hit.uri), hit.score)
                 for hit in result.hits]
-            if features:
-                return ' '.join(features)
-            return None
+            return ' '.join(features)
 
     def _inputs_to_exampletext(self, project, text):
         namespaces = {}
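
After PATCH 9/9, _get_input may return an empty string, either from
normalizing empty text or from joining an empty feature list, instead of an
explicit None. That is safe because the caller's `if inputtext:` test in
_inputs_to_exampletext treats '' and None alike, which is what made the
removed checks redundant. A minimal illustration of that contract, with
invented namespace contents:

    namespaces = {}
    for name, inputtext in [('_text_', ''), ('dummy-en', 'httpuri:0.5')]:
        if inputtext:  # mirrors the caller: '' and None are both skipped
            namespaces[name] = inputtext
    assert namespaces == {'dummy-en': 'httpuri:0.5'}
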