From acacf9e80448cfd626b5673d3377d0403a65031c Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 2 Jul 2019 14:12:02 +0300 Subject: [PATCH 1/4] Pass more features to VW in vw_ensemble: raw scores (descriptor invariant) and "zero features" (both descriptor-specific and descriptor-invariant) --- annif/backend/vw_ensemble.py | 20 ++++++++++++++++---- tests/test_backend_vw_ensemble.py | 5 +++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index 94cf28fa7..0fbdc1d06 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -44,6 +44,10 @@ class VWEnsembleBackend( # will make it more careful so that it will require more training data. DEFAULT_DISCOUNT_RATE = 0.01 + # score threshold for "zero features": scores lower than this will be + # considered zero and marked with a zero feature given to VW + ZERO_THRESHOLD = 0.001 + def _load_subject_freq(self): path = os.path.join(self.datadir, self.FREQ_FILE) if not os.path.exists(path): @@ -101,9 +105,16 @@ def _format_example(self, subject_id, scores, true=None): val = 1 else: val = -1 - ex = "{} |{}".format(val, subject_id) - for proj_idx, proj in enumerate(self._source_project_ids): - ex += " {}:{:.6f}".format(proj, scores[proj_idx]) + + features = " ".join(["{}:{:.6f}".format(proj, scores[proj_idx]) + for proj_idx, proj + in enumerate(self._source_project_ids)]) + zero_features = " ".join(["zero^{}".format(proj) + for proj_idx, proj + in enumerate(self._source_project_ids) + if scores[proj_idx] < self.ZERO_THRESHOLD]) + ex = "{} |raw {} {} |{} {} {}".format( + val, features, zero_features, subject_id, features, zero_features) return ex def _doc_score_vector(self, doc, source_projects): @@ -119,7 +130,8 @@ def _doc_to_example(self, doc, project, source_projects): true = subjects.as_vector(project.subjects) score_vector = self._doc_score_vector(doc, source_projects) for subj_id in range(len(true)): - if true[subj_id] or score_vector[:, subj_id].sum() > 0.0: + if true[subj_id] \ + or score_vector[:, subj_id].sum() >= self.ZERO_THRESHOLD: ex = (subj_id, self._format_example( subj_id, score_vector[:, subj_id], diff --git a/tests/test_backend_vw_ensemble.py b/tests/test_backend_vw_ensemble.py index 23b48ff53..f22dbf75d 100644 --- a/tests/test_backend_vw_ensemble.py +++ b/tests/test_backend_vw_ensemble.py @@ -126,7 +126,7 @@ def test_vw_ensemble_format_example(datadir): datadir=str(datadir)) ex = vw_ensemble._format_example(0, [0.5]) - assert ex == ' |0 dummy-en:0.500000' + assert ex == ' |raw dummy-en:0.500000 |0 dummy-en:0.500000 ' def test_vw_ensemble_format_example_avoid_sci_notation(datadir): @@ -137,4 +137,5 @@ def test_vw_ensemble_format_example_avoid_sci_notation(datadir): datadir=str(datadir)) ex = vw_ensemble._format_example(0, [7.24e-05]) - assert ex == ' |0 dummy-en:0.000072' + assert ex == ' |raw dummy-en:0.000072 zero^dummy-en' + \ + ' |0 dummy-en:0.000072 zero^dummy-en' From 64e76d385498e144b68e7dd5c21143612696ef88 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 2 Jul 2019 14:16:03 +0300 Subject: [PATCH 2/4] Add interactions between raw features to the VW model --- annif/backend/vw_ensemble.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index 0fbdc1d06..9e2a7f3d4 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -148,6 +148,11 @@ def _create_examples(self, corpus, project): random.shuffle(examples) return examples + def _create_model(self, project): + # add interactions between raw (descriptor-invariant) features to + # the mix + super()._create_model(project, {'q': 'rr'}) + @staticmethod def _write_freq_file(subject_freq, filename): with open(filename, 'w') as freqfile: From 46e7e609b4013ec0757e750ad42311a393972210 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 2 Jul 2019 14:23:57 +0300 Subject: [PATCH 3/4] Refactor: avoid useless variable --- annif/backend/vw_ensemble.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index 9e2a7f3d4..6fc7a09cd 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -113,9 +113,8 @@ def _format_example(self, subject_id, scores, true=None): for proj_idx, proj in enumerate(self._source_project_ids) if scores[proj_idx] < self.ZERO_THRESHOLD]) - ex = "{} |raw {} {} |{} {} {}".format( + return "{} |raw {} {} |{} {} {}".format( val, features, zero_features, subject_id, features, zero_features) - return ex def _doc_score_vector(self, doc, source_projects): score_vectors = [] From d531b748c5bc452d9cae4baca01324c1c5845924 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 2 Jul 2019 14:26:55 +0300 Subject: [PATCH 4/4] Refactor: split and simplify _format_example in vw_ensemble --- annif/backend/vw_ensemble.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index 6fc7a09cd..132e26d69 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -98,14 +98,16 @@ def _source_project_ids(self): sources = annif.util.parse_sources(self.params['sources']) return [project_id for project_id, _ in sources] - def _format_example(self, subject_id, scores, true=None): + @staticmethod + def _format_value(true): if true is None: - val = '' + return '' elif true: - val = 1 + return 1 else: - val = -1 + return -1 + def _format_example(self, subject_id, scores, true=None): features = " ".join(["{}:{:.6f}".format(proj, scores[proj_idx]) for proj_idx, proj in enumerate(self._source_project_ids)]) @@ -114,7 +116,12 @@ def _format_example(self, subject_id, scores, true=None): in enumerate(self._source_project_ids) if scores[proj_idx] < self.ZERO_THRESHOLD]) return "{} |raw {} {} |{} {} {}".format( - val, features, zero_features, subject_id, features, zero_features) + self._format_value(true), + features, + zero_features, + subject_id, + features, + zero_features) def _doc_score_vector(self, doc, source_projects): score_vectors = []