From e7a92f5e99125618a8b9b51298ef6f7b8d5b4bf9 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 25 Oct 2019 13:58:10 +0300 Subject: [PATCH 1/6] Switch from int8 to bool for one-hot subject vectors --- annif/corpus/subject.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py index 33e1c10ec..0f4844bf6 100644 --- a/annif/corpus/subject.py +++ b/annif/corpus/subject.py @@ -131,15 +131,15 @@ def as_vector(self, subject_index): multilabel indicator format, using a subject index as the source of subjects.""" - vector = np.zeros(len(subject_index), dtype=np.int8) + vector = np.zeros(len(subject_index), dtype=bool) if self.has_uris(): for uri in self.subject_uris: subject_id = subject_index.by_uri(uri) if subject_id is not None: - vector[subject_id] = 1 + vector[subject_id] = True else: for label in self.subject_labels: subject_id = subject_index.by_label(label) if subject_id is not None: - vector[subject_id] = 1 + vector[subject_id] = True return vector From c406d9c1da575b62a5c59c57abecd28e499892f5 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 25 Oct 2019 14:09:42 +0300 Subject: [PATCH 2/6] Use float32 data type for storing suggestion vectors, instead of default float64 --- annif/suggestion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/suggestion.py b/annif/suggestion.py index 4f0164eb3..81bbc2573 100644 --- a/annif/suggestion.py +++ b/annif/suggestion.py @@ -99,7 +99,7 @@ class VectorSuggestionResult(SuggestionResult): """SuggestionResult implementation based primarily on NumPy vectors.""" def __init__(self, vector, subject_index): - self._vector = vector + self._vector = vector.astype(np.float32) self._subject_index = subject_index self._subject_order = None self._hits = None From 324a78255e234c8be142cc040879421f93d5bc8e Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 25 Oct 2019 14:13:12 +0300 Subject: [PATCH 3/6] Use float32 data type also for vectors generated from ListSuggestionResult objects --- annif/suggestion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/suggestion.py b/annif/suggestion.py index 81bbc2573..43b901a1a 100644 --- a/annif/suggestion.py +++ b/annif/suggestion.py @@ -156,7 +156,7 @@ def __init__(self, hits, subject_index): self._vector = None def _hits_to_vector(self): - vector = np.zeros(len(self._subject_index)) + vector = np.zeros(len(self._subject_index), dtype=np.float32) for hit in self._hits: subject_id = self._subject_index.by_uri(hit.uri) if subject_id is not None: From 534697c77e2b6fd2b923e48b206744ec4412b1c2 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 25 Oct 2019 14:30:23 +0300 Subject: [PATCH 4/6] Convert to float64 because IsotonicRegression requires it --- annif/backend/pav.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/backend/pav.py b/annif/backend/pav.py index 0232a2876..e05a81045 100644 --- a/annif/backend/pav.py +++ b/annif/backend/pav.py @@ -86,7 +86,7 @@ def _create_pav_model(self, source_project_id, min_docs, corpus): if true[:, cid].sum() < min_docs: continue # don't create model b/c of too few examples reg = IsotonicRegression(out_of_bounds='clip') - reg.fit(scores[:, cid], true[:, cid]) + reg.fit(scores[:, cid].astype(np.float64), true[:, cid]) pav_regressions[source_project.subjects[cid][0]] = reg self.info("created PAV model for {} concepts".format( len(pav_regressions))) From 12f10d307756005905d809c3ee9ef0f93604f8aa Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 25 Oct 2019 14:32:14 +0300 Subject: [PATCH 5/6] Make sure evaluation batches are stored as float32 (better safe than sorry) --- annif/eval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/annif/eval.py b/annif/eval.py index da67d6c88..2f7c998e8 100644 --- a/annif/eval.py +++ b/annif/eval.py @@ -137,7 +137,8 @@ def results(self, metrics='all'): y_true = np.array([gold_subjects.as_vector(self._subject_index) for hits, gold_subjects in self._samples]) y_pred = np.array([hits.vector - for hits, gold_subjects in self._samples]) + for hits, gold_subjects in self._samples], + dtype=np.float32) results = self._evaluate_samples( y_true, y_pred, metrics) From 9d4fe86022cf63aebed59ecbedd658c02f01e1ef Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Fri, 25 Oct 2019 14:44:28 +0300 Subject: [PATCH 6/6] Use float32 data types for numpy arrays in vw_multi and vw_ensemble backends --- annif/backend/vw_ensemble.py | 7 ++++--- annif/backend/vw_multi.py | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py index 1d731f937..81ec5201a 100644 --- a/annif/backend/vw_ensemble.py +++ b/annif/backend/vw_ensemble.py @@ -84,9 +84,10 @@ def _calculate_scores(self, subj_id, subj_score_vector): def _merge_hits_from_sources(self, hits_from_sources, project, params): score_vector = np.array([hits.vector - for hits, _ in hits_from_sources]) + for hits, _ in hits_from_sources], + dtype=np.float32) discount_rate = float(self.params['discount_rate']) - result = np.zeros(score_vector.shape[1]) + result = np.zeros(score_vector.shape[1], dtype=np.float32) for subj_id in range(score_vector.shape[1]): subj_score_vector = score_vector[:, subj_id] if subj_score_vector.sum() > 0.0: @@ -120,7 +121,7 @@ def _doc_score_vector(self, doc, source_projects): for source_project in source_projects: hits = source_project.suggest(doc.text) score_vectors.append(hits.vector) - return np.array(score_vectors) + return np.array(score_vectors, dtype=np.float32) def _doc_to_example(self, doc, project, source_projects): examples = [] diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py index 63cef305d..06464431b 100644 --- a/annif/backend/vw_multi.py +++ b/annif/backend/vw_multi.py @@ -124,17 +124,17 @@ def _create_model(self, project): def _convert_result(self, result, project): if self.algorithm == 'multilabel_oaa': # result is a list of subject IDs - need to vectorize - mask = np.zeros(len(project.subjects)) + mask = np.zeros(len(project.subjects), dtype=np.float32) mask[result] = 1.0 return mask elif isinstance(result, int): # result is a single integer - need to one-hot-encode - mask = np.zeros(len(project.subjects)) + mask = np.zeros(len(project.subjects), dtype=np.float32) mask[result - 1] = 1.0 return mask else: # result is a list of scores (probabilities or binary 1/0) - return np.array(result) + return np.array(result, dtype=np.float32) def _suggest_chunks(self, chunktexts, project): results = [] @@ -149,4 +149,4 @@ def _suggest_chunks(self, chunktexts, project): return ListSuggestionResult( hits=[], subject_index=project.subjects) return VectorSuggestionResult( - np.array(results).mean(axis=0), project.subjects) + np.array(results, dtype=np.float32).mean(axis=0), project.subjects)