From e7a92f5e99125618a8b9b51298ef6f7b8d5b4bf9 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 25 Oct 2019 13:58:10 +0300
Subject: [PATCH 1/6] Switch from int8 to bool for one-hot subject vectors

---
 annif/corpus/subject.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py
index 33e1c10ec..0f4844bf6 100644
--- a/annif/corpus/subject.py
+++ b/annif/corpus/subject.py
@@ -131,15 +131,15 @@ def as_vector(self, subject_index):
            multilabel indicator format, using a subject index as the source
            of subjects."""
 
-        vector = np.zeros(len(subject_index), dtype=np.int8)
+        vector = np.zeros(len(subject_index), dtype=bool)
         if self.has_uris():
             for uri in self.subject_uris:
                 subject_id = subject_index.by_uri(uri)
                 if subject_id is not None:
-                    vector[subject_id] = 1
+                    vector[subject_id] = True
         else:
             for label in self.subject_labels:
                 subject_id = subject_index.by_label(label)
                 if subject_id is not None:
-                    vector[subject_id] = 1
+                    vector[subject_id] = True
         return vector

From c406d9c1da575b62a5c59c57abecd28e499892f5 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 25 Oct 2019 14:09:42 +0300
Subject: [PATCH 2/6] Use float32 data type for storing suggestion vectors,
 instead of default float64

---
 annif/suggestion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/annif/suggestion.py b/annif/suggestion.py
index 4f0164eb3..81bbc2573 100644
--- a/annif/suggestion.py
+++ b/annif/suggestion.py
@@ -99,7 +99,7 @@ class VectorSuggestionResult(SuggestionResult):
     """SuggestionResult implementation based primarily on NumPy vectors."""
 
     def __init__(self, vector, subject_index):
-        self._vector = vector
+        self._vector = vector.astype(np.float32)
         self._subject_index = subject_index
         self._subject_order = None
         self._hits = None

From 324a78255e234c8be142cc040879421f93d5bc8e Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 25 Oct 2019 14:13:12 +0300
Subject: [PATCH 3/6] Use float32 data type also for vectors generated from
 ListSuggestionResult objects

---
 annif/suggestion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/annif/suggestion.py b/annif/suggestion.py
index 81bbc2573..43b901a1a 100644
--- a/annif/suggestion.py
+++ b/annif/suggestion.py
@@ -156,7 +156,7 @@ def __init__(self, hits, subject_index):
         self._vector = None
 
     def _hits_to_vector(self):
-        vector = np.zeros(len(self._subject_index))
+        vector = np.zeros(len(self._subject_index), dtype=np.float32)
         for hit in self._hits:
             subject_id = self._subject_index.by_uri(hit.uri)
             if subject_id is not None:

From 534697c77e2b6fd2b923e48b206744ec4412b1c2 Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 25 Oct 2019 14:30:23 +0300
Subject: [PATCH 4/6] Convert to float64 because IsotonicRegression requires it

---
 annif/backend/pav.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/annif/backend/pav.py b/annif/backend/pav.py
index 0232a2876..e05a81045 100644
--- a/annif/backend/pav.py
+++ b/annif/backend/pav.py
@@ -86,7 +86,7 @@ def _create_pav_model(self, source_project_id, min_docs, corpus):
             if true[:, cid].sum() < min_docs:
                 continue  # don't create model b/c of too few examples
             reg = IsotonicRegression(out_of_bounds='clip')
-            reg.fit(scores[:, cid], true[:, cid])
+            reg.fit(scores[:, cid].astype(np.float64), true[:, cid])
             pav_regressions[source_project.subjects[cid][0]] = reg
         self.info("created PAV model for {} concepts".format(
             len(pav_regressions)))

From 12f10d307756005905d809c3ee9ef0f93604f8aa Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 25 Oct 2019 14:32:14 +0300
Subject: [PATCH 5/6] Make sure evaluation batches are stored as float32
 (better safe than sorry)

---
 annif/eval.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/annif/eval.py b/annif/eval.py
index da67d6c88..2f7c998e8 100644
--- a/annif/eval.py
+++ b/annif/eval.py
@@ -137,7 +137,8 @@ def results(self, metrics='all'):
         y_true = np.array([gold_subjects.as_vector(self._subject_index)
                            for hits, gold_subjects in self._samples])
         y_pred = np.array([hits.vector
-                           for hits, gold_subjects in self._samples])
+                           for hits, gold_subjects in self._samples],
+                          dtype=np.float32)
 
         results = self._evaluate_samples(
             y_true, y_pred, metrics)

From 9d4fe86022cf63aebed59ecbedd658c02f01e1ef Mon Sep 17 00:00:00 2001
From: Osma Suominen <osma.suominen@helsinki.fi>
Date: Fri, 25 Oct 2019 14:44:28 +0300
Subject: [PATCH 6/6] Use float32 data types for numpy arrays in vw_multi and
 vw_ensemble backends

---
 annif/backend/vw_ensemble.py | 7 ++++---
 annif/backend/vw_multi.py    | 8 ++++----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
index 1d731f937..81ec5201a 100644
--- a/annif/backend/vw_ensemble.py
+++ b/annif/backend/vw_ensemble.py
@@ -84,9 +84,10 @@ def _calculate_scores(self, subj_id, subj_score_vector):
 
     def _merge_hits_from_sources(self, hits_from_sources, project, params):
         score_vector = np.array([hits.vector
-                                 for hits, _ in hits_from_sources])
+                                 for hits, _ in hits_from_sources],
+                                dtype=np.float32)
         discount_rate = float(self.params['discount_rate'])
-        result = np.zeros(score_vector.shape[1])
+        result = np.zeros(score_vector.shape[1], dtype=np.float32)
         for subj_id in range(score_vector.shape[1]):
             subj_score_vector = score_vector[:, subj_id]
             if subj_score_vector.sum() > 0.0:
@@ -120,7 +121,7 @@ def _doc_score_vector(self, doc, source_projects):
         for source_project in source_projects:
             hits = source_project.suggest(doc.text)
             score_vectors.append(hits.vector)
-        return np.array(score_vectors)
+        return np.array(score_vectors, dtype=np.float32)
 
     def _doc_to_example(self, doc, project, source_projects):
         examples = []
diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
index 63cef305d..06464431b 100644
--- a/annif/backend/vw_multi.py
+++ b/annif/backend/vw_multi.py
@@ -124,17 +124,17 @@ def _create_model(self, project):
     def _convert_result(self, result, project):
         if self.algorithm == 'multilabel_oaa':
             # result is a list of subject IDs - need to vectorize
-            mask = np.zeros(len(project.subjects))
+            mask = np.zeros(len(project.subjects), dtype=np.float32)
             mask[result] = 1.0
             return mask
         elif isinstance(result, int):
             # result is a single integer - need to one-hot-encode
-            mask = np.zeros(len(project.subjects))
+            mask = np.zeros(len(project.subjects), dtype=np.float32)
             mask[result - 1] = 1.0
             return mask
         else:
             # result is a list of scores (probabilities or binary 1/0)
-            return np.array(result)
+            return np.array(result, dtype=np.float32)
 
     def _suggest_chunks(self, chunktexts, project):
         results = []
@@ -149,4 +149,4 @@ def _suggest_chunks(self, chunktexts, project):
             return ListSuggestionResult(
                 hits=[], subject_index=project.subjects)
         return VectorSuggestionResult(
-            np.array(results).mean(axis=0), project.subjects)
+            np.array(results, dtype=np.float32).mean(axis=0), project.subjects)