Merge pull request #340 from NatLibFi/reduce-vector-memory-usage

Reduce vector memory usage
NatLibFi · Oct 25, 2019 · e1b27ae
2 parents 462165f + 9d4fe86
commit e1b27ae
Show file tree

Hide file tree

Showing 6 changed files with 16 additions and 14 deletions.
diff --git a/annif/backend/pav.py b/annif/backend/pav.py
@@ -86,7 +86,7 @@ def _create_pav_model(self, source_project_id, min_docs, corpus):
             if true[:, cid].sum() < min_docs:
                 continue  # don't create model b/c of too few examples
             reg = IsotonicRegression(out_of_bounds='clip')
-            reg.fit(scores[:, cid], true[:, cid])
+            reg.fit(scores[:, cid].astype(np.float64), true[:, cid])
             pav_regressions[source_project.subjects[cid][0]] = reg
         self.info("created PAV model for {} concepts".format(
             len(pav_regressions)))

diff --git a/annif/backend/vw_ensemble.py b/annif/backend/vw_ensemble.py
@@ -84,9 +84,10 @@ def _calculate_scores(self, subj_id, subj_score_vector):
 
     def _merge_hits_from_sources(self, hits_from_sources, project, params):
         score_vector = np.array([hits.vector
-                                 for hits, _ in hits_from_sources])
+                                 for hits, _ in hits_from_sources],
+                                dtype=np.float32)
         discount_rate = float(self.params['discount_rate'])
-        result = np.zeros(score_vector.shape[1])
+        result = np.zeros(score_vector.shape[1], dtype=np.float32)
         for subj_id in range(score_vector.shape[1]):
             subj_score_vector = score_vector[:, subj_id]
             if subj_score_vector.sum() > 0.0:
@@ -120,7 +121,7 @@ def _doc_score_vector(self, doc, source_projects):
         for source_project in source_projects:
             hits = source_project.suggest(doc.text)
             score_vectors.append(hits.vector)
-        return np.array(score_vectors)
+        return np.array(score_vectors, dtype=np.float32)
 
     def _doc_to_example(self, doc, project, source_projects):
         examples = []

diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
@@ -124,17 +124,17 @@ def _create_model(self, project):
     def _convert_result(self, result, project):
         if self.algorithm == 'multilabel_oaa':
             # result is a list of subject IDs - need to vectorize
-            mask = np.zeros(len(project.subjects))
+            mask = np.zeros(len(project.subjects), dtype=np.float32)
             mask[result] = 1.0
             return mask
         elif isinstance(result, int):
             # result is a single integer - need to one-hot-encode
-            mask = np.zeros(len(project.subjects))
+            mask = np.zeros(len(project.subjects), dtype=np.float32)
             mask[result - 1] = 1.0
             return mask
         else:
             # result is a list of scores (probabilities or binary 1/0)
-            return np.array(result)
+            return np.array(result, dtype=np.float32)
 
     def _suggest_chunks(self, chunktexts, project):
         results = []
@@ -149,4 +149,4 @@ def _suggest_chunks(self, chunktexts, project):
             return ListSuggestionResult(
                 hits=[], subject_index=project.subjects)
         return VectorSuggestionResult(
-            np.array(results).mean(axis=0), project.subjects)
+            np.array(results, dtype=np.float32).mean(axis=0), project.subjects)
diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py
@@ -131,15 +131,15 @@ def as_vector(self, subject_index):
            multilabel indicator format, using a subject index as the source
            of subjects."""
 
-        vector = np.zeros(len(subject_index), dtype=np.int8)
+        vector = np.zeros(len(subject_index), dtype=bool)
         if self.has_uris():
             for uri in self.subject_uris:
                 subject_id = subject_index.by_uri(uri)
                 if subject_id is not None:
-                    vector[subject_id] = 1
+                    vector[subject_id] = True
         else:
             for label in self.subject_labels:
                 subject_id = subject_index.by_label(label)
                 if subject_id is not None:
-                    vector[subject_id] = 1
+                    vector[subject_id] = True
         return vector
diff --git a/annif/eval.py b/annif/eval.py
@@ -137,7 +137,8 @@ def results(self, metrics='all'):
         y_true = np.array([gold_subjects.as_vector(self._subject_index)
                            for hits, gold_subjects in self._samples])
         y_pred = np.array([hits.vector
-                           for hits, gold_subjects in self._samples])
+                           for hits, gold_subjects in self._samples],
+                          dtype=np.float32)
 
         results = self._evaluate_samples(
             y_true, y_pred, metrics)

diff --git a/annif/suggestion.py b/annif/suggestion.py
@@ -99,7 +99,7 @@ class VectorSuggestionResult(SuggestionResult):
     """SuggestionResult implementation based primarily on NumPy vectors."""
 
     def __init__(self, vector, subject_index):
-        self._vector = vector
+        self._vector = vector.astype(np.float32)
         self._subject_index = subject_index
         self._subject_order = None
         self._hits = None
@@ -156,7 +156,7 @@ def __init__(self, hits, subject_index):
         self._vector = None
 
     def _hits_to_vector(self):
-        vector = np.zeros(len(self._subject_index))
+        vector = np.zeros(len(self._subject_index), dtype=np.float32)
         for hit in self._hits:
             subject_id = self._subject_index.by_uri(hit.uri)
             if subject_id is not None: