Skip to content

Commit

Permalink
Merge pull request #340 from NatLibFi/reduce-vector-memory-usage
Browse files (browse the repository at this point in the history)
Reduce vector memory usage
  • Loading branch information
osma authored Oct 25, 2019
2 parents (462165f + 9d4fe86), commit e1b27ae
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 14 deletions.
2 changes: 1 addition & 1 deletion annif/backend/pav.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def _create_pav_model(self, source_project_id, min_docs, corpus):
if true[:, cid].sum() < min_docs:
continue # don't create model b/c of too few examples
reg = IsotonicRegression(out_of_bounds='clip')
reg.fit(scores[:, cid], true[:, cid])
reg.fit(scores[:, cid].astype(np.float64), true[:, cid])
pav_regressions[source_project.subjects[cid][0]] = reg
self.info("created PAV model for {} concepts".format(
len(pav_regressions)))
Expand Down
7 changes: 4 additions & 3 deletions annif/backend/vw_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,10 @@ def _calculate_scores(self, subj_id, subj_score_vector):

def _merge_hits_from_sources(self, hits_from_sources, project, params):
score_vector = np.array([hits.vector
for hits, _ in hits_from_sources])
for hits, _ in hits_from_sources],
dtype=np.float32)
discount_rate = float(self.params['discount_rate'])
result = np.zeros(score_vector.shape[1])
result = np.zeros(score_vector.shape[1], dtype=np.float32)
for subj_id in range(score_vector.shape[1]):
subj_score_vector = score_vector[:, subj_id]
if subj_score_vector.sum() > 0.0:
Expand Down Expand Up @@ -120,7 +121,7 @@ def _doc_score_vector(self, doc, source_projects):
for source_project in source_projects:
hits = source_project.suggest(doc.text)
score_vectors.append(hits.vector)
return np.array(score_vectors)
return np.array(score_vectors, dtype=np.float32)

def _doc_to_example(self, doc, project, source_projects):
examples = []
Expand Down
8 changes: 4 additions & 4 deletions annif/backend/vw_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,17 +124,17 @@ def _create_model(self, project):
def _convert_result(self, result, project):
if self.algorithm == 'multilabel_oaa':
# result is a list of subject IDs - need to vectorize
mask = np.zeros(len(project.subjects))
mask = np.zeros(len(project.subjects), dtype=np.float32)
mask[result] = 1.0
return mask
elif isinstance(result, int):
# result is a single integer - need to one-hot-encode
mask = np.zeros(len(project.subjects))
mask = np.zeros(len(project.subjects), dtype=np.float32)
mask[result - 1] = 1.0
return mask
else:
# result is a list of scores (probabilities or binary 1/0)
return np.array(result)
return np.array(result, dtype=np.float32)

def _suggest_chunks(self, chunktexts, project):
results = []
Expand All @@ -149,4 +149,4 @@ def _suggest_chunks(self, chunktexts, project):
return ListSuggestionResult(
hits=[], subject_index=project.subjects)
return VectorSuggestionResult(
np.array(results).mean(axis=0), project.subjects)
np.array(results, dtype=np.float32).mean(axis=0), project.subjects)
6 changes: 3 additions & 3 deletions annif/corpus/subject.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,15 +131,15 @@ def as_vector(self, subject_index):
multilabel indicator format, using a subject index as the source
of subjects."""

vector = np.zeros(len(subject_index), dtype=np.int8)
vector = np.zeros(len(subject_index), dtype=bool)
if self.has_uris():
for uri in self.subject_uris:
subject_id = subject_index.by_uri(uri)
if subject_id is not None:
vector[subject_id] = 1
vector[subject_id] = True
else:
for label in self.subject_labels:
subject_id = subject_index.by_label(label)
if subject_id is not None:
vector[subject_id] = 1
vector[subject_id] = True
return vector
3 changes: 2 additions & 1 deletion annif/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,8 @@ def results(self, metrics='all'):
y_true = np.array([gold_subjects.as_vector(self._subject_index)
for hits, gold_subjects in self._samples])
y_pred = np.array([hits.vector
for hits, gold_subjects in self._samples])
for hits, gold_subjects in self._samples],
dtype=np.float32)

results = self._evaluate_samples(
y_true, y_pred, metrics)
Expand Down
4 changes: 2 additions & 2 deletions annif/suggestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ class VectorSuggestionResult(SuggestionResult):
"""SuggestionResult implementation based primarily on NumPy vectors."""

def __init__(self, vector, subject_index):
self._vector = vector
self._vector = vector.astype(np.float32)
self._subject_index = subject_index
self._subject_order = None
self._hits = None
Expand Down Expand Up @@ -156,7 +156,7 @@ def __init__(self, hits, subject_index):
self._vector = None

def _hits_to_vector(self):
vector = np.zeros(len(self._subject_index))
vector = np.zeros(len(self._subject_index), dtype=np.float32)
for hit in self._hits:
subject_id = self._subject_index.by_uri(hit.uri)
if subject_id is not None:
Expand Down

0 comments on commit e1b27ae

Please sign in to comment.