Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add get_sentence_vector() to FastText and get_mean_vector() to KeyedVectors #3188

Merged
merged 12 commits into from
Mar 22, 2022
24 changes: 22 additions & 2 deletions gensim/models/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -1045,7 +1045,7 @@ def __contains__(self, word):

Note
----
This method **always** returns True, because of the way FastText works.
This method **always** returns True with char ngrams, because of the way FastText works.

If you want to check if a word is an in-vocabulary term, use this instead:

Expand All @@ -1059,7 +1059,10 @@ def __contains__(self, word):
False

"""
return True
if self.bucket == 0: # check for the case when char ngrams not used
rock420 marked this conversation as resolved.
Show resolved Hide resolved
return word in self.key_to_index
else:
return True

def save(self, *args, **kwargs):
"""Save object.
Expand Down Expand Up @@ -1131,6 +1134,23 @@ def get_vector(self, word, norm=False):
else:
return word_vec / len(ngram_hashes)

def get_sentence_vector(self, sentence):
    """Compute a single 1-D vector for an entire `sentence`.

    Work-alike of the official fastText library's ``get_sentence_vector()``:
    delegates to the parent class's ``get_mean_vector()``, so the result is
    the (normalized-then-averaged) mean of the word vectors.

    Parameters
    ----------
    sentence : list of (str or int)
        Words of the sentence, each given as a string token or an int id.

    Returns
    -------
    numpy.ndarray
        1-D numpy array representing `sentence`.

    """
    parent_kv = super(FastTextKeyedVectors, self)
    return parent_kv.get_mean_vector(sentence)

def resize_vectors(self, seed=0):
"""Make underlying vectors match 'index_to_key' size; random-initialize any new rows."""

Expand Down
119 changes: 90 additions & 29 deletions gensim/models/keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,8 @@
from typing import Iterable

from numpy import (
dot, float32 as REAL, double, array, zeros, vstack,
ndarray, sum as np_sum, prod, argmax, dtype, ascontiguousarray, frombuffer,
dot, float32 as REAL, double, zeros, vstack, ndarray,
sum as np_sum, prod, argmax, dtype, ascontiguousarray, frombuffer,
)
import numpy as np
from scipy import stats
Expand Down Expand Up @@ -203,6 +203,9 @@ def _ensure_list(value):
if isinstance(value, _KEY_TYPES) or (isinstance(value, ndarray) and len(value.shape) == 1):
return [value]

if isinstance(value, ndarray) and len(value.shape) == 2:
return list(value)

return value


Expand Down Expand Up @@ -453,6 +456,71 @@ def word_vec(self, *args, **kwargs):
"""Compatibility alias for get_vector(); must exist so subclass calls reach subclass get_vector()."""
return self.get_vector(*args, **kwargs)

def get_mean_vector(self, keys, weights=None, pre_normalize=True, post_normalize=False, ignore_missing=True):
    """Get the mean vector for a given list of keys.

    Parameters
    ----------
    keys : list of (str or int or ndarray)
        Keys specified by string or int ids, or raw vectors passed directly
        as 1-D numpy arrays (raw vectors are used as-is, never re-looked-up).
    weights : sequence of float or numpy.ndarray, optional
        1-D sequence of the same length as `keys`, giving the weight of each
        key. Defaults to a uniform weight of 1.0 per key.
    pre_normalize : bool, optional
        If True (default), unit-normalize each key vector before averaging.
        Raw ndarray keys are never normalized, matching the lookup path only.
    post_normalize : bool, optional
        If True, the returned mean vector is unit-normalized.
    ignore_missing : bool, optional
        If True (default), keys absent from the vocabulary are silently
        skipped; if False, a missing key raises KeyError.

    Returns
    -------
    numpy.ndarray
        Mean vector for the list of keys. If every key was skipped, the
        zero vector is returned.

    Raises
    ------
    ValueError
        If `keys` is empty, or `keys` and `weights` differ in length.
    KeyError
        If a key is missing from the vocabulary and `ignore_missing` is False.

    """
    if len(keys) == 0:
        raise ValueError("cannot compute mean with no input")
    if weights is None:
        weights = np.ones(len(keys))
    else:
        # Accept any sequence (list, tuple, ndarray) uniformly; the previous
        # isinstance(weights, list) check crashed on tuples at .shape below.
        weights = np.asarray(weights)
    if len(keys) != weights.shape[0]:  # weights is a 1-D numpy array
        raise ValueError(
            "keys and weights array must have same number of elements"
        )

    mean = np.zeros(self.vector_size, self.vectors.dtype)

    total_weight = 0
    for idx, key in enumerate(keys):
        if isinstance(key, ndarray):
            # a raw vector supplied by the caller: use it directly
            mean += weights[idx] * key
            total_weight += abs(weights[idx])
        elif self.__contains__(key):
            vec = self.get_vector(key, norm=pre_normalize)
            mean += weights[idx] * vec
            total_weight += abs(weights[idx])
        elif not ignore_missing:
            raise KeyError(f"Key '{key}' not present in vocabulary")

    # total_weight stays 0 when every key was skipped: leave the zero vector
    if total_weight > 0:
        mean = mean / total_weight
    if post_normalize:
        mean = matutils.unitvec(mean).astype(REAL)
    return mean

def add_vector(self, key, vector):
"""Add one new vector at the given key, into existing slot if available.

Expand Down Expand Up @@ -717,10 +785,10 @@ def most_similar(

Parameters
----------
positive : list of (str or int or ndarray), optional
List of keys that contribute positively.
negative : list of (str or int or ndarray), optional
List of keys that contribute negatively.
positive : list of (str or int or ndarray) or list of ((str,float) or (int,float) or (ndarray,float)), optional
List of keys that contribute positively. If tuple, second element specifies the weight (default `1.0`)
negative : list of (str or int or ndarray) or list of ((str,float) or (int,float) or (ndarray,float)), optional
List of keys that contribute negatively. If tuple, second element specifies the weight (default `-1.0`)
topn : int or None, optional
Number of top-N similar keys to return, when `topn` is int. When `topn` is None,
then similarities for all keys are returned.
Expand Down Expand Up @@ -758,27 +826,20 @@ def most_similar(
clip_end = restrict_vocab

# add weights for each key, if not already present; default to 1.0 for positive and -1.0 for negative keys
positive = [
(item, 1.0) if isinstance(item, _EXTENDED_KEY_TYPES) else item
for item in positive
]
negative = [
(item, -1.0) if isinstance(item, _EXTENDED_KEY_TYPES) else item
for item in negative
]
keys = []
weight = np.concatenate((np.ones(len(positive)), -1.0 * np.ones(len(negative))))
for idx, item in enumerate(positive + negative):
if isinstance(item, _EXTENDED_KEY_TYPES):
keys.append(item)
else:
keys.append(item[0])
weight[idx] = item[1]

# compute the weighted average of all keys
all_keys, mean = set(), []
for key, weight in positive + negative:
if isinstance(key, ndarray):
mean.append(weight * key)
else:
mean.append(weight * self.get_vector(key, norm=True))
if self.has_index_for(key):
all_keys.add(self.get_index(key))
if not mean:
raise ValueError("cannot compute similarity with no input")
mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)
mean = self.get_mean_vector(keys, weight, pre_normalize=True, post_normalize=True, ignore_missing=False)
all_keys = [
self.get_index(key) for key in keys if isinstance(key, _KEY_TYPES) and self.has_index_for(key)
]

if indexer is not None and isinstance(topn, int):
return indexer.most_similar(mean, topn)
Expand Down Expand Up @@ -1059,7 +1120,7 @@ def rank_by_centrality(self, words, use_norm=True):
if not used_words:
raise ValueError("cannot select a word from an empty list")
vectors = vstack([self.get_vector(word, norm=use_norm) for word in used_words]).astype(REAL)
mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
mean = self.get_mean_vector(vectors, post_normalize=True)
dists = dot(vectors, mean)
return sorted(zip(dists, used_words), reverse=True)

Expand Down Expand Up @@ -1191,9 +1252,9 @@ def n_similarity(self, ws1, ws2):
"""
if not(len(ws1) and len(ws2)):
raise ZeroDivisionError('At least one of the passed list is empty.')
v1 = [self[key] for key in ws1]
v2 = [self[key] for key in ws2]
return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0)))
mean1 = self.get_mean_vector(ws1, pre_normalize=False)
mean2 = self.get_mean_vector(ws2, pre_normalize=False)
return dot(matutils.unitvec(mean1), matutils.unitvec(mean2))

@staticmethod
def _log_evaluate_word_analogies(section):
Expand Down
29 changes: 29 additions & 0 deletions gensim/test/test_keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,35 @@ def test_no_header(self):
self.assertEqual(randkv.index_to_key, reloadtxtkv.index_to_key)
self.assertTrue((randkv.vectors == reloadtxtkv.vectors).all())

def test_get_mean_vector(self):
    """Test get_mean_vector returns expected results."""
    keys = [
        'conflict',
        'administration',
        'terrorism',
        'call',
        'an out-of-vocabulary word',
    ]
    weights = [1, 2, 3, 1, 2]
    # (kwargs passed to get_mean_vector, expected resulting vector)
    cases = [
        (
            {},
            np.array([
                0.02000151, -0.12685453, 0.09196121, 0.25514853, 0.25740655,
                -0.11134843, -0.0502661, -0.19278568, -0.83346179, -0.12068878,
            ], dtype=np.float32),
        ),
        (
            {'weights': weights},
            np.array([
                -0.0145228, -0.11530358, 0.1169825, 0.22537769, 0.29353586,
                -0.10458107, -0.05272481, -0.17547795, -0.84245106, -0.10356515,
            ], dtype=np.float32),
        ),
        (
            {'pre_normalize': False},
            np.array([
                0.01343237, -0.47651053, 0.45645328, 0.98304356, 1.1840123,
                -0.51647933, -0.25308795, -0.77931081, -3.55954733, -0.55429711,
            ], dtype=np.float32),
        ),
    ]
    for kwargs, expected in cases:
        self.assertTrue(np.allclose(self.vectors.get_mean_vector(keys, **kwargs), expected))


class Gensim320Test(unittest.TestCase):
def test(self):
Expand Down