Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add get_sentence_vector() to FastText and get_mean_vector() to KeyedVectors #3188

Merged
merged 12 commits into from
Mar 22, 2022
24 changes: 22 additions & 2 deletions gensim/models/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -1045,7 +1045,7 @@ def __contains__(self, word):

Note
----
This method **always** returns True, because of the way FastText works.
This method **always** returns True with char ngrams, because of the way FastText works.

If you want to check if a word is an in-vocabulary term, use this instead:

Expand All @@ -1059,7 +1059,10 @@ def __contains__(self, word):
False

"""
return True
if self.bucket == 0: # check for the case when char ngrams not used
rock420 marked this conversation as resolved.
Show resolved Hide resolved
return word in self.key_to_index
else:
return True

def save(self, *args, **kwargs):
"""Save object.
Expand Down Expand Up @@ -1131,6 +1134,23 @@ def get_vector(self, word, norm=False):
else:
return word_vec / len(ngram_hashes)

def get_sentence_vector(self, sentence):
    """Compute a single 1-D vector for an entire `sentence`.

    Work-alike of the official fastText library's ``get_sentence_vector()``:
    delegates to the parent class's ``get_mean_vector()``, so the result is
    the (normalized-then-averaged) mean of the word vectors.

    Parameters
    ----------
    sentence : list of (str or int)
        Words of the sentence, each given as a string token or an int id.

    Returns
    -------
    numpy.ndarray
        1-D numpy array representing `sentence`.

    """
    parent_kv = super(FastTextKeyedVectors, self)
    return parent_kv.get_mean_vector(sentence)

def resize_vectors(self, seed=0):
"""Make underlying vectors match 'index_to_key' size; random-initialize any new rows."""

Expand Down
119 changes: 90 additions & 29 deletions gensim/models/keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,8 @@
from typing import Iterable

from numpy import (
dot, float32 as REAL, double, array, zeros, vstack,
ndarray, sum as np_sum, prod, argmax, dtype, ascontiguousarray, frombuffer,
dot, float32 as REAL, double, zeros, vstack, ndarray,
sum as np_sum, prod, argmax, dtype, ascontiguousarray, frombuffer,
)
import numpy as np
from scipy import stats
Expand Down Expand Up @@ -203,6 +203,9 @@ def _ensure_list(value):
if isinstance(value, _KEY_TYPES) or (isinstance(value, ndarray) and len(value.shape) == 1):
return [value]

if isinstance(value, ndarray) and len(value.shape) == 2:
return list(value)

return value


Expand Down Expand Up @@ -453,6 +456,71 @@ def word_vec(self, *args, **kwargs):
"""Compatibility alias for get_vector(); must exist so subclass calls reach subclass get_vector()."""
return self.get_vector(*args, **kwargs)

def get_mean_vector(self, keys, weights=None, pre_normalize=True, post_normalize=False, ignore_missing=True):
    """Get the mean vector for a given list of keys.

    Parameters
    ----------
    keys : list of (str or int or ndarray)
        Keys specified by string or int ids, or raw vectors passed directly
        as 1-D numpy arrays (raw vectors are used as-is, never re-looked-up).
    weights : sequence of float or numpy.ndarray, optional
        1-D sequence of the same length as `keys`, giving the weight of each
        key. Defaults to a uniform weight of 1.0 per key.
    pre_normalize : bool, optional
        If True (default), unit-normalize each key vector before averaging.
        Raw ndarray keys are never normalized, matching the lookup path only.
    post_normalize : bool, optional
        If True, the returned mean vector is unit-normalized.
    ignore_missing : bool, optional
        If True (default), keys absent from the vocabulary are silently
        skipped; if False, a missing key raises KeyError.

    Returns
    -------
    numpy.ndarray
        Mean vector for the list of keys. If every key was skipped, the
        zero vector is returned.

    Raises
    ------
    ValueError
        If `keys` is empty, or `keys` and `weights` differ in length.
    KeyError
        If a key is missing from the vocabulary and `ignore_missing` is False.

    """
    if len(keys) == 0:
        raise ValueError("cannot compute mean with no input")
    if weights is None:
        weights = np.ones(len(keys))
    else:
        # Accept any sequence (list, tuple, ndarray) uniformly; the previous
        # isinstance(weights, list) check crashed on tuples at .shape below.
        weights = np.asarray(weights)
    if len(keys) != weights.shape[0]:  # weights is a 1-D numpy array
        raise ValueError(
            "keys and weights array must have same number of elements"
        )

    mean = np.zeros(self.vector_size, self.vectors.dtype)

    total_weight = 0
    for idx, key in enumerate(keys):
        if isinstance(key, ndarray):
            # a raw vector supplied by the caller: use it directly
            mean += weights[idx] * key
            total_weight += abs(weights[idx])
        elif self.__contains__(key):
            vec = self.get_vector(key, norm=pre_normalize)
            mean += weights[idx] * vec
            total_weight += abs(weights[idx])
        elif not ignore_missing:
            raise KeyError(f"Key '{key}' not present in vocabulary")

    # total_weight stays 0 when every key was skipped: leave the zero vector
    if total_weight > 0:
        mean = mean / total_weight
    if post_normalize:
        mean = matutils.unitvec(mean).astype(REAL)
    return mean

def add_vector(self, key, vector):
"""Add one new vector at the given key, into existing slot if available.

Expand Down Expand Up @@ -717,10 +785,10 @@ def most_similar(

Parameters
----------
positive : list of (str or int or ndarray), optional
List of keys that contribute positively.
negative : list of (str or int or ndarray), optional
List of keys that contribute negatively.
positive : list of (str or int or ndarray) or list of ((str,float) or (int,float) or (ndarray,float)), optional
List of keys that contribute positively. If tuple, second element specifies the weight (default `1.0`)
negative : list of (str or int or ndarray) or list of ((str,float) or (int,float) or (ndarray,float)), optional
List of keys that contribute negatively. If tuple, second element specifies the weight (default `-1.0`)
topn : int or None, optional
Number of top-N similar keys to return, when `topn` is int. When `topn` is None,
then similarities for all keys are returned.
Expand Down Expand Up @@ -758,27 +826,20 @@ def most_similar(
clip_end = restrict_vocab

# add weights for each key, if not already present; default to 1.0 for positive and -1.0 for negative keys
positive = [
(item, 1.0) if isinstance(item, _EXTENDED_KEY_TYPES) else item
for item in positive
]
negative = [
(item, -1.0) if isinstance(item, _EXTENDED_KEY_TYPES) else item
for item in negative
]
keys = []
weight = np.concatenate((np.ones(len(positive)), -1.0 * np.ones(len(negative))))
for idx, item in enumerate(positive + negative):
if isinstance(item, _EXTENDED_KEY_TYPES):
keys.append(item)
else:
keys.append(item[0])
weight[idx] = item[1]

# compute the weighted average of all keys
all_keys, mean = set(), []
for key, weight in positive + negative:
if isinstance(key, ndarray):
mean.append(weight * key)
else:
mean.append(weight * self.get_vector(key, norm=True))
if self.has_index_for(key):
all_keys.add(self.get_index(key))
if not mean:
raise ValueError("cannot compute similarity with no input")
mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)
mean = self.get_mean_vector(keys, weight, pre_normalize=True, post_normalize=True, ignore_missing=False)
all_keys = [
self.get_index(key) for key in keys if isinstance(key, _KEY_TYPES) and self.has_index_for(key)
]

if indexer is not None and isinstance(topn, int):
return indexer.most_similar(mean, topn)
Expand Down Expand Up @@ -1059,7 +1120,7 @@ def rank_by_centrality(self, words, use_norm=True):
if not used_words:
raise ValueError("cannot select a word from an empty list")
vectors = vstack([self.get_vector(word, norm=use_norm) for word in used_words]).astype(REAL)
mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
mean = self.get_mean_vector(vectors, post_normalize=True)
dists = dot(vectors, mean)
return sorted(zip(dists, used_words), reverse=True)

Expand Down Expand Up @@ -1191,9 +1252,9 @@ def n_similarity(self, ws1, ws2):
"""
if not(len(ws1) and len(ws2)):
raise ZeroDivisionError('At least one of the passed list is empty.')
v1 = [self[key] for key in ws1]
v2 = [self[key] for key in ws2]
return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0)))
mean1 = self.get_mean_vector(ws1, pre_normalize=False)
mean2 = self.get_mean_vector(ws2, pre_normalize=False)
return dot(matutils.unitvec(mean1), matutils.unitvec(mean2))

@staticmethod
def _log_evaluate_word_analogies(section):
Expand Down
29 changes: 29 additions & 0 deletions gensim/test/test_keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,35 @@ def test_no_header(self):
self.assertEqual(randkv.index_to_key, reloadtxtkv.index_to_key)
self.assertTrue((randkv.vectors == reloadtxtkv.vectors).all())

def test_get_mean_vector(self):
    """Test get_mean_vector returns expected results."""
    keys = [
        'conflict',
        'administration',
        'terrorism',
        'call',
        'an out-of-vocabulary word',
    ]
    weights = [1, 2, 3, 1, 2]
    # (kwargs passed to get_mean_vector, expected resulting vector)
    cases = [
        (
            {},
            np.array([
                0.02000151, -0.12685453, 0.09196121, 0.25514853, 0.25740655,
                -0.11134843, -0.0502661, -0.19278568, -0.83346179, -0.12068878,
            ], dtype=np.float32),
        ),
        (
            {'weights': weights},
            np.array([
                -0.0145228, -0.11530358, 0.1169825, 0.22537769, 0.29353586,
                -0.10458107, -0.05272481, -0.17547795, -0.84245106, -0.10356515,
            ], dtype=np.float32),
        ),
        (
            {'pre_normalize': False},
            np.array([
                0.01343237, -0.47651053, 0.45645328, 0.98304356, 1.1840123,
                -0.51647933, -0.25308795, -0.77931081, -3.55954733, -0.55429711,
            ], dtype=np.float32),
        ),
    ]
    for kwargs, expected in cases:
        self.assertTrue(np.allclose(self.vectors.get_mean_vector(keys, **kwargs), expected))


class Gensim320Test(unittest.TestCase):
def test(self):
Expand Down