From 6a94492cfb4f1454c692fa7b4fa5ba2944853d5f Mon Sep 17 00:00:00 2001
From: Mathis <mathis.demay@protonmail.com>
Date: Tue, 18 May 2021 10:43:24 +0200
Subject: [PATCH 1/8] [Fix] gensim/models/word2vec.py: in method
 predict_output_word, changed a call to sum to numpy.sum to gain performance.

---
 gensim/models/word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 2593a373b0..b19f23c97e 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1830,7 +1830,7 @@ def predict_output_word(self, context_words_list, topn=10):
 
         # propagate hidden -> output and take softmax to get probabilities
         prob_values = np.exp(np.dot(l1, self.syn1neg.T))
-        prob_values /= sum(prob_values)
+        prob_values /= np.sum(prob_values)
         top_indices = matutils.argsort(prob_values, topn=topn, reverse=True)
         # returning the most probable output words with their probabilities
         return [(self.wv.index_to_key[index1], prob_values[index1]) for index1 in top_indices]

From 91d5dca174808765a0cc929ce2cc6d4e693529a2 Mon Sep 17 00:00:00 2001
From: Mathis <mathis.demay@protonmail.com>
Date: Tue, 18 May 2021 16:05:54 +0200
Subject: [PATCH 2/8] [Feat]
 gensim.models.word2vec.Word2Vec.predict_output_word: added possibility for
 the user to input a list of word indices as parameter 'context' instead of a
 list of words.

---
 gensim/models/word2vec.py | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index b19f23c97e..c363ac42fa 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1799,8 +1799,9 @@ def predict_output_word(self, context_words_list, topn=10):
 
         Parameters
         ----------
-        context_words_list : list of str
-            List of context words.
+        context_words_list : list of str OR list of int
+            If list of str: List of context words.
+            If list of int: List of indices of context words in `self.wv.vectors`
         topn : int, optional
             Return `topn` words and their probabilities.
 
@@ -1819,13 +1820,26 @@ def predict_output_word(self, context_words_list, topn=10):
         if not hasattr(self.wv, 'vectors') or not hasattr(self, 'syn1neg'):
             raise RuntimeError("Parameters required for predicting the output words not found.")
 
-        word2_indices = [self.wv.get_index(w) for w in context_words_list if w in self.wv]
-        if not word2_indices:
-            logger.warning("All the input context words are out-of-vocabulary for the current model.")
-            return None
+        if all(isinstance(w, int) for w in context_words_list):
+            # then, indices were passed. Check they are valid
+            word2_indices = np.array(context_words_list)
+            if np.any(word2_indices < 0):
+                logger.warning("All input context word indices must be non-negative.")
+                return None
+            # take only the ones in the vocabulary
+            word2_indices = word2_indices[word2_indices < self.wv.vectors.shape[0]]
+            if word2_indices.size == 0:
+                logger.warning("All the input context words are out-of-vocabulary for the current model.")
+                return None
+        else:
+            # then, words were passed. Retrieve their indices
+            word2_indices = [self.wv.get_index(w) for w in context_words_list if w in self.wv]
+            if not word2_indices:
+                logger.warning("All the input context words are out-of-vocabulary for the current model.")
+                return None
 
         l1 = np.sum(self.wv.vectors[word2_indices], axis=0)
-        if word2_indices and self.cbow_mean:
+        if self.cbow_mean:
             l1 /= len(word2_indices)
 
         # propagate hidden -> output and take softmax to get probabilities

From 84258b4333ee0293e960d81873dc505eafaba1cc Mon Sep 17 00:00:00 2001
From: M-Demay <55137190+M-Demay@users.noreply.github.com>
Date: Tue, 25 May 2021 10:26:29 +0200
Subject: [PATCH 3/8] Word2Vec.predict_output_word: Changed handling of ints
 and strs, trying to make it more compact and versatile.

---
 gensim/models/word2vec.py | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index c363ac42fa..9fb0a4a9bb 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1819,24 +1819,20 @@ def predict_output_word(self, context_words_list, topn=10):
 
         if not hasattr(self.wv, 'vectors') or not hasattr(self, 'syn1neg'):
             raise RuntimeError("Parameters required for predicting the output words not found.")
-
-        if all(isinstance(w, int) for w in context_words_list):
-            # then, indices were passed. Check they are valid
-            word2_indices = np.array(context_words_list)
-            if np.any(word2_indices < 0):
-                logger.warning("All input context word indices must be non-negative.")
-                return None
-            # take only the ones in the vocabulary
-            word2_indices = word2_indices[word2_indices < self.wv.vectors.shape[0]]
-            if word2_indices.size == 0:
-                logger.warning("All the input context words are out-of-vocabulary for the current model.")
-                return None
-        else:
-            # then, words were passed. Retrieve their indices
-            word2_indices = [self.wv.get_index(w) for w in context_words_list if w in self.wv]
-            if not word2_indices:
-                logger.warning("All the input context words are out-of-vocabulary for the current model.")
-                return None
+        
+        # Retrieve indices if words were passed as input, otherwise keep the input indices
+        # Remark : out-of-vocabulary words are discarded.
+        word2_indices = []
+        max_index = self.wv.vectors.shape[0]
+        for w in context_words_list:
+            if w in self.wv:
+                word2_indices.append(self.wv.get_index(w))
+            elif isinstance(w, int) and (w < max_index):
+                word2_indices.append(w)
+
+        if not word2_indices:
+            logger.warning("All the input context words are out-of-vocabulary for the current model.")
+            return None
 
         l1 = np.sum(self.wv.vectors[word2_indices], axis=0)
         if self.cbow_mean:

From 47a7fe51db778d7a21ba141f3ee11e4608b25333 Mon Sep 17 00:00:00 2001
From: Mathis <mathis.demay@protonmail.com>
Date: Wed, 26 May 2021 10:29:59 +0200
Subject: [PATCH 4/8] Fixed docstring of predict_output_word.

---
 gensim/models/word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 9fb0a4a9bb..3af14eec93 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1799,7 +1799,7 @@ def predict_output_word(self, context_words_list, topn=10):
 
         Parameters
         ----------
-        context_words_list : list of str OR list of int
+        context_words_list : list of (str OR int)
             If list of str: List of context words.
             If list of int: List of indices of context words in `self.wv.vectors`
         topn : int, optional

From ba00d9884e75ecb84c41b15c24a94d2c3910624d Mon Sep 17 00:00:00 2001
From: Paul Andrey <paul.andrey@hotmail.fr>
Date: Tue, 15 Jun 2021 23:57:15 +0200
Subject: [PATCH 5/8] Simplified `predict_output_word` changes.

* Retained the suggested `sum`->`np.sum`
  replacement, which has been tested to
  yield significant runtime gains.
* Dropped unnecessary type/value checks
  that are already run when calling the
  `KeyedVectors.__contains__` dunder method.
* Corrected the docstring to accurately
  document the supported inputs (which
  were already compatible prior to the
  PR this commit is a part of).
---
 gensim/models/word2vec.py | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 3af14eec93..f9bb39a512 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1799,9 +1799,9 @@ def predict_output_word(self, context_words_list, topn=10):
 
         Parameters
         ----------
-        context_words_list : list of (str OR int)
-            If list of str: List of context words.
-            If list of int: List of indices of context words in `self.wv.vectors`
+        context_words_list : list of (str and/or int)
+            List of context words, which may be words themselves (str)
+            or their index in `self.wv.vectors` (int).
         topn : int, optional
             Return `topn` words and their probabilities.
 
@@ -1819,23 +1819,14 @@ def predict_output_word(self, context_words_list, topn=10):
 
         if not hasattr(self.wv, 'vectors') or not hasattr(self, 'syn1neg'):
             raise RuntimeError("Parameters required for predicting the output words not found.")
-        
-        # Retrieve indices if words were passed as input, otherwise keep the input indices
-        # Remark : out-of-vocabulary words are discarded.
-        word2_indices = []
-        max_index = self.wv.vectors.shape[0]
-        for w in context_words_list:
-            if w in self.wv:
-                word2_indices.append(self.wv.get_index(w))
-            elif isinstance(w, int) and (w < max_index):
-                word2_indices.append(w)
+        word2_indices = [self.wv.get_index(w) for w in context_words_list if w in self.wv]
 
         if not word2_indices:
             logger.warning("All the input context words are out-of-vocabulary for the current model.")
             return None
 
         l1 = np.sum(self.wv.vectors[word2_indices], axis=0)
-        if self.cbow_mean:
+        if word2_indices and self.cbow_mean:
             l1 /= len(word2_indices)
 
         # propagate hidden -> output and take softmax to get probabilities

From 679a0866cab3f4c212cbb98219444bf8b0acde1d Mon Sep 17 00:00:00 2001
From: Mathis Demay <mathis.demay.etu@univ-lille.fr>
Date: Thu, 1 Jul 2021 09:58:22 +0200
Subject: [PATCH 6/8] Added tests for gensim.Word2Vec.predict_output_word()
 when context contains ints.

---
 gensim/test/test_word2vec.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index d46b2f3e37..5cc9bb238c 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -843,6 +843,16 @@ def test_predict_output_word(self):
         model_without_neg = word2vec.Word2Vec(sentences, min_count=1, negative=0)
         self.assertRaises(RuntimeError, model_without_neg.predict_output_word, ['system', 'human'])
 
+        # passing indices instead of words in context
+        str_context = ['system', 'human']
+        mixed_context = [model_with_neg.wv.get_index(str_context[0]), str_context[1]]
+        idx_context = [model_with_neg.wv.get_index(w) for w in str_context]
+        prediction_from_str = model_with_neg.predict_output_word(str_context, topn=5)
+        prediction_from_mixed = model_with_neg.predict_output_word(mixed_context, topn=5)
+        prediction_from_idx = model_with_neg.predict_output_word(idx_context, topn=5)
+        self.assertEqual(prediction_from_str, prediction_from_mixed)
+        self.assertEqual(prediction_from_str, prediction_from_idx)
+
     def test_load_old_model(self):
         """Test loading an old word2vec model of indeterminate version"""
 

From ed8ec363c4c729de9a0071c9b59230b573bd6495 Mon Sep 17 00:00:00 2001
From: Michael Penkov <m@penkov.dev>
Date: Mon, 19 Jul 2021 14:56:14 +0900
Subject: [PATCH 7/8] Update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 507d93c380..ed68783eee 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ Changes
 * [#3115](https://github.com/RaRe-Technologies/gensim/pull/3115): Make LSI dispatcher CLI param for number of jobs optional, by [@robguinness](https://github.com/robguinness)
 * [#3128](https://github.com/RaRe-Technologies/gensim/pull/3128): Materialize and copy the corpus passed to SoftCosineSimilarity, by [@Witiko](https://github.com/Witiko)
 * [#3131](https://github.com/RaRe-Technologies/gensim/pull/3131): Added import to Nmf docs, and to models/__init__.py, by [@properGrammar](https://github.com/properGrammar)
+* [#3153](https://github.com/RaRe-Technologies/gensim/pull/3153): Vectorize word2vec.predict_output_word for speed, by [@M-Demay](https://github.com/M-Demay)
 
 ### :books: Documentation
 

From 04cc1375406333467034f9f83e270d6c075cc176 Mon Sep 17 00:00:00 2001
From: Michael Penkov <m@penkov.dev>
Date: Mon, 19 Jul 2021 14:59:58 +0900
Subject: [PATCH 8/8] update sbt install step

---
 .github/workflows/tests.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 8e3ad48871..41a608ef90 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -39,7 +39,8 @@ jobs:
       #
       - name: Update sbt
         run: |
-          echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list
+          echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | sudo tee /etc/apt/sources.list.d/sbt.list
+          echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | sudo tee /etc/apt/sources.list.d/sbt_old.list
           curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo apt-key add
           sudo apt-get update -y
           sudo apt-get install -y sbt