From 6a94492cfb4f1454c692fa7b4fa5ba2944853d5f Mon Sep 17 00:00:00 2001 From: Mathis Date: Tue, 18 May 2021 10:43:24 +0200 Subject: [PATCH 1/8] [Fix] gensim/models/word2vec.py: in method predict_output_word, changed a call to sum to numpy.sum to gain performance. --- gensim/models/word2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 2593a373b0..b19f23c97e 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1830,7 +1830,7 @@ def predict_output_word(self, context_words_list, topn=10): # propagate hidden -> output and take softmax to get probabilities prob_values = np.exp(np.dot(l1, self.syn1neg.T)) - prob_values /= sum(prob_values) + prob_values /= np.sum(prob_values) top_indices = matutils.argsort(prob_values, topn=topn, reverse=True) # returning the most probable output words with their probabilities return [(self.wv.index_to_key[index1], prob_values[index1]) for index1 in top_indices] From 91d5dca174808765a0cc929ce2cc6d4e693529a2 Mon Sep 17 00:00:00 2001 From: Mathis Date: Tue, 18 May 2021 16:05:54 +0200 Subject: [PATCH 2/8] [Feat] gensim.models.word2vec.Word2Vec.predict_output_word: added possibility for the user to input a list of word indices as parameter 'context' instead of a list of words. --- gensim/models/word2vec.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index b19f23c97e..c363ac42fa 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1799,8 +1799,9 @@ def predict_output_word(self, context_words_list, topn=10): Parameters ---------- - context_words_list : list of str - List of context words. + context_words_list : list of str OR list of int + If list of str: List of context words. + If list of int: List of indices of context words in `self.wv.vectors` topn : int, optional Return `topn` words and their probabilities. 
@@ -1819,13 +1820,26 @@ def predict_output_word(self, context_words_list, topn=10): if not hasattr(self.wv, 'vectors') or not hasattr(self, 'syn1neg'): raise RuntimeError("Parameters required for predicting the output words not found.") - word2_indices = [self.wv.get_index(w) for w in context_words_list if w in self.wv] - if not word2_indices: - logger.warning("All the input context words are out-of-vocabulary for the current model.") - return None + if all(isinstance(w, int) for w in context_words_list): + # then, indices were passed. Check they are valid + word2_indices = np.array(context_words_list) + if np.any(word2_indices < 0): + logger.warning("All input context word indices must be non-negative.") + return None + # take only the ones in the vocabulary + word2_indices = word2_indices[word2_indices < self.wv.vectors.shape[0]] + if word2_indices.size == 0: + logger.warning("All the input context words are out-of-vocabulary for the current model.") + return None + else: + # then, words were passed. Retrieve their indices + word2_indices = [self.wv.get_index(w) for w in context_words_list if w in self.wv] + if not word2_indices: + logger.warning("All the input context words are out-of-vocabulary for the current model.") + return None l1 = np.sum(self.wv.vectors[word2_indices], axis=0) - if word2_indices and self.cbow_mean: + if self.cbow_mean: l1 /= len(word2_indices) # propagate hidden -> output and take softmax to get probabilities From 84258b4333ee0293e960d81873dc505eafaba1cc Mon Sep 17 00:00:00 2001 From: M-Demay <55137190+M-Demay@users.noreply.github.com> Date: Tue, 25 May 2021 10:26:29 +0200 Subject: [PATCH 3/8] Word2Vec.predict_output_word: Changed handling of ints and strs, trying to make it more compact and versatile. 
--- gensim/models/word2vec.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index c363ac42fa..9fb0a4a9bb 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1819,24 +1819,20 @@ def predict_output_word(self, context_words_list, topn=10): if not hasattr(self.wv, 'vectors') or not hasattr(self, 'syn1neg'): raise RuntimeError("Parameters required for predicting the output words not found.") - - if all(isinstance(w, int) for w in context_words_list): - # then, indices were passed. Check they are valid - word2_indices = np.array(context_words_list) - if np.any(word2_indices < 0): - logger.warning("All input context word indices must be non-negative.") - return None - # take only the ones in the vocabulary - word2_indices = word2_indices[word2_indices < self.wv.vectors.shape[0]] - if word2_indices.size == 0: - logger.warning("All the input context words are out-of-vocabulary for the current model.") - return None - else: - # then, words were passed. Retrieve their indices - word2_indices = [self.wv.get_index(w) for w in context_words_list if w in self.wv] - if not word2_indices: - logger.warning("All the input context words are out-of-vocabulary for the current model.") - return None + + # Retrieve indices if words were passed as input, otherwise keep the input indices + # Remark : out-of-vocabulary words are discarded. 
+ word2_indices = [] + max_index = self.wv.vectors.shape[0] + for w in context_words_list: + if w in self.wv: + word2_indices.append(self.wv.get_index(w)) + elif isinstance(w, int) and (w < max_index): + word2_indices.append(w) + + if not word2_indices: + logger.warning("All the input context words are out-of-vocabulary for the current model.") + return None l1 = np.sum(self.wv.vectors[word2_indices], axis=0) if self.cbow_mean: From 47a7fe51db778d7a21ba141f3ee11e4608b25333 Mon Sep 17 00:00:00 2001 From: Mathis Date: Wed, 26 May 2021 10:29:59 +0200 Subject: [PATCH 4/8] Fixed docstring of predict_output_word. --- gensim/models/word2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 9fb0a4a9bb..3af14eec93 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1799,7 +1799,7 @@ def predict_output_word(self, context_words_list, topn=10): Parameters ---------- - context_words_list : list of str OR list of int + context_words_list : list of (str OR int) If list of str: List of context words. If list of int: List of indices of context words in `self.wv.vectors` topn : int, optional From ba00d9884e75ecb84c41b15c24a94d2c3910624d Mon Sep 17 00:00:00 2001 From: Paul Andrey Date: Tue, 15 Jun 2021 23:57:15 +0200 Subject: [PATCH 5/8] Simplified `predict_output_word` changes. * Retained the suggested `sum`->`np.sum` replacement, which has been tested to yield significant runtime gains. * Dropped unnecessary type/value checks that are already run when calling the `KeyedVectors.__contains__` dunder method. * Corrected the docstring to accurately document the supported inputs (which were already compatible prior to the PR this commit is a part of). 
--- gensim/models/word2vec.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 3af14eec93..f9bb39a512 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1799,9 +1799,9 @@ def predict_output_word(self, context_words_list, topn=10): Parameters ---------- - context_words_list : list of (str OR int) - If list of str: List of context words. - If list of int: List of indices of context words in `self.wv.vectors` + context_words_list : list of (str and/or int) + List of context words, which may be words themselves (str) + or their index in `self.wv.vectors` (int). topn : int, optional Return `topn` words and their probabilities. @@ -1819,23 +1819,14 @@ def predict_output_word(self, context_words_list, topn=10): if not hasattr(self.wv, 'vectors') or not hasattr(self, 'syn1neg'): raise RuntimeError("Parameters required for predicting the output words not found.") - - # Retrieve indices if words were passed as input, otherwise keep the input indices - # Remark : out-of-vocabulary words are discarded. - word2_indices = [] - max_index = self.wv.vectors.shape[0] - for w in context_words_list: - if w in self.wv: - word2_indices.append(self.wv.get_index(w)) - elif isinstance(w, int) and (w < max_index): - word2_indices.append(w) + word2_indices = [self.wv.get_index(w) for w in context_words_list if w in self.wv] if not word2_indices: logger.warning("All the input context words are out-of-vocabulary for the current model.") return None l1 = np.sum(self.wv.vectors[word2_indices], axis=0) - if self.cbow_mean: + if word2_indices and self.cbow_mean: l1 /= len(word2_indices) # propagate hidden -> output and take softmax to get probabilities From 679a0866cab3f4c212cbb98219444bf8b0acde1d Mon Sep 17 00:00:00 2001 From: Mathis Demay Date: Thu, 1 Jul 2021 09:58:22 +0200 Subject: [PATCH 6/8] Added tests for gensim.Word2Vec.predict_output_word() when context contains ints. 
--- gensim/test/test_word2vec.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index d46b2f3e37..5cc9bb238c 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -843,6 +843,16 @@ def test_predict_output_word(self): model_without_neg = word2vec.Word2Vec(sentences, min_count=1, negative=0) self.assertRaises(RuntimeError, model_without_neg.predict_output_word, ['system', 'human']) + # passing indices instead of words in context + str_context = ['system', 'human'] + mixed_context = [model_with_neg.wv.get_index(str_context[0]), str_context[1]] + idx_context = [model_with_neg.wv.get_index(w) for w in str_context] + prediction_from_str = model_with_neg.predict_output_word(str_context, topn=5) + prediction_from_mixed = model_with_neg.predict_output_word(mixed_context, topn=5) + prediction_from_idx = model_with_neg.predict_output_word(idx_context, topn=5) + self.assertEqual(prediction_from_str, prediction_from_mixed) + self.assertEqual(prediction_from_str, prediction_from_idx) + def test_load_old_model(self): """Test loading an old word2vec model of indeterminate version""" From ed8ec363c4c729de9a0071c9b59230b573bd6495 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Mon, 19 Jul 2021 14:56:14 +0900 Subject: [PATCH 7/8] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 507d93c380..ed68783eee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Changes * [#3115](https://github.com/RaRe-Technologies/gensim/pull/3115): Make LSI dispatcher CLI param for number of jobs optional, by [@robguinness](https://github.com/robguinness) * [#3128](https://github.com/RaRe-Technologies/gensim/pull/3128): Materialize and copy the corpus passed to SoftCosineSimilarity, by [@Witiko](https://github.com/Witiko) * [#3131](https://github.com/RaRe-Technologies/gensim/pull/3131): Added import to Nmf docs, and to 
models/__init__.py, by [@properGrammar](https://github.com/properGrammar) +* [#3153](https://github.com/RaRe-Technologies/gensim/pull/3153): Vectorize word2vec.predict_output_word for speed, by [@M-Demay](https://github.com/M-Demay) ### :books: Documentation From 04cc1375406333467034f9f83e270d6c075cc176 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Mon, 19 Jul 2021 14:59:58 +0900 Subject: [PATCH 8/8] update sbt install step --- .github/workflows/tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8e3ad48871..41a608ef90 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -39,7 +39,8 @@ jobs: # - name: Update sbt run: | - echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list + echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | sudo tee /etc/apt/sources.list.d/sbt.list + echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | sudo tee /etc/apt/sources.list.d/sbt_old.list curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo apt-key add sudo apt-get update -y sudo apt-get install -y sbt