From b4915c030cf1e0adb00fd50849e70ad92a2add1f Mon Sep 17 00:00:00 2001 From: Simon Wiles Date: Mon, 16 Nov 2020 22:59:06 -0800 Subject: [PATCH 01/12] Allow supplying a string-key as the negative arg. to most_similar() --- gensim/models/keyedvectors.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 193c5f8f0f..da5ec5e687 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -746,6 +746,9 @@ def most_similar( # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) positive = [positive] + if isinstance(negative, KEY_TYPES) and not positive: + negative = [negative] + # add weights for each key, if not already present; default to 1.0 for positive and -1.0 for negative keys positive = [ (item, 1.0) if isinstance(item, KEY_TYPES + (ndarray,)) From ace83215e75e78a26b6e58814626de76657044ea Mon Sep 17 00:00:00 2001 From: Simon Wiles Date: Mon, 16 Nov 2020 23:44:06 -0800 Subject: [PATCH 02/12] Allow a single vector as a positive or negative arg. to most_similar() --- gensim/models/keyedvectors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index da5ec5e687..ecb18dfa92 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -742,11 +742,11 @@ def most_similar( clip_start = 0 clip_end = restrict_vocab - if isinstance(positive, KEY_TYPES) and not negative: + if isinstance(positive, KEY_TYPES + (ndarray,)) and not negative: # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) positive = [positive] - if isinstance(negative, KEY_TYPES) and not positive: + if isinstance(negative, KEY_TYPES + (ndarray,)) and not positive: negative = [negative] # add weights for each key, if not already present; default to 1.0 for positive and -1.0 for negative keys From 15bfd1ba2154a1cca9a57d9348d89e727ac77992 Mon Sep 17 00:00:00 2001 From: Simon Wiles Date: Mon, 16 Nov 2020 23:50:01 -0800 Subject: [PATCH 03/12] Update comments --- gensim/models/keyedvectors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index ecb18dfa92..b8efd9679f 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -743,10 +743,11 @@ def most_similar( clip_end = restrict_vocab if isinstance(positive, KEY_TYPES + (ndarray,)) and not negative: - # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) + # allow passing a single string-key or vector for the positive argument positive = [positive] if isinstance(negative, KEY_TYPES + (ndarray,)) and not positive: + # allow passing a single string-key or vector for the negative argument negative = [negative] # add weights for each key, if not already present; default to 1.0 for positive and -1.0 for negative keys From 475b013df8b3638892694844871976fe957bb78e Mon Sep 17 00:00:00 2001 From: Simon Wiles Date: Mon, 16 Nov 2020 23:51:32 -0800 Subject: [PATCH 04/12] Accept single arguments when positive and negative are both supplied --- gensim/models/keyedvectors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index b8efd9679f..1faaecc3f0 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -742,11 +742,11 @@ def most_similar( clip_start = 0 clip_end = restrict_vocab - if isinstance(positive, KEY_TYPES + (ndarray,)) and not negative: + if isinstance(positive, KEY_TYPES + (ndarray,)): # allow passing a single string-key or vector for the positive argument positive = [positive] - if isinstance(negative, KEY_TYPES + (ndarray,)) and not positive: + if isinstance(negative, KEY_TYPES + (ndarray,)): # allow passing a single string-key or vector for the negative argument negative = [negative] From 571cff9d2d61962bcb566806ad164740b822e744 Mon Sep 17 00:00:00 2001 From: Simon Wiles Date: Mon, 16 Nov 2020 23:54:23 -0800 Subject: [PATCH 05/12] Update most_similar_cosmul to match most_similar I'm not sure if this fully addresses the `# TODO: Update to better match & share code with most_similar()` at line #981 or not, so I've left it in. --- gensim/models/keyedvectors.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 1faaecc3f0..2f78e6dfee 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -989,10 +989,14 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): self.fill_norms() - if isinstance(positive, str) and not negative: - # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog']) + if isinstance(positive, KEY_TYPES + (ndarray,)): + # allow passing a single string-key or vector for the positive argument positive = [positive] + if isinstance(negative, KEY_TYPES + (ndarray,)): + # allow passing a single string-key or vector for the negative argument + negative = [negative] + all_words = { self.get_index(word) for word in positive + negative if not isinstance(word, ndarray) and word in self.key_to_index From 647a2eb323143a66bc246ba3deec86d4617b17a9 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 23 Jul 2021 16:16:41 +0900 Subject: [PATCH 06/12] minor code cleanup --- gensim/models/keyedvectors.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 2f78e6dfee..d0af0716c3 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -189,6 +189,8 @@ KEY_TYPES = (str, int, np.integer) +_EXTENDED_KEY_TYPES = (str, int, np.integer, np.ndarray) + class KeyedVectors(utils.SaveLoad): def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None): @@ -742,21 +744,21 @@ def most_similar( clip_start = 0 clip_end = restrict_vocab - if isinstance(positive, KEY_TYPES + (ndarray,)): + if isinstance(positive, _EXTENDED_KEY_TYPES): # allow passing a single string-key or vector for the positive argument positive = [positive] - if isinstance(negative, KEY_TYPES + (ndarray,)): + if isinstance(negative, _EXTENDED_KEY_TYPES): # allow passing a single string-key or vector for the negative argument negative = [negative] # add weights for each key, if not already present; default to 1.0 for positive and -1.0 for negative keys positive = [ - (item, 1.0) if isinstance(item, KEY_TYPES + (ndarray,)) + (item, 1.0) if isinstance(item, _EXTENDED_KEY_TYPES) else item for item in positive ] negative = [ - (item, -1.0) if isinstance(item, KEY_TYPES + (ndarray,)) + (item, -1.0) if isinstance(item, _EXTENDED_KEY_TYPES) else item for item in negative ] @@ -989,11 +991,11 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): self.fill_norms() - if isinstance(positive, KEY_TYPES + (ndarray,)): + if isinstance(positive, _EXTENDED_KEY_TYPES): # allow passing a single string-key or vector for the positive argument positive = [positive] - if isinstance(negative, KEY_TYPES + (ndarray,)): + if isinstance(negative, _EXTENDED_KEY_TYPES): # allow passing a single string-key or vector for the negative argument negative = [negative] From 54b67098a3023dce4134e01bc11dd842f03b2869 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 23 Jul 2021 16:28:06 +0900 Subject: [PATCH 07/12] add unit tests --- gensim/models/keyedvectors.py | 8 ++++---- gensim/test/test_keyedvectors.py | 35 ++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index d0af0716c3..8bda9437d7 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -754,12 +754,12 @@ def most_similar( # add weights for each key, if not already present; default to 1.0 for positive and -1.0 for negative keys positive = [ - (item, 1.0) if isinstance(item, _EXTENDED_KEY_TYPES) - else item for item in positive + (item, 1.0) if isinstance(item, _EXTENDED_KEY_TYPES) else item + for item in positive ] negative = [ - (item, -1.0) if isinstance(item, _EXTENDED_KEY_TYPES) - else item for item in negative + (item, -1.0) if isinstance(item, _EXTENDED_KEY_TYPES) else item + for item in negative ] # compute the weighted average of all keys diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py index fd96f9f26f..4de14f6c1b 100644 --- a/gensim/test/test_keyedvectors.py +++ b/gensim/test/test_keyedvectors.py @@ -9,6 +9,7 @@ Automated tests for checking the poincare module from the models package. """ +import functools import logging import unittest @@ -39,6 +40,40 @@ def test_most_similar(self): predicted = [result[0] for result in self.vectors.most_similar('war', topn=5)] self.assertEqual(expected, predicted) + def test_most_similar_parameter_types(self): + """Are the positive/negative parameter types are getting interpreted correctly?""" + partial = functools.partial(self.vectors.most_similar, topn=5) + + keyword = partial(positive='war', negative='peace') + position = partial('war', 'peace') + position_list = partial(['war'], ['peace']) + keyword = partial(positive='war', negative='peace') + keyword_list = partial(positive=['war'], negative=['peace']) + + # + # The above calls should all yield identical results. + # + assert position == position_list + assert position == keyword + assert position == keyword_list + + def test_most_similar_cosmul_parameter_types(self): + """Are the positive/negative parameter types are getting interpreted correctly?""" + partial = functools.partial(self.vectors.most_similar_cosmul, topn=5) + + keyword = partial(positive='war', negative='peace') + position = partial('war', 'peace') + position_list = partial(['war'], ['peace']) + keyword = partial(positive='war', negative='peace') + keyword_list = partial(positive=['war'], negative=['peace']) + + # + # The above calls should all yield identical results. + # + assert position == position_list + assert position == keyword + assert position == keyword_list + def test_most_similar_topn(self): """Test most_similar returns correct results when `topn` is specified.""" self.assertEqual(len(self.vectors.most_similar('war', topn=5)), 5) From d67f2258c58637188397bcf340ae7c3fc96e3452 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 23 Jul 2021 16:36:03 +0900 Subject: [PATCH 08/12] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a05a5567a..ad33c647dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ Changes * [#3163](https://github.com/RaRe-Technologies/gensim/pull/3163): Optimize word mover distance (WMD) computation, by [@flowlight0](https://github.com/flowlight0) * [#2965](https://github.com/RaRe-Technologies/gensim/pull/2965): Remove strip_punctuation2 alias of strip_punctuation, by [@sciatro](https://github.com/sciatro) * [#3169](https://github.com/RaRe-Technologies/gensim/pull/3169): Implement `shrink_windows` argument for Word2Vec., by [@M-Demay](https://github.com/M-Demay) +* [#3000](https://github.com/RaRe-Technologies/gensim/pull/3000): Tidy up KeyedVectors.most_similar() API, by [@simonwiles](https://github.com/simonwiles) ### :books: Documentation From bec3c7f2137bd3013ba970514d50e9e8ee29c0cb Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 23 Jul 2021 16:40:02 +0900 Subject: [PATCH 09/12] remove redundant variable declaration --- gensim/test/test_keyedvectors.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py index ed14871a6c..e610f970c4 100644 --- a/gensim/test/test_keyedvectors.py +++ b/gensim/test/test_keyedvectors.py @@ -44,7 +44,6 @@ def test_most_similar_parameter_types(self): """Are the positive/negative parameter types are getting interpreted correctly?""" partial = functools.partial(self.vectors.most_similar, topn=5) - keyword = partial(positive='war', negative='peace') position = partial('war', 'peace') position_list = partial(['war'], ['peace']) keyword = partial(positive='war', negative='peace') @@ -61,7 +60,6 @@ def test_most_similar_cosmul_parameter_types(self): """Are the positive/negative parameter types are getting interpreted correctly?""" partial = functools.partial(self.vectors.most_similar_cosmul, topn=5) - keyword = partial(positive='war', negative='peace') position = partial('war', 'peace') position_list = partial(['war'], ['peace']) keyword = partial(positive='war', negative='peace') From 80364cfbd9c6536cf470a1274c46668afbb2abfc Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 24 Jul 2021 08:02:31 +0900 Subject: [PATCH 10/12] enforce consistency --- gensim/models/keyedvectors.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index eb3e67fdf8..2cd2d82db6 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -189,7 +189,7 @@ logger = logging.getLogger(__name__) -KEY_TYPES = (str, int, np.integer) +_KEY_TYPES = (str, int, np.integer) _EXTENDED_KEY_TYPES = (str, int, np.integer, np.ndarray) @@ -379,7 +379,7 @@ def __getitem__(self, key_or_keys): Vector representation for `key_or_keys` (1D if `key_or_keys` is single key, otherwise - 2D). """ - if isinstance(key_or_keys, KEY_TYPES): + if isinstance(key_or_keys, _KEY_TYPES): return self.get_vector(key_or_keys) return vstack([self.get_vector(key) for key in key_or_keys]) @@ -493,7 +493,7 @@ def add_vectors(self, keys, weights, extras=None, replace=False): if True - replace vectors, otherwise - keep old vectors. """ - if isinstance(keys, KEY_TYPES): + if isinstance(keys, _KEY_TYPES): keys = [keys] weights = np.array(weights).reshape(1, -1) elif isinstance(weights, list): @@ -1111,7 +1111,7 @@ def distances(self, word_or_vector, other_words=()): If either `word_or_vector` or any word in `other_words` is absent from vocab. """ - if isinstance(word_or_vector, KEY_TYPES): + if isinstance(word_or_vector, _KEY_TYPES): input_vector = self.get_vector(word_or_vector) else: input_vector = word_or_vector From 498bbcc700767ce5933f631e98715d09c66fbdf8 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Thu, 12 Aug 2021 09:47:00 +0900 Subject: [PATCH 11/12] respond to review feedback --- gensim/models/keyedvectors.py | 51 ++++++++++++++++---------------- gensim/test/test_keyedvectors.py | 6 ++++ 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 2cd2d82db6..d3be3f6cd8 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -194,6 +194,25 @@ _EXTENDED_KEY_TYPES = (str, int, np.integer, np.ndarray) +def _ensure_list(value): + # + # Ensure that the specified value is a list. Sometimes users pass a single + # value when they should really pass a list containing value. + # + # This is here to make invocation of e.g. most_similar method consistent + # and convenient, and to guarantee backwards compability with older + # versions. See https://github.com/RaRe-Technologies/gensim/pull/3000 + # for the background. + # + if value is None: + return [] + + if isinstance(value, _KEY_TYPES) or (isinstance(value, ndarray) and len(value.shape) == 1): + return [value] + + return value + + class KeyedVectors(utils.SaveLoad): def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None): @@ -731,10 +750,9 @@ def most_similar( if isinstance(topn, Integral) and topn < 1: return [] - if positive is None: - positive = [] - if negative is None: - negative = [] + # allow passing a single string-key or vector for the positive/negative arguments + positive = _ensure_list(positive) + negative = _ensure_list(negative) self.fill_norms() clip_end = clip_end or len(self.vectors) @@ -743,14 +761,6 @@ def most_similar( clip_start = 0 clip_end = restrict_vocab - if isinstance(positive, _EXTENDED_KEY_TYPES): - # allow passing a single string-key or vector for the positive argument - positive = [positive] - - if isinstance(negative, _EXTENDED_KEY_TYPES): - # allow passing a single string-key or vector for the negative argument - negative = [negative] - # add weights for each key, if not already present; default to 1.0 for positive and -1.0 for negative keys positive = [ (item, 1.0) if isinstance(item, _EXTENDED_KEY_TYPES) else item @@ -975,25 +985,16 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): if isinstance(topn, Integral) and topn < 1: return [] - if positive is None: - positive = [] - if negative is None: - negative = [] + # allow passing a single string-key or vector for the positive/negative arguments + positive = _ensure_list(positive) + negative = _ensure_list(negative) self.fill_norms() - if isinstance(positive, _EXTENDED_KEY_TYPES): - # allow passing a single string-key or vector for the positive argument - positive = [positive] - - if isinstance(negative, _EXTENDED_KEY_TYPES): - # allow passing a single string-key or vector for the negative argument - negative = [negative] - all_words = { self.get_index(word) for word in positive + negative if not isinstance(word, ndarray) and word in self.key_to_index - } + } positive = [ self.get_vector(word, norm=True) if isinstance(word, str) else word diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py index e610f970c4..d5eda547ea 100644 --- a/gensim/test/test_keyedvectors.py +++ b/gensim/test/test_keyedvectors.py @@ -40,6 +40,12 @@ def test_most_similar(self): predicted = [result[0] for result in self.vectors.most_similar('war', topn=5)] self.assertEqual(expected, predicted) + def test_most_similar_vector(self): + """Can we pass vectors to most_similar directly?""" + positive = self.vectors.vectors[0:5] + most_similar = self.vectors.most_similar(positive=positive) + assert most_similar is not None + def test_most_similar_parameter_types(self): """Are the positive/negative parameter types are getting interpreted correctly?""" partial = functools.partial(self.vectors.most_similar, topn=5) From 6ead38116cc1277ba5779eebedc368ac13e132bb Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 13 Aug 2021 17:58:33 +0900 Subject: [PATCH 12/12] Update keyedvectors.py --- gensim/models/keyedvectors.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index d3be3f6cd8..b5debb21c1 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -195,15 +195,8 @@ def _ensure_list(value): - # - # Ensure that the specified value is a list. Sometimes users pass a single - # value when they should really pass a list containing value. - # - # This is here to make invocation of e.g. most_similar method consistent - # and convenient, and to guarantee backwards compability with older - # versions. See https://github.com/RaRe-Technologies/gensim/pull/3000 - # for the background. - # + """Ensure that the specified value is wrapped in a list, for those supported cases + where we also accept a single key or vector.""" if value is None: return []