diff --git a/CHANGELOG.md b/CHANGELOG.md index 70e0f0e7ed..c443c05393 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ Changes * Remove ShardedCorpus from init because of Theano dependency (@tmylk, [#919](https://github.com/RaRe-Technologies/gensim/pull/919)) * Documentation improvements ( @dsquareindia & @tmylk, [#914](https://github.com/RaRe-Technologies/gensim/pull/914), [#906](https://github.com/RaRe-Technologies/gensim/pull/906) ) * Add Annoy memory-mapping example (@harshul1610, [#899](https://github.com/RaRe-Technologies/gensim/pull/899)) +* Fix issue [#601](https://github.com/RaRe-Technologies/gensim/issues/601): return the correct docIDs from `most_similar` when a clip range is given (@parulsethi, [#994](https://github.com/RaRe-Technologies/gensim/pull/994)) 0.13.2, 2016-08-19 diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index c9f39f3299..49c2774a87 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -460,7 +460,7 @@ def most_similar(self, positive=[], negative=[], topn=10, clip_start=0, clip_end return dists best = matutils.argsort(dists, topn=topn + len(all_docs), reverse=True) # ignore (don't return) docs from the input - result = [(self.index_to_doctag(sim), float(dists[sim])) for sim in best if sim not in all_docs] + result = [(self.index_to_doctag(sim + clip_start), float(dists[sim])) for sim in best if (sim + clip_start) not in all_docs] return result[:topn] def doesnt_match(self, docs): diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index 42264c0b4b..a695b1a724 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -8,6 +8,7 @@ Automated tests for checking transformation algorithms (the models package). 
""" + from __future__ import with_statement import logging @@ -159,6 +160,12 @@ def model_sanity(self, model): self.assertEqual(list(zip(*sims))[0], list(zip(*sims2))[0]) # same doc ids self.assertTrue(np.allclose(list(zip(*sims))[1], list(zip(*sims2))[1])) # close-enough dists + # sim results should be in clip range if given + clip_sims = model.docvecs.most_similar(fire1, clip_start=len(model.docvecs) // 2, clip_end=len(model.docvecs) * 2 // 3) + sims_doc_id = [docid for docid, sim in clip_sims] + for s_id in sims_doc_id: + self.assertTrue(len(model.docvecs) // 2 <= s_id <= len(model.docvecs) * 2 // 3) + # tennis doc should be out-of-place among fire news self.assertEqual(model.docvecs.doesnt_match([fire1, tennis1, fire2]), tennis1)