Skip to content

Commit

Permalink
Merge pull request #582 from piskvorky/docvecs_keyerror
Browse files: browse the repository at this point in the history
fix for #520: raise KeyError when no matching doctag
  • Loading branch information
gojomo committed Jan 16, 2016
2 parents f267abf + 6f91668 commit 1ab5df2
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Changes
* Better internal handling of job batching in word2vec (#535)
- up to 300% speed up when training on very short documents (~tweets)
* Word2vec allows non-strict unicode error handling (ignore or replace) (Gordon Mohr, #466)
* Doc2Vec `model.docvecs[key]` now raises KeyError for unknown keys (Gordon Mohr, #520)
* Fix `DocvecsArray.index_to_doctag` so `most_similar()` returns string doctags (Gordon Mohr, #560)
* On-demand loading of the `pattern` library in utils.lemmatize (Jan Zikes, #461)
- `utils.HAS_PATTERN` flag moved to `utils.has_pattern()`
Expand Down
6 changes: 3 additions & 3 deletions gensim/models/doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,20 +297,20 @@ def note_doctag(self, key, document_no, document_length):

def indexed_doctags(self, doctag_tokens):
"""Return indexes and backing-arrays used in training examples."""
return ([i for i in [self._int_index(index, -1) for index in doctag_tokens] if i > -1],
return ([self._int_index(index) for index in doctag_tokens if index in self],
self.doctag_syn0, self.doctag_syn0_lockf, doctag_tokens)

def trained_item(self, indexed_tuple):
"""Persist any changes made to the given indexes (matching tuple previously
returned by indexed_doctags()); a no-op for this implementation"""
pass

def _int_index(self, index, missing=None):
def _int_index(self, index):
"""Return int index for either string or int index"""
if isinstance(index, int):
return index
else:
return self.max_rawint + 1 + self.doctags[index].offset if index in self.doctags else missing
return self.max_rawint + 1 + self.doctags[index].offset

def _key_index(self, i_index, missing=None):
"""Return string index for given int index, if available"""
Expand Down
10 changes: 10 additions & 0 deletions gensim/test/test_doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,16 @@ def test_int_doctags(self):
self.assertEqual(model.docvecs[0].shape, (300,))
self.assertRaises(KeyError, model.__getitem__, '_*0')

def test_missing_string_doctag(self):
"""Test that looking up an unknown string doctag in docvecs raises KeyError"""
corpus = list(DocsLeeCorpus(True))
# force duplicated tags
corpus = corpus[0:10] + corpus

model = doc2vec.Doc2Vec(min_count=1)
model.build_vocab(corpus)
self.assertRaises(KeyError, model.docvecs.__getitem__, 'not_a_tag')

def test_string_doctags(self):
"""Test doc2vec doctag alternatives"""
corpus = list(DocsLeeCorpus(True))
Expand Down

0 comments on commit 1ab5df2

Please sign in to comment.