Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Properly handle unicode_errors arg parameter when loading a vocab file #2570

Merged
merged 6 commits into from
Aug 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion gensim/models/utils_any2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8'
counts = {}
with utils.open(fvocab, 'rb') as fin:
for line in fin:
word, count = utils.to_unicode(line).strip().split()
word, count = utils.to_unicode(line, errors=unicode_errors).strip().split()
counts[word] = int(count)

logger.info("loading projection weights from %s", fname)
Expand Down
3 changes: 3 additions & 0 deletions gensim/test/test_data/w2v_keyedvectors_load_test.modeldata
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
2 3
ありがとう� 0.6 0.6 0.6
どういたしまして� 0.1 0.2 0.3
2 changes: 2 additions & 0 deletions gensim/test/test_data/w2v_keyedvectors_load_test.vocab
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ありがとう� 123
どういたしまして� 789
39 changes: 39 additions & 0 deletions gensim/test/test_keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,45 @@ def test(self):
self.assertTrue(vectors.word_vec('computer') is not None)


class Word2VecKeyedVectorsTest(unittest.TestCase):
    """Check that `unicode_errors` is applied when loading a word2vec text file with a vocab file.

    Both fixture files contain words with decoding errors, so each test
    exercises one value of the `unicode_errors` argument.
    """

    def setUp(self):
        self.model_path = datapath("w2v_keyedvectors_load_test.modeldata")
        self.vocab_path = datapath("w2v_keyedvectors_load_test.vocab")

    def _load(self, unicode_errors):
        """Load the fixture vectors together with the fixture vocab using the given error mode."""
        return gensim.models.KeyedVectors.load_word2vec_format(
            self.model_path, fvocab=self.vocab_path, binary=False, unicode_errors=unicode_errors)

    def test_load_model_and_vocab_file_strict(self):
        """Test loading model and vocab files which have decoding errors: strict mode"""
        with self.assertRaises(UnicodeDecodeError):
            self._load("strict")

    def test_load_model_and_vocab_file_replace(self):
        """Test loading model and vocab files which have decoding errors: replace mode"""
        model = self._load("replace")
        # In replace mode the keys are expected to end with the replacement character.
        self.assertEqual(model.vocab[u'ありがとう�'].count, 123)
        self.assertEqual(model.vocab[u'どういたしまして�'].count, 789)
        self.assertEqual(model.vocab[u'ありがとう�'].index, 0)
        self.assertEqual(model.vocab[u'どういたしまして�'].index, 1)
        self.assertTrue(np.array_equal(
            model.get_vector(u'ありがとう�'), np.array([.6, .6, .6], dtype=np.float32)))
        self.assertTrue(np.array_equal(
            model.get_vector(u'どういたしまして�'), np.array([.1, .2, .3], dtype=np.float32)))

    def test_load_model_and_vocab_file_ignore(self):
        """Test loading model and vocab files which have decoding errors: ignore mode"""
        model = self._load("ignore")
        # In ignore mode the keys are expected to carry no trace of the undecodable input.
        self.assertEqual(model.vocab[u'ありがとう'].count, 123)
        self.assertEqual(model.vocab[u'どういたしまして'].count, 789)
        self.assertEqual(model.vocab[u'ありがとう'].index, 0)
        self.assertEqual(model.vocab[u'どういたしまして'].index, 1)
        self.assertTrue(np.array_equal(
            model.get_vector(u'ありがとう'), np.array([.6, .6, .6], dtype=np.float32)))
        self.assertTrue(np.array_equal(
            model.get_vector(u'どういたしまして'), np.array([.1, .2, .3], dtype=np.float32)))


if __name__ == '__main__':
    # Turn on verbose logging before handing control to the unittest runner.
    log_format = '%(asctime)s : %(levelname)s : %(message)s'
    logging.basicConfig(format=log_format, level=logging.DEBUG)
    unittest.main()