piskvorky · mpenkov · Aug 26, 2019 · Aug 7, 2019 · Aug 22, 2019 · Aug 22, 2019
diff --git a/gensim/models/utils_any2vec.py b/gensim/models/utils_any2vec.py
@@ -335,7 +335,7 @@ def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8'
         counts = {}
         with utils.open(fvocab, 'rb') as fin:
             for line in fin:
-                word, count = utils.to_unicode(line).strip().split()
+                word, count = utils.to_unicode(line, errors=unicode_errors).strip().split()
                 counts[word] = int(count)
 
     logger.info("loading projection weights from %s", fname)

diff --git a/gensim/test/test_data/w2v_keyedvectors_load_test.modeldata b/gensim/test/test_data/w2v_keyedvectors_load_test.modeldata
@@ -0,0 +1,3 @@
+2 3
+ありがとう� 0.6 0.6 0.6
+どういたしまして� 0.1 0.2 0.3
diff --git a/gensim/test/test_data/w2v_keyedvectors_load_test.vocab b/gensim/test/test_data/w2v_keyedvectors_load_test.vocab
@@ -0,0 +1,2 @@
+ありがとう� 123
+どういたしまして� 789
diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py
@@ -361,6 +361,45 @@ def test(self):
         self.assertTrue(vectors.word_vec('computer') is not None)
 
 
+class Word2VecKeyedVectorsTest(unittest.TestCase):
+    def setUp(self):
+        self.model_path = datapath("w2v_keyedvectors_load_test.modeldata")
+        self.vocab_path = datapath("w2v_keyedvectors_load_test.vocab")
+
+    def test_load_model_and_vocab_file_strict(self):
+        """Test loading model and vocab files which have decoding errors: strict mode"""
+        with self.assertRaises(UnicodeDecodeError):
+            gensim.models.KeyedVectors.load_word2vec_format(
+                self.model_path, fvocab=self.vocab_path, binary=False, unicode_errors="strict")
+
+    def test_load_model_and_vocab_file_replace(self):
+        """Test loading model and vocab files which have decoding errors: replace mode"""
+        model = gensim.models.KeyedVectors.load_word2vec_format(
+            self.model_path, fvocab=self.vocab_path, binary=False, unicode_errors="replace")
+        self.assertEqual(model.vocab[u'ありがとう�'].count, 123)
+        self.assertEqual(model.vocab[u'どういたしまして�'].count, 789)
+        self.assertEqual(model.vocab[u'ありがとう�'].index, 0)
+        self.assertEqual(model.vocab[u'どういたしまして�'].index, 1)
+        self.assertTrue(np.array_equal(
+            model.get_vector(u'ありがとう�'), np.array([.6, .6, .6], dtype=np.float32)))
+        self.assertTrue(np.array_equal(
+            model.get_vector(u'どういたしまして�'), np.array([.1, .2, .3], dtype=np.float32)))
+
+    def test_load_model_and_vocab_file_ignore(self):
+        """Test loading model and vocab files which have decoding errors: ignore mode"""
+        model = gensim.models.KeyedVectors.load_word2vec_format(
+            self.model_path, fvocab=self.vocab_path, binary=False, unicode_errors="ignore")
+        print(model.vocab.keys())
+        self.assertEqual(model.vocab[u'ありがとう'].count, 123)
+        self.assertEqual(model.vocab[u'どういたしまして'].count, 789)
+        self.assertEqual(model.vocab[u'ありがとう'].index, 0)
+        self.assertEqual(model.vocab[u'どういたしまして'].index, 1)
+        self.assertTrue(np.array_equal(
+            model.get_vector(u'ありがとう'), np.array([.6, .6, .6], dtype=np.float32)))
+        self.assertTrue(np.array_equal(
+            model.get_vector(u'どういたしまして'), np.array([.1, .2, .3], dtype=np.float32)))
+
+
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		ありがとう� 123
		どういたしまして� 789