diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py index d8d84131a3..b03376e139 100644 --- a/gensim/models/_fasttext_bin.py +++ b/gensim/models/_fasttext_bin.py @@ -29,12 +29,14 @@ """ +import codecs import collections import io import logging import struct import numpy as np +import six _END_OF_WORD_MARKER = b'\x00' @@ -182,9 +184,9 @@ def _load_vocab(fin, new_format, encoding='utf-8'): try: word = word_bytes.decode(encoding) except UnicodeDecodeError: - word = word_bytes.decode(encoding, errors='ignore') + word = word_bytes.decode(encoding, errors='backslashreplace') logger.error( - 'failed to decode invalid unicode bytes %r; ignoring invalid characters, using %r', + 'failed to decode invalid unicode bytes %r; replacing invalid characters, using %r', word_bytes, word ) count, _ = _struct_unpack(fin, '@qb') @@ -280,3 +282,39 @@ def load(fin, encoding='utf-8', full_model=True): model.update(vectors_ngrams=vectors_ngrams, hidden_output=hidden_output) model = {k: v for k, v in model.items() if k in _FIELD_NAMES} return Model(**model) + + +def _backslashreplace_backport(ex): + """Replace byte sequences that failed to decode with character escapes. + + Does the same thing as errors="backslashreplace" from Python 3. Python 2 + lacks this functionality out of the box, so we need to backport it. + + Parameters + ---------- + ex: UnicodeDecodeError + contains arguments of the string and start/end indexes of the bad portion. + + Returns + ------- + text: unicode + The Unicode string corresponding to the decoding of the bad section. + end: int + The index from which to continue decoding. + + Note + ---- + Works on Py2 only. Py3 already has backslashreplace built-in. + + """ + # + # Based on: + # https://stackoverflow.com/questions/42860186/exact-equivalent-of-b-decodeutf-8-backslashreplace-in-python-2 + # + bstr, start, end = ex.object, ex.start, ex.end + text = u''.join('\\x{:02x}'.format(ord(c)) for c in bstr[start:end]) + return text, end + + +if six.PY2: + codecs.register_error('backslashreplace', _backslashreplace_backport) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index b4cf61abd6..811f7849d0 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -2098,6 +2098,17 @@ def word_vec(self, word, use_norm=False): else: ngram_weights = self.vectors_ngrams ngram_hashes = ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket, self.compatible_hash) + if len(ngram_hashes) == 0: + # + # If it is impossible to extract _any_ ngrams from the input + # word, then the best we can do is return a vector that points + # to the origin. The reference FB implementation does this, + # too. + # + # https://github.com/RaRe-Technologies/gensim/issues/2402 + # + logger.warning('could not extract any ngrams from %r, returning origin vector', word) + return word_vec for nh in ngram_hashes: word_vec += ngram_weights[nh] return word_vec / len(ngram_hashes) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index cba0e8c098..50cacaefdf 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -1119,6 +1119,13 @@ def test_load_native_vectors(self): iv_vector = fbkv['landlady'] self.assertFalse(np.allclose(oov_vector, iv_vector)) + def test_no_ngrams(self): + model = gensim.models.fasttext.load_facebook_model(datapath('crime-and-punishment.bin')) + + v1 = model.wv[''] + origin = np.zeros(v1.shape, v1.dtype) + self.assertTrue(np.allclose(v1, origin)) + def _train_model_with_pretrained_vectors(): """Generate toy-model-pretrained.bin for use in test_load_native_pretrained. @@ -1261,10 +1268,12 @@ def test_bad_unicode(self): buf.seek(0) raw_vocab, vocab_size, nlabels = gensim.models._fasttext_bin._load_vocab(buf, False) + expected = { - u'英語版ウィキペディアへの投稿はいつでも': 1, - u'административно-территориальн': 2, + u'英語版ウィキペディアへの投稿はいつでも\\xe6': 1, + u'административно-территориальн\\xd1': 2, } + self.assertEqual(expected, dict(raw_vocab)) self.assertEqual(vocab_size, 2)