Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

avoid collisions when decoding bad unicode #2411

Merged
merged 5 commits into from
Apr 6, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 40 additions & 2 deletions gensim/models/_fasttext_bin.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,14 @@

"""

import codecs
import collections
import io
import logging
import struct

import numpy as np
import six

_END_OF_WORD_MARKER = b'\x00'

Expand Down Expand Up @@ -182,9 +184,9 @@ def _load_vocab(fin, new_format, encoding='utf-8'):
try:
word = word_bytes.decode(encoding)
except UnicodeDecodeError:
word = word_bytes.decode(encoding, errors='ignore')
word = word_bytes.decode(encoding, errors='backslashreplace')
logger.error(
'failed to decode invalid unicode bytes %r; ignoring invalid characters, using %r',
'failed to decode invalid unicode bytes %r; replacing invalid characters, using %r',
word_bytes, word
)
count, _ = _struct_unpack(fin, '@qb')
Expand Down Expand Up @@ -280,3 +282,39 @@ def load(fin, encoding='utf-8', full_model=True):
model.update(vectors_ngrams=vectors_ngrams, hidden_output=hidden_output)
model = {k: v for k, v in model.items() if k in _FIELD_NAMES}
return Model(**model)


def _backslashreplace_backport(ex):
"""Replace byte sequences that failed to decode with character escapes.

Does the same thing as errors="backslashreplace" from Python 3. Python 2
lacks this functionality out of the box, so we need to backport it.

Parameters
----------
ex: UnicodeDecodeError
contains arguments of the string and start/end indexes of the bad portion.

Returns
-------
text: unicode
The Unicode string corresponding to the decoding of the bad section.
end: int
The index from which to continue decoding.

Note
----
Works on Py2 only. Py3 already has backslashreplace built-in.

"""
#
# Based on:
# https://stackoverflow.com/questions/42860186/exact-equivalent-of-b-decodeutf-8-backslashreplace-in-python-2
#
bstr, start, end = ex.object, ex.start, ex.end
text = u''.join('\\x{:02x}'.format(ord(c)) for c in bstr[start:end])
return text, end


# Python 2 lacks a 'backslashreplace' handler for decoding, so register the
# backport under that name at import time; Python 3 already has it built in.
if six.PY2:
codecs.register_error('backslashreplace', _backslashreplace_backport)
11 changes: 11 additions & 0 deletions gensim/models/keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2098,6 +2098,17 @@ def word_vec(self, word, use_norm=False):
else:
ngram_weights = self.vectors_ngrams
ngram_hashes = ft_ngram_hashes(word, self.min_n, self.max_n, self.bucket, self.compatible_hash)
if len(ngram_hashes) == 0:
#
# If it is impossible to extract _any_ ngrams from the input
# word, then the best we can do is return a vector that points
# to the origin. The reference FB implementation does this,
# too.
#
# https://github.com/RaRe-Technologies/gensim/issues/2402
#
logger.warning('could not extract any ngrams from %r, returning origin vector', word)
return word_vec
for nh in ngram_hashes:
word_vec += ngram_weights[nh]
return word_vec / len(ngram_hashes)
Expand Down
13 changes: 11 additions & 2 deletions gensim/test/test_fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -1119,6 +1119,13 @@ def test_load_native_vectors(self):
iv_vector = fbkv['landlady']
self.assertFalse(np.allclose(oov_vector, iv_vector))

def test_no_ngrams(self):
    """A word that yields no ngrams (the empty string) maps to the origin vector."""
    bin_path = datapath('crime-and-punishment.bin')
    model = gensim.models.fasttext.load_facebook_model(bin_path)

    # Looking up the empty string leaves nothing to hash into ngram buckets.
    empty_word_vector = model.wv['']
    expected_origin = np.zeros_like(empty_word_vector)
    self.assertTrue(np.allclose(empty_word_vector, expected_origin))


def _train_model_with_pretrained_vectors():
"""Generate toy-model-pretrained.bin for use in test_load_native_pretrained.
Expand Down Expand Up @@ -1261,10 +1268,12 @@ def test_bad_unicode(self):
buf.seek(0)

raw_vocab, vocab_size, nlabels = gensim.models._fasttext_bin._load_vocab(buf, False)

expected = {
u'英語版ウィキペディアへの投稿はいつでも': 1,
u'административно-территориальн': 2,
u'英語版ウィキペディアへの投稿はいつでも\\xe6': 1,
u'административно-территориальн\\xd1': 2,
}

self.assertEqual(expected, dict(raw_vocab))

self.assertEqual(vocab_size, 2)
Expand Down