From 65f6c8ba5249326083ee0e9b42897b1df845c575 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 4 May 2019 18:29:44 +0900 Subject: [PATCH 1/9] added some assertions --- gensim/models/_fasttext_bin.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py index b03376e139..36c531b264 100644 --- a/gensim/models/_fasttext_bin.py +++ b/gensim/models/_fasttext_bin.py @@ -196,6 +196,8 @@ def _load_vocab(fin, new_format, encoding='utf-8'): for j in range(pruneidx_size): _struct_unpack(fin, '@2i') + assert len(raw_vocab) == vocab_size == nwords, \ + 'should be equal: %r %r %r' % (len(raw_vocab), vocab_size, nwords) return raw_vocab, vocab_size, nwords @@ -233,7 +235,9 @@ def _load_matrix(fin, new_format=True): else: raise ValueError("Incompatible float size: %r" % float_size) - matrix = np.fromfile(fin, dtype=dtype, count=num_vectors * dim) + count = num_vectors * dim + matrix = np.fromfile(fin, dtype=dtype, count=count) + assert matrix.shape == (count,), 'expected (%r,), got %r' % (count, matrix.shape) matrix = matrix.reshape((num_vectors, dim)) return matrix From dce7ba462f5292a5b0d35d4fa70981ffce95541a Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 5 May 2019 10:27:10 +0900 Subject: [PATCH 2/9] extract np.fromfile function, add tests around it --- gensim/models/_fasttext_bin.py | 7 ++++++- gensim/test/test_data/reproduce.dat | Bin 0 -> 83 bytes gensim/test/test_fasttext.py | 14 ++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 gensim/test/test_data/reproduce.dat diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py index 36c531b264..338692ce8f 100644 --- a/gensim/models/_fasttext_bin.py +++ b/gensim/models/_fasttext_bin.py @@ -236,12 +236,17 @@ def _load_matrix(fin, new_format=True): raise ValueError("Incompatible float size: %r" % float_size) count = num_vectors * dim - matrix = np.fromfile(fin, dtype=dtype, count=count) + matrix = _fromfile(fin, dtype, count) assert matrix.shape == (count,), 'expected (%r,), got %r' % (count, matrix.shape) matrix = matrix.reshape((num_vectors, dim)) return matrix +def _fromfile(fin, dtype, count): + """Reimplementation of numpy.fromfile.""" + return np.fromfile(fin, dtype=dtype, count=count) + + def load(fin, encoding='utf-8', full_model=True): """Load a model from a binary stream. diff --git a/gensim/test/test_data/reproduce.dat b/gensim/test/test_data/reproduce.dat new file mode 100644 index 0000000000000000000000000000000000000000..14ffb68199da3d65e3d3b0e3313c856abb04c3c4 GIT binary patch literal 83 zcmXTPNL45-%}mZ#NGi%N&r?XtuTaP;%`GTa$S+GRQYZmR=Ok8DDx~D6Gk`&ZJ& Date: Sun, 5 May 2019 10:34:10 +0900 Subject: [PATCH 3/9] add gzip test case --- gensim/test/test_data/reproduce.dat.gz | Bin 0 -> 102 bytes gensim/test/test_fasttext.py | 19 +++++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) create mode 100644 gensim/test/test_data/reproduce.dat.gz diff --git a/gensim/test/test_data/reproduce.dat.gz b/gensim/test/test_data/reproduce.dat.gz new file mode 100644 index 0000000000000000000000000000000000000000..d9a3b02f3c30901ecf74330c9970c9434af787f9 GIT binary patch literal 102 zcmV-s0Gaa&KgHV`VO6VRQg1$w*ZwEX_>LR!Az!FV9m*%db$# zD$Ok@R>&_)Em9}}O6Me2RVt+9r!#;-gFTRN03sl60OAEed;o|a05O9j1A~Ag02B5T I0aE|~07RJ~{r~^~ literal 0 HcmV?d00001 diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 6675314118..156fef558e 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- from __future__ import division +import gzip import io import logging import unittest @@ -1285,11 +1286,21 @@ def test_bad_unicode(self): class TestFromfile(unittest.TestCase): def test_decompressed(self): with open(datapath('reproduce.dat'), 'rb') as fin: - actual = fin.read(len(_BYTES)) - self.assertEqual(_BYTES, actual) + self._run(fin) + + @unittest.skip('numpy does not work with gzip, see https://github.com/numpy/numpy/issues/13470') + def test_compressed(self): + with gzip.GzipFile(datapath('reproduce.dat.gz'), 'rb') as fin: + self._run(fin) + + def _run(self, fin): + actual = fin.read(len(_BYTES)) + self.assertEqual(_BYTES, actual) + + array = gensim.models._fasttext_bin._fromfile(fin, _ARRAY.dtype, _ARRAY.shape[0]) + logger.error('array: %r', array) + self.assertTrue(np.allclose(_ARRAY, array)) - array = gensim.models._fasttext_bin._fromfile(fin, _ARRAY.dtype, _ARRAY.shape[0]) - self.assertTrue(np.allclose(_ARRAY, array)) if __name__ == '__main__': From 606162e825ab0193fe5f4922c8db75ad2ad8e053 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 5 May 2019 10:51:59 +0900 Subject: [PATCH 4/9] get matrix loading working with gzip --- gensim/models/_fasttext_bin.py | 42 ++++++++++++++++++++++++++-------- gensim/test/test_fasttext.py | 1 - 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py index 338692ce8f..a50fd9558d 100644 --- a/gensim/models/_fasttext_bin.py +++ b/gensim/models/_fasttext_bin.py @@ -227,24 +227,46 @@ def _load_matrix(fin, new_format=True): num_vectors, dim = _struct_unpack(fin, '@2q') - float_size = struct.calcsize('@f') - if float_size == 4: - dtype = np.dtype(np.float32) - elif float_size == 8: - dtype = np.dtype(np.float64) - else: - raise ValueError("Incompatible float size: %r" % float_size) - count = num_vectors * dim - matrix = _fromfile(fin, dtype, count) + matrix = _fromfile(fin, None, count) assert matrix.shape == (count,), 'expected (%r,), got %r' % (count, matrix.shape) matrix = matrix.reshape((num_vectors, dim)) return matrix +def _batched_generator(fin, count, batch_size=1e6): + """Reads count floats from fin. + + Batches up read calls to avoid I/O overhead. Keeps no more than batch_size + floats in memory at once. + + Yields floats. + + """ + while count > batch_size: + batch = _struct_unpack(fin, '@%df' % batch_size) + for f in batch: + yield f + count -= batch_size + + batch = _struct_unpack(fin, '@%df' % count) + for f in batch: + yield f + + def _fromfile(fin, dtype, count): """Reimplementation of numpy.fromfile.""" - return np.fromfile(fin, dtype=dtype, count=count) + + if dtype is None: + float_size = struct.calcsize('@f') + if float_size == 4: + dtype = np.dtype(np.float32) + elif float_size == 8: + dtype = np.dtype(np.float64) + else: + raise ValueError("Incompatible float size: %r" % float_size) + + return np.fromiter(_batched_generator(fin, count), dtype=dtype) def load(fin, encoding='utf-8', full_model=True): diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 156fef558e..0ce3de77b2 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -1288,7 +1288,6 @@ def test_decompressed(self): with open(datapath('reproduce.dat'), 'rb') as fin: self._run(fin) - @unittest.skip('numpy does not work with gzip, see https://github.com/numpy/numpy/issues/13470') def test_compressed(self): with gzip.GzipFile(datapath('reproduce.dat.gz'), 'rb') as fin: self._run(fin) From ffd41660733f7a0b1e437b2baef356c504fd3971 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 5 May 2019 17:43:59 +0900 Subject: [PATCH 5/9] remove assertion, some tests trip it --- gensim/models/_fasttext_bin.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py index a50fd9558d..b890cf0888 100644 --- a/gensim/models/_fasttext_bin.py +++ b/gensim/models/_fasttext_bin.py @@ -196,8 +196,6 @@ def _load_vocab(fin, new_format, encoding='utf-8'): for j in range(pruneidx_size): _struct_unpack(fin, '@2i') - assert len(raw_vocab) == vocab_size == nwords, \ - 'should be equal: %r %r %r' % (len(raw_vocab), vocab_size, nwords) return raw_vocab, vocab_size, nwords From d186bd08919951da40e7fd4b0df2d0394f27c291 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 5 May 2019 17:57:53 +0900 Subject: [PATCH 6/9] apply comments from review --- gensim/models/_fasttext_bin.py | 44 ++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py index b890cf0888..5afad3ba2a 100644 --- a/gensim/models/_fasttext_bin.py +++ b/gensim/models/_fasttext_bin.py @@ -31,6 +31,7 @@ import codecs import collections +import gzip import io import logging import struct @@ -74,6 +75,14 @@ ('t', 'd'), ] +_FLOAT_SIZE = struct.calcsize('@f') +if _FLOAT_SIZE == 4: + _FLOAT_DTYPE = np.dtype(np.float32) +elif _FLOAT_SIZE == 8: + _FLOAT_DTYPE = np.dtype(np.float64) +else: + _FLOAT_DTYPE = None + def _yield_field_names(): for name, _ in _OLD_HEADER_FORMAT + _NEW_HEADER_FORMAT: @@ -220,13 +229,34 @@ def _load_matrix(fin, new_format=True): The number of columns of the array will correspond to the vector size. """ + if _FLOAT_DTYPE is None: + raise ValueError('bad _FLOAT_SIZE: %r' % _FLOAT_SIZE) + if new_format: _struct_unpack(fin, '@?') # bool quant_input in fasttext.cc num_vectors, dim = _struct_unpack(fin, '@2q') - count = num_vectors * dim - matrix = _fromfile(fin, None, count) + + # + # numpy.fromfile doesn't play well with gzip.GzipFile as input: + # + # - https://github.com/RaRe-Technologies/gensim/pull/2476 + # - https://github.com/numpy/numpy/issues/13470 + # + # Until they fix it, we have to apply a workaround. We only apply the + # workaround when it's necessary, because np.fromfile is heavily optimized + # and very efficient (when it works). + # + if isinstance(fin, gzip.GzipFile): + logger.info( + 'Loading model from a compressed .gz file. This can be slow. ' + 'Consider decompressing the model before loading it.' + ) + matrix = _fromfile(fin, _FLOAT_DTYPE, count) + else: + matrix = np.fromfile(fin, _FLOAT_DTYPE, count) + assert matrix.shape == (count,), 'expected (%r,), got %r' % (count, matrix.shape) matrix = matrix.reshape((num_vectors, dim)) return matrix @@ -254,16 +284,6 @@ def _batched_generator(fin, count, batch_size=1e6): def _fromfile(fin, dtype, count): """Reimplementation of numpy.fromfile.""" - - if dtype is None: - float_size = struct.calcsize('@f') - if float_size == 4: - dtype = np.dtype(np.float32) - elif float_size == 8: - dtype = np.dtype(np.float64) - else: - raise ValueError("Incompatible float size: %r" % float_size) - return np.fromiter(_batched_generator(fin, count), dtype=dtype) From b61bfeee6e22212e581ac8ca09565d924db194f3 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 5 May 2019 18:24:18 +0900 Subject: [PATCH 7/9] make flake8 happy --- gensim/test/test_fasttext.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 0ce3de77b2..1346534053 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -1301,7 +1301,6 @@ def _run(self, fin): self.assertTrue(np.allclose(_ARRAY, array)) - if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() From 4d18b346cd0eed636edc2d26f27d2ebf179e754e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sun, 5 May 2019 18:55:17 +0900 Subject: [PATCH 8/9] Update gensim/models/_fasttext_bin.py Co-Authored-By: mpenkov --- gensim/models/_fasttext_bin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py index 5afad3ba2a..a9e9fa5cb7 100644 --- a/gensim/models/_fasttext_bin.py +++ b/gensim/models/_fasttext_bin.py @@ -263,7 +263,7 @@ def _load_matrix(fin, new_format=True): def _batched_generator(fin, count, batch_size=1e6): - """Reads count floats from fin. + """Read `count` floats from `fin`. Batches up read calls to avoid I/O overhead. Keeps no more than batch_size floats in memory at once. From 0cfb9e1baac072fcaaf274e4f3b8ecc929b751e7 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 5 May 2019 22:15:33 +0900 Subject: [PATCH 9/9] More review responses --- gensim/models/_fasttext_bin.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py index a9e9fa5cb7..f00b049ee4 100644 --- a/gensim/models/_fasttext_bin.py +++ b/gensim/models/_fasttext_bin.py @@ -249,9 +249,10 @@ def _load_matrix(fin, new_format=True): # and very efficient (when it works). # if isinstance(fin, gzip.GzipFile): - logger.info( + logger.warning( 'Loading model from a compressed .gz file. This can be slow. ' - 'Consider decompressing the model before loading it.' + 'This is a work-around for a bug in NumPy: https://github.com/numpy/numpy/issues/13470. ' + 'Consider decompressing your model file for a faster load. ' ) matrix = _fromfile(fin, _FLOAT_DTYPE, count) else: