From 65f6c8ba5249326083ee0e9b42897b1df845c575 Mon Sep 17 00:00:00 2001
From: Michael Penkov <m@penkov.dev>
Date: Sat, 4 May 2019 18:29:44 +0900
Subject: [PATCH 1/9] added some assertions

---
 gensim/models/_fasttext_bin.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py
index b03376e139..36c531b264 100644
--- a/gensim/models/_fasttext_bin.py
+++ b/gensim/models/_fasttext_bin.py
@@ -196,6 +196,8 @@ def _load_vocab(fin, new_format, encoding='utf-8'):
         for j in range(pruneidx_size):
             _struct_unpack(fin, '@2i')
 
+    assert len(raw_vocab) == vocab_size == nwords, \
+        'should be equal: %r %r %r' % (len(raw_vocab), vocab_size, nwords)
     return raw_vocab, vocab_size, nwords
 
 
@@ -233,7 +235,9 @@ def _load_matrix(fin, new_format=True):
     else:
         raise ValueError("Incompatible float size: %r" % float_size)
 
-    matrix = np.fromfile(fin, dtype=dtype, count=num_vectors * dim)
+    count = num_vectors * dim
+    matrix = np.fromfile(fin, dtype=dtype, count=count)
+    assert matrix.shape == (count,), 'expected (%r,),  got %r' % (count, matrix.shape)
     matrix = matrix.reshape((num_vectors, dim))
     return matrix
 

From dce7ba462f5292a5b0d35d4fa70981ffce95541a Mon Sep 17 00:00:00 2001
From: Michael Penkov <m@penkov.dev>
Date: Sun, 5 May 2019 10:27:10 +0900
Subject: [PATCH 2/9] extract np.fromfile function, add tests around it

---
 gensim/models/_fasttext_bin.py      |   7 ++++++-
 gensim/test/test_data/reproduce.dat | Bin 0 -> 83 bytes
 gensim/test/test_fasttext.py        |  14 ++++++++++++++
 3 files changed, 20 insertions(+), 1 deletion(-)
 create mode 100644 gensim/test/test_data/reproduce.dat

diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py
index 36c531b264..338692ce8f 100644
--- a/gensim/models/_fasttext_bin.py
+++ b/gensim/models/_fasttext_bin.py
@@ -236,12 +236,17 @@ def _load_matrix(fin, new_format=True):
         raise ValueError("Incompatible float size: %r" % float_size)
 
     count = num_vectors * dim
-    matrix = np.fromfile(fin, dtype=dtype, count=count)
+    matrix = _fromfile(fin, dtype, count)
     assert matrix.shape == (count,), 'expected (%r,),  got %r' % (count, matrix.shape)
     matrix = matrix.reshape((num_vectors, dim))
     return matrix
 
 
+def _fromfile(fin, dtype, count):
+    """Reimplementation of numpy.fromfile."""
+    return np.fromfile(fin, dtype=dtype, count=count)
+
+
 def load(fin, encoding='utf-8', full_model=True):
     """Load a model from a binary stream.
 
diff --git a/gensim/test/test_data/reproduce.dat b/gensim/test/test_data/reproduce.dat
new file mode 100644
index 0000000000000000000000000000000000000000..14ffb68199da3d65e3d3b0e3313c856abb04c3c4
GIT binary patch
literal 83
zcmXTPNL45-%}mZ#NGi%N&r?XtuTaP;%`GTa$S+GRQYZmR=Ok8DDx~D6Gk`&ZJ&<w$
XA|P%6;sro_0Eiy|F@qxmgMcFdN4^!y

literal 0
HcmV?d00001

diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
index df4a76e941..6675314118 100644
--- a/gensim/test/test_fasttext.py
+++ b/gensim/test/test_fasttext.py
@@ -1278,6 +1278,20 @@ def test_bad_unicode(self):
         self.assertEqual(nlabels, -1)
 
 
+_BYTES = b'the quick brown fox jumps over the lazy dog'
+_ARRAY = np.array([0., 1., 2., 3., 4., 5., 6., 7., 8.], dtype=np.dtype('float32'))
+
+
+class TestFromfile(unittest.TestCase):
+    def test_decompressed(self):
+        with open(datapath('reproduce.dat'), 'rb') as fin:
+            actual = fin.read(len(_BYTES))
+            self.assertEqual(_BYTES, actual)
+
+            array = gensim.models._fasttext_bin._fromfile(fin, _ARRAY.dtype, _ARRAY.shape[0])
+            self.assertTrue(np.allclose(_ARRAY, array))
+
+
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()

From 63f978a94008f3849e1c6569032c20097afd0f1c Mon Sep 17 00:00:00 2001
From: Michael Penkov <m@penkov.dev>
Date: Sun, 5 May 2019 10:34:10 +0900
Subject: [PATCH 3/9] add gzip test case

---
 gensim/test/test_data/reproduce.dat.gz | Bin 0 -> 102 bytes
 gensim/test/test_fasttext.py           |  19 +++++++++++++++----
 2 files changed, 15 insertions(+), 4 deletions(-)
 create mode 100644 gensim/test/test_data/reproduce.dat.gz

diff --git a/gensim/test/test_data/reproduce.dat.gz b/gensim/test/test_data/reproduce.dat.gz
new file mode 100644
index 0000000000000000000000000000000000000000..d9a3b02f3c30901ecf74330c9970c9434af787f9
GIT binary patch
literal 102
zcmV-s0Ga<EiwFoHI?h}G19D|>a&KgHV`VO6VRQg1$w*ZwEX_>LR!Az!FV9m*%db$#
zD$Ok@R>&_)Em9}}O6Me2RVt+9r!#;-gFTRN03sl60OAEed;o|a05O9j1A~Ag02B5T
I0aE|~07RJ~{r~^~

literal 0
HcmV?d00001

diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
index 6675314118..156fef558e 100644
--- a/gensim/test/test_fasttext.py
+++ b/gensim/test/test_fasttext.py
@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import division
 
+import gzip
 import io
 import logging
 import unittest
@@ -1285,11 +1286,21 @@ def test_bad_unicode(self):
 class TestFromfile(unittest.TestCase):
     def test_decompressed(self):
         with open(datapath('reproduce.dat'), 'rb') as fin:
-            actual = fin.read(len(_BYTES))
-            self.assertEqual(_BYTES, actual)
+            self._run(fin)
+
+    @unittest.skip('numpy does not work with gzip, see https://github.com/numpy/numpy/issues/13470')
+    def test_compressed(self):
+        with gzip.GzipFile(datapath('reproduce.dat.gz'), 'rb') as fin:
+            self._run(fin)
+
+    def _run(self, fin):
+        actual = fin.read(len(_BYTES))
+        self.assertEqual(_BYTES, actual)
+
+        array = gensim.models._fasttext_bin._fromfile(fin, _ARRAY.dtype, _ARRAY.shape[0])
+        logger.error('array: %r', array)
+        self.assertTrue(np.allclose(_ARRAY, array))
 
-            array = gensim.models._fasttext_bin._fromfile(fin, _ARRAY.dtype, _ARRAY.shape[0])
-            self.assertTrue(np.allclose(_ARRAY, array))
 
 
 if __name__ == '__main__':

From 606162e825ab0193fe5f4922c8db75ad2ad8e053 Mon Sep 17 00:00:00 2001
From: Michael Penkov <m@penkov.dev>
Date: Sun, 5 May 2019 10:51:59 +0900
Subject: [PATCH 4/9] get matrix loading working with gzip

---
 gensim/models/_fasttext_bin.py | 42 ++++++++++++++++++++++++++--------
 gensim/test/test_fasttext.py   |  1 -
 2 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py
index 338692ce8f..a50fd9558d 100644
--- a/gensim/models/_fasttext_bin.py
+++ b/gensim/models/_fasttext_bin.py
@@ -227,24 +227,46 @@ def _load_matrix(fin, new_format=True):
 
     num_vectors, dim = _struct_unpack(fin, '@2q')
 
-    float_size = struct.calcsize('@f')
-    if float_size == 4:
-        dtype = np.dtype(np.float32)
-    elif float_size == 8:
-        dtype = np.dtype(np.float64)
-    else:
-        raise ValueError("Incompatible float size: %r" % float_size)
-
     count = num_vectors * dim
-    matrix = _fromfile(fin, dtype, count)
+    matrix = _fromfile(fin, None, count)
     assert matrix.shape == (count,), 'expected (%r,),  got %r' % (count, matrix.shape)
     matrix = matrix.reshape((num_vectors, dim))
     return matrix
 
 
+def _batched_generator(fin, count, batch_size=1e6):
+    """Reads count floats from fin.
+
+    Batches up read calls to avoid I/O overhead.  Keeps no more than batch_size
+    floats in memory at once.
+
+    Yields floats.
+
+    """
+    while count > batch_size:
+        batch = _struct_unpack(fin, '@%df' % batch_size)
+        for f in batch:
+            yield f
+        count -= batch_size
+
+    batch = _struct_unpack(fin, '@%df' % count)
+    for f in batch:
+        yield f
+
+
 def _fromfile(fin, dtype, count):
     """Reimplementation of numpy.fromfile."""
-    return np.fromfile(fin, dtype=dtype, count=count)
+
+    if dtype is None:
+        float_size = struct.calcsize('@f')
+        if float_size == 4:
+            dtype = np.dtype(np.float32)
+        elif float_size == 8:
+            dtype = np.dtype(np.float64)
+        else:
+            raise ValueError("Incompatible float size: %r" % float_size)
+
+    return np.fromiter(_batched_generator(fin, count), dtype=dtype)
 
 
 def load(fin, encoding='utf-8', full_model=True):
diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
index 156fef558e..0ce3de77b2 100644
--- a/gensim/test/test_fasttext.py
+++ b/gensim/test/test_fasttext.py
@@ -1288,7 +1288,6 @@ def test_decompressed(self):
         with open(datapath('reproduce.dat'), 'rb') as fin:
             self._run(fin)
 
-    @unittest.skip('numpy does not work with gzip, see https://github.com/numpy/numpy/issues/13470')
     def test_compressed(self):
         with gzip.GzipFile(datapath('reproduce.dat.gz'), 'rb') as fin:
             self._run(fin)

From ffd41660733f7a0b1e437b2baef356c504fd3971 Mon Sep 17 00:00:00 2001
From: Michael Penkov <m@penkov.dev>
Date: Sun, 5 May 2019 17:43:59 +0900
Subject: [PATCH 5/9] remove assertion, some tests trip it

---
 gensim/models/_fasttext_bin.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py
index a50fd9558d..b890cf0888 100644
--- a/gensim/models/_fasttext_bin.py
+++ b/gensim/models/_fasttext_bin.py
@@ -196,8 +196,6 @@ def _load_vocab(fin, new_format, encoding='utf-8'):
         for j in range(pruneidx_size):
             _struct_unpack(fin, '@2i')
 
-    assert len(raw_vocab) == vocab_size == nwords, \
-        'should be equal: %r %r %r' % (len(raw_vocab), vocab_size, nwords)
     return raw_vocab, vocab_size, nwords
 
 

From d186bd08919951da40e7fd4b0df2d0394f27c291 Mon Sep 17 00:00:00 2001
From: Michael Penkov <m@penkov.dev>
Date: Sun, 5 May 2019 17:57:53 +0900
Subject: [PATCH 6/9] apply comments from review

---
 gensim/models/_fasttext_bin.py | 44 ++++++++++++++++++++++++----------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py
index b890cf0888..5afad3ba2a 100644
--- a/gensim/models/_fasttext_bin.py
+++ b/gensim/models/_fasttext_bin.py
@@ -31,6 +31,7 @@
 
 import codecs
 import collections
+import gzip
 import io
 import logging
 import struct
@@ -74,6 +75,14 @@
     ('t', 'd'),
 ]
 
+_FLOAT_SIZE = struct.calcsize('@f')
+if _FLOAT_SIZE == 4:
+    _FLOAT_DTYPE = np.dtype(np.float32)
+elif _FLOAT_SIZE == 8:
+    _FLOAT_DTYPE = np.dtype(np.float64)
+else:
+    _FLOAT_DTYPE = None
+
 
 def _yield_field_names():
     for name, _ in _OLD_HEADER_FORMAT + _NEW_HEADER_FORMAT:
@@ -220,13 +229,34 @@ def _load_matrix(fin, new_format=True):
         The number of columns of the array will correspond to the vector size.
 
     """
+    if _FLOAT_DTYPE is None:
+        raise ValueError('bad _FLOAT_SIZE: %r' % _FLOAT_SIZE)
+
     if new_format:
         _struct_unpack(fin, '@?')  # bool quant_input in fasttext.cc
 
     num_vectors, dim = _struct_unpack(fin, '@2q')
-
     count = num_vectors * dim
-    matrix = _fromfile(fin, None, count)
+
+    #
+    # numpy.fromfile doesn't play well with gzip.GzipFile as input:
+    #
+    # - https://github.com/RaRe-Technologies/gensim/pull/2476
+    # - https://github.com/numpy/numpy/issues/13470
+    #
+    # Until they fix it, we have to apply a workaround.  We only apply the
+    # workaround when it's necessary, because np.fromfile is heavily optimized
+    # and very efficient (when it works).
+    #
+    if isinstance(fin, gzip.GzipFile):
+        logger.info(
+            'Loading model from a compressed .gz file.  This can be slow. '
+            'Consider decompressing the model before loading it.'
+        )
+        matrix = _fromfile(fin, _FLOAT_DTYPE, count)
+    else:
+        matrix = np.fromfile(fin, _FLOAT_DTYPE, count)
+
     assert matrix.shape == (count,), 'expected (%r,),  got %r' % (count, matrix.shape)
     matrix = matrix.reshape((num_vectors, dim))
     return matrix
@@ -254,16 +284,6 @@ def _batched_generator(fin, count, batch_size=1e6):
 
 def _fromfile(fin, dtype, count):
     """Reimplementation of numpy.fromfile."""
-
-    if dtype is None:
-        float_size = struct.calcsize('@f')
-        if float_size == 4:
-            dtype = np.dtype(np.float32)
-        elif float_size == 8:
-            dtype = np.dtype(np.float64)
-        else:
-            raise ValueError("Incompatible float size: %r" % float_size)
-
     return np.fromiter(_batched_generator(fin, count), dtype=dtype)
 
 

From b61bfeee6e22212e581ac8ca09565d924db194f3 Mon Sep 17 00:00:00 2001
From: Michael Penkov <m@penkov.dev>
Date: Sun, 5 May 2019 18:24:18 +0900
Subject: [PATCH 7/9] make flake8 happy

---
 gensim/test/test_fasttext.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
index 0ce3de77b2..1346534053 100644
--- a/gensim/test/test_fasttext.py
+++ b/gensim/test/test_fasttext.py
@@ -1301,7 +1301,6 @@ def _run(self, fin):
         self.assertTrue(np.allclose(_ARRAY, array))
 
 
-
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()

From 4d18b346cd0eed636edc2d26f27d2ebf179e754e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= <me@radimrehurek.com>
Date: Sun, 5 May 2019 18:55:17 +0900
Subject: [PATCH 8/9] Update gensim/models/_fasttext_bin.py

Co-Authored-By: mpenkov <m@penkov.dev>
---
 gensim/models/_fasttext_bin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py
index 5afad3ba2a..a9e9fa5cb7 100644
--- a/gensim/models/_fasttext_bin.py
+++ b/gensim/models/_fasttext_bin.py
@@ -263,7 +263,7 @@ def _load_matrix(fin, new_format=True):
 
 
 def _batched_generator(fin, count, batch_size=1e6):
-    """Reads count floats from fin.
+    """Read `count` floats from `fin`.
 
     Batches up read calls to avoid I/O overhead.  Keeps no more than batch_size
     floats in memory at once.

From 0cfb9e1baac072fcaaf274e4f3b8ecc929b751e7 Mon Sep 17 00:00:00 2001
From: Michael Penkov <m@penkov.dev>
Date: Sun, 5 May 2019 22:15:33 +0900
Subject: [PATCH 9/9] More review responses

---
 gensim/models/_fasttext_bin.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py
index a9e9fa5cb7..f00b049ee4 100644
--- a/gensim/models/_fasttext_bin.py
+++ b/gensim/models/_fasttext_bin.py
@@ -249,9 +249,10 @@ def _load_matrix(fin, new_format=True):
     # and very efficient (when it works).
     #
     if isinstance(fin, gzip.GzipFile):
-        logger.info(
+        logger.warning(
             'Loading model from a compressed .gz file.  This can be slow. '
-            'Consider decompressing the model before loading it.'
+            'This is a work-around for a bug in NumPy: https://github.com/numpy/numpy/issues/13470. '
+            'Consider decompressing your model file for a faster load. '
         )
         matrix = _fromfile(fin, _FLOAT_DTYPE, count)
     else: