Changed from using floats to ints for doc terms & frequencies

darindf · Apr 11, 2018 · b1f004e
1 parent 06f5f5c
commit b1f004e
Show file tree

Hide file tree

Showing 2 changed files with 4 additions and 3 deletions.
diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
@@ -698,7 +698,8 @@ def rho():
             dirty = False
 
             reallen = 0
-            for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize, as_numpy=chunks_as_numpy)):
+            for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize, as_numpy=chunks_as_numpy,
+                                            dtype=self.dtype)):
                 reallen += len(chunk)  # keep track of how many documents we've processed so far
 
                 if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):

diff --git a/gensim/utils.py b/gensim/utils.py
@@ -1119,7 +1119,7 @@ def substitute_entity(match):
     return RE_HTML_ENTITY.sub(substitute_entity, text)
 
 
-def chunkize_serial(iterable, chunksize, as_numpy=False):
+def chunkize_serial(iterable, chunksize, as_numpy=False,dtype=np.float32):
     """Give elements from the iterable in `chunksize`-ed lists.
     The last returned element may be smaller (if length of collection is not divisible by `chunksize`).
 
@@ -1148,7 +1148,7 @@ def chunkize_serial(iterable, chunksize, as_numpy=False):
         if as_numpy:
             # convert each document to a 2d numpy array (~6x faster when transmitting
             # chunk data over the wire, in Pyro)
-            wrapped_chunk = [[np.array(doc) for doc in itertools.islice(it, int(chunksize))]]
+            wrapped_chunk = [[np.asarray(doc,dtype=dtype) for doc in itertools.islice(it, int(chunksize))]]
         else:
             wrapped_chunk = [list(itertools.islice(it, int(chunksize)))]
         if not wrapped_chunk[0]: