Skip to content

Commit

Permalink
Changed from using floats to ints for doc terms & frequencies
Browse files: browse the repository at this point in the history
  • Loading branch information
Darin Deforest committed Apr 11, 2018
1 parent 06f5f5c commit b1f004e
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 3 deletions.
3 changes: 2 additions & 1 deletion gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,8 @@ def rho():
dirty = False

reallen = 0
for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize, as_numpy=chunks_as_numpy)):
for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize, as_numpy=chunks_as_numpy,
dtype=self.dtype)):
reallen += len(chunk) # keep track of how many documents we've processed so far

if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
Expand Down
4 changes: 2 additions & 2 deletions gensim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1119,7 +1119,7 @@ def substitute_entity(match):
return RE_HTML_ENTITY.sub(substitute_entity, text)


def chunkize_serial(iterable, chunksize, as_numpy=False):
def chunkize_serial(iterable, chunksize, as_numpy=False,dtype=np.float32):
"""Give elements from the iterable in `chunksize`-ed lists.
The last returned element may be smaller (if length of collection is not divisible by `chunksize`).
Expand Down Expand Up @@ -1148,7 +1148,7 @@ def chunkize_serial(iterable, chunksize, as_numpy=False):
if as_numpy:
# convert each document to a 2d numpy array (~6x faster when transmitting
# chunk data over the wire, in Pyro)
wrapped_chunk = [[np.array(doc) for doc in itertools.islice(it, int(chunksize))]]
wrapped_chunk = [[np.asarray(doc,dtype=dtype) for doc in itertools.islice(it, int(chunksize))]]
else:
wrapped_chunk = [list(itertools.islice(it, int(chunksize)))]
if not wrapped_chunk[0]:
Expand Down

0 comments on commit b1f004e

Please sign in to comment.