Skip to content

Commit

Permalink
avoid byte concatenation
Browse files (browse the repository at this point in the history)
  • Loading branch information
mpenkov committed Feb 13, 2019
1 parent 1211071 commit 6dc4aef
Showing 1 changed file with 6 additions and 3 deletions.
9 changes: 6 additions & 3 deletions gensim/models/_fasttext_bin.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -30,6 +30,7 @@
"""

 import collections
+import io
 import logging
 import struct

Expand Down Expand Up @@ -173,12 +174,14 @@ def _load_vocab(fin, new_format, encoding='utf-8'):

     raw_vocab = collections.OrderedDict()
     for i in range(vocab_size):
-        word_bytes = b''
+        word_bytes = io.BytesIO()
         char_byte = fin.read(1)
-        # Read vocab word
+
         while char_byte != _END_OF_WORD_MARKER:
-            word_bytes += char_byte
+            word_bytes.write(char_byte)
             char_byte = fin.read(1)
+
+        word_bytes = word_bytes.getvalue()
         try:
             word = word_bytes.decode(encoding)
         except UnicodeDecodeError:
Expand Down

0 comments on commit 6dc4aef

Please sign in to comment.