Skip to content

Commit

Permalink
Add word2vec.PathLineSentences for reading a directory as a corpus (#…
Browse files Browse the repository at this point in the history
…1364) (#1423)

* issue #1364 first commit, corpus from a directory

added method models.word2vec.LineSentencePath

method to read an entire directory's files in the same style as
models.word2vec.LineSentence

* test for word2vec.LineSentencePath issue #1364

initial attempt at test, including files. test just splits the
lee_background.cor file into two parts and puts them in a directory,
then makes sure they match the unsplit file as loaded by
word2vec.LineSentence

* better handling of input for LineSentencePath

no longer sensitive to an input without a trailing os-specific slash

* LineSentencePath renamed PathLineSentences

in word2vec.py . Test updated as well

* LineSentencePath rename to PathLineSentences

in models.word2vec . Tests also updated

* fix whitespace style error

had only 1 space before an inline comment, flagged by travis CI build

* updated PathLineSentences test and test data

Removed LineSentencePath directory, created PathLineSentences
the lee corpus duplicates were in LineSentencePath, which was wasting space
made new small corpus to test PathLineSentences, put in directory
changed test to read both files manually, combine, and compare to
PathLineSentences (rather than having a separate single file to match
the entire contents of the PathLineSentences test_data directory)

* word2vec.PathLineSentences single file support

changed PathLineSentences to support a single file in addition to a
directory, raises a warning to use LineSentence when a single file is
given as a parameter. added corresponding test.

* fixing style issues

* fix style issue
  • Loading branch information
Michael W. Sherman authored and menshikh-iv committed Jul 18, 2017
1 parent 3e38e33 commit b818c91
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 1 deletion.
52 changes: 51 additions & 1 deletion gensim/models/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -1550,7 +1550,57 @@ def __iter__(self):
line = utils.to_unicode(line).split()
i = 0
while i < len(line):
yield line[i : i + self.max_sentence_length]
yield line[i:i + self.max_sentence_length]
i += self.max_sentence_length


class PathLineSentences(object):
    """
    Simple format: one sentence = one line; words already preprocessed and separated by whitespace.
    Like LineSentence, but will process all files in a directory in alphabetical order by filename.
    """

    def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
        """
        `source` should be a path to a directory (as a string) where all files can be opened by the
        LineSentence class. Each file will be read up to
        `limit` lines (or with no clipping if `limit` is None, the default).

        Example::

            sentences = PathLineSentences(os.path.join(os.getcwd(), 'corpus'))

        The files in the directory should be either text files, .bz2 files, or .gz files.
        """
        self.source = source
        self.max_sentence_length = max_sentence_length
        self.limit = limit

        if os.path.isfile(self.source):
            logging.warning('single file read, better to use models.word2vec.LineSentence')
            self.input_files = [self.source]  # force code compatibility with list of files
        elif os.path.isdir(self.source):
            self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
            logging.debug('reading directory %s', self.source)
            # keep only regular files: os.listdir also returns subdirectories,
            # which smart_open cannot read and would crash iteration
            self.input_files = [
                self.source + file_name for file_name in os.listdir(self.source)
                if os.path.isfile(self.source + file_name)
            ]
            self.input_files.sort()  # makes sure it happens in filename order
        else:  # not a file or a directory, then we can't do anything with it
            raise ValueError('input is neither a file nor a path')

        logging.info('files read into PathLineSentences: %s', '\n'.join(self.input_files))

    def __iter__(self):
        """Iterate through the files, yielding sentences clipped to `max_sentence_length` words."""
        for file_name in self.input_files:
            logging.info('reading file %s', file_name)
            with utils.smart_open(file_name) as fin:
                for line in itertools.islice(fin, self.limit):
                    line = utils.to_unicode(line).split()
                    i = 0
                    while i < len(line):
                        yield line[i:i + self.max_sentence_length]
                        i += self.max_sentence_length


Expand Down
7 changes: 7 additions & 0 deletions gensim/test/test_data/PathLineSentences/1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
this is important text
it is very important
you are learning a lot
from reading this text.
it much be hard to be so special!
we envy you, with your knowledge of this text file,
thank you.
Binary file added gensim/test/test_data/PathLineSentences/2.txt.bz2
Binary file not shown.
21 changes: 21 additions & 0 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -770,6 +770,27 @@ def testLineSentenceWorksWithNormalFile(self):
sentences = word2vec.LineSentence(fin)
for words in sentences:
self.assertEqual(words, utils.to_unicode(orig.readline()).split())

def testPathLineSentences(self):
    """Does PathLineSentences work with a path argument?"""
    corpus_dir = datapath('PathLineSentences')
    with utils.smart_open(os.path.join(corpus_dir, '1.txt')) as orig1,\
            utils.smart_open(os.path.join(corpus_dir, '2.txt.bz2')) as orig2:
        sentences = word2vec.PathLineSentences(corpus_dir)
        expected_lines = orig1.readlines() + orig2.readlines()
        # sentences must come back in file order (1.txt, then 2.txt.bz2), line by line
        for line_no, words in enumerate(sentences):
            self.assertEqual(words, utils.to_unicode(expected_lines[line_no]).split())

def testPathLineSentencesOneFile(self):
    """Does PathLineSentences work with a single file argument?"""
    test_file = os.path.join(datapath('PathLineSentences'), '1.txt')
    sentences = word2vec.PathLineSentences(test_file)
    # with a single-file source, output must match reading that file directly
    with utils.smart_open(test_file) as fin:
        expected_lines = fin.readlines()
    for line_no, words in enumerate(sentences):
        self.assertEqual(words, utils.to_unicode(expected_lines[line_no]).split())


#endclass TestWord2VecSentenceIterators

# TODO: get correct path to Python binary
Expand Down

0 comments on commit b818c91

Please sign in to comment.