Skip to content

Commit

Permalink
Add word2vec.PathLineSentences for reading a directory as a corpus (#…
Browse files Browse the repository at this point in the history
…1364) (#1423)

* issue #1364 first commit, corpus from a directory

added method models.word2vec.LineSentencePath

method to read an entire directory's files in the same style as
models.word2vec.LineSentence

* test for word2vec.LineSentencePath issue #1364

initial attempt at test, including files. test just splits the
lee_background.cor file into two parts and puts them in a directory,
then makes sure they match the unsplit file as loaded by
word2vec.LineSentence

* better handling of input for LineSentencePath

no longer sensitive to an input without a trailing os-specific slash

* LineSentencePath renamed PathLineSentences

in word2vec.py . Test updated as well

* LineSentencePath rename to PathLineSentences

in models.word2vec . Tests also updated

* fix whitespace style error

had only 1 space before an inline comment, flagged by travis CI build

* updated PathLineSentences test and test data

Removed LineSentencePath directory, created PathLineSentences
the lee corpus duplicates were in LineSentencePath, which was wasting space
made new small corpus to test PathLineSentences, put in directory
changed test to read both files manually, combine, and compare to
PathLineSentences (rather than having a separate single file to match
the entire contents of the PathLineSentences test_data directory)

* word2vec.PathLineSentences single file support

changed PathLineSentences to support a single file in addition to a
directory, raises a warning to use LineSentence when a single file is
given as a parameter. added corresponding test.

* fixing style issues

* fix style issue
  • Loading branch information
Michael W. Sherman authored and menshikh-iv committed Jul 18, 2017
1 parent 3e38e33 commit b818c91
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 1 deletion.
52 changes: 51 additions & 1 deletion gensim/models/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -1550,7 +1550,57 @@ def __iter__(self):
line = utils.to_unicode(line).split()
i = 0
while i < len(line):
yield line[i : i + self.max_sentence_length]
yield line[i:i + self.max_sentence_length]
i += self.max_sentence_length


class PathLineSentences(object):
    """
    Simple format: one sentence = one line; words already preprocessed and separated by whitespace.
    Like LineSentence, but will process all files in a directory in alphabetical order by filename.
    """

    def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
        """
        `source` should be a path to a directory (as a string) where all files can be opened by the
        LineSentence class. Each file will be read up to
        `limit` lines (or with no clipping if `limit` is None, the default).

        Example::

            sentences = PathLineSentences(os.path.join(os.getcwd(), 'corpus'))

        The files in the directory should be either text files, .bz2 files, or .gz files.
        """
        self.source = source
        self.max_sentence_length = max_sentence_length
        self.limit = limit

        if os.path.isfile(self.source):
            logging.warning('single file read, better to use models.word2vec.LineSentence')
            self.input_files = [self.source]  # force code compatibility with list of files
        elif os.path.isdir(self.source):
            self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
            logging.debug('reading directory %s', self.source)
            # keep only regular files: os.listdir also returns subdirectories,
            # which smart_open cannot read and would crash iteration
            self.input_files = [
                self.source + file_name for file_name in os.listdir(self.source)
                if os.path.isfile(self.source + file_name)
            ]
            self.input_files.sort()  # makes sure it happens in filename order
        else:  # not a file or a directory, then we can't do anything with it
            raise ValueError('input is neither a file nor a path')

        logging.info('files read into PathLineSentences: %s', '\n'.join(self.input_files))

    def __iter__(self):
        """Iterate through the files, yielding sentences clipped to `max_sentence_length` words."""
        for file_name in self.input_files:
            logging.info('reading file %s', file_name)
            with utils.smart_open(file_name) as fin:
                for line in itertools.islice(fin, self.limit):
                    line = utils.to_unicode(line).split()
                    i = 0
                    while i < len(line):
                        yield line[i:i + self.max_sentence_length]
                        i += self.max_sentence_length


Expand Down
7 changes: 7 additions & 0 deletions gensim/test/test_data/PathLineSentences/1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
this is important text
it is very important
you are learning a lot
from reading this text.
it much be hard to be so special!
we envy you, with your knowledge of this text file,
thank you.
Binary file added gensim/test/test_data/PathLineSentences/2.txt.bz2
Binary file not shown.
21 changes: 21 additions & 0 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -770,6 +770,27 @@ def testLineSentenceWorksWithNormalFile(self):
sentences = word2vec.LineSentence(fin)
for words in sentences:
self.assertEqual(words, utils.to_unicode(orig.readline()).split())

def testPathLineSentences(self):
    """Does PathLineSentences work with a path argument?"""
    corpus_dir = datapath('PathLineSentences')
    with utils.smart_open(os.path.join(corpus_dir, '1.txt')) as orig1,\
            utils.smart_open(os.path.join(corpus_dir, '2.txt.bz2')) as orig2:
        sentences = word2vec.PathLineSentences(corpus_dir)
        expected_lines = orig1.readlines() + orig2.readlines()
        # sentences must come back in file order (1.txt, then 2.txt.bz2), line by line
        for line_no, words in enumerate(sentences):
            self.assertEqual(words, utils.to_unicode(expected_lines[line_no]).split())

def testPathLineSentencesOneFile(self):
    """Does PathLineSentences work with a single file argument?"""
    test_file = os.path.join(datapath('PathLineSentences'), '1.txt')
    sentences = word2vec.PathLineSentences(test_file)
    # with a single-file source, output must match reading that file directly
    with utils.smart_open(test_file) as fin:
        expected_lines = fin.readlines()
    for line_no, words in enumerate(sentences):
        self.assertEqual(words, utils.to_unicode(expected_lines[line_no]).split())


#endclass TestWord2VecSentenceIterators

# TODO: get correct path to Python binary
Expand Down

0 comments on commit b818c91

Please sign in to comment.