piskvorky · menshikh-iv · Jul 25, 2017 · Jul 24, 2017 · Jul 24, 2017 · Jul 24, 2017
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -3,7 +3,7 @@ FROM ubuntu:16.04
 MAINTAINER Parul Sethi <parul1sethi@gmail.com>
 
 ENV GENSIM_REPOSITORY https://github.com/RaRe-Technologies/gensim.git
-ENV GENSIM_VERSION 59c3834e0aa233ef9010fe1a22d32c43c0378bab
+ENV GENSIM_BRANCH develop
 
 # Installs python, pip and setup tools (with fixed versions)
 RUN apt-get update \
@@ -72,7 +72,7 @@ RUN python3 -m spacy download en
 # Download gensim from Github
 RUN git clone $GENSIM_REPOSITORY \
     && cd /gensim \
-    && git checkout $GENSIM_VERSION \
+    && git checkout $GENSIM_BRANCH \
     && pip2 install .[test] \
     && python2 setup.py install \
     && pip3 install .[test] \

diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py
@@ -87,28 +87,26 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
         `ensemble` = 0 (default), use ensemble of word and context vectors
         """
 
-        meta_data_path = 'matrix.meta'
-        vocab_file = 'vocab.txt'
-        temp_vocab_file = 'tempvocab.txt'
-        cooccurrence_file = 'cooccurrence'
-        cooccurrence_shuf_file = 'wiki.toy'
-        meta_file = 'meta'
-
         # prepare training data (cooccurrence matrix and vocab)
         model_dir = os.path.join(wr_path, out_name)
         meta_dir = os.path.join(model_dir, 'meta')
         os.makedirs(meta_dir)
         logger.info("Dumped data will be stored in '%s'", model_dir)
         copyfile(corpus_file, os.path.join(meta_dir, corpus_file.split('/')[-1]))
-        os.chdir(meta_dir)
 
-        cmd_vocab_count = ['../../glove/vocab_count', '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)]
-        cmd_cooccurence_count = ['../../glove/cooccur', '-memory', str(memory), '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)]
-        cmd_shuffle_cooccurences = ['../../glove/shuffle', '-memory', str(memory)]
+        vocab_file = os.path.join(meta_dir, 'vocab.txt')
+        temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt')
+        cooccurrence_file = os.path.join(meta_dir, 'cooccurrence')
+        cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy')
+        meta_file = os.path.join(meta_dir, 'meta')
+
+        cmd_vocab_count = [os.path.join(wr_path, 'glove', 'vocab_count'), '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)]
+        cmd_cooccurence_count = [os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory), '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)]
+        cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)]
         cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]
 
         commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
-        input_fnames = [corpus_file.split('/')[-1], corpus_file.split('/')[-1], cooccurrence_file]
+        input_fnames = [os.path.join(meta_dir, os.path.split(corpus_file)[-1]), os.path.join(meta_dir, os.path.split(corpus_file)[-1]), cooccurrence_file]
         output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]
 
         logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames))
@@ -126,7 +124,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
         with smart_open(cooccurrence_shuf_file, 'rb') as f:
             numlines = sum(1 for line in f)
         with smart_open(meta_file, 'wb') as f:
-            meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(numwords, numwords, numlines, cooccurrence_shuf_file, numwords, vocab_file)
+            meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(numwords, numwords, numlines, os.path.basename(cooccurrence_shuf_file), numwords, os.path.basename(vocab_file))
             f.write(meta_info.encode('utf-8'))
 
         if iter % dump_period == 0:
@@ -158,7 +156,7 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
 
         # run wordrank executable with wr_args
         cmd = ['mpirun', '-np']
-        cmd.append(np)
+        cmd.append(str(np))
         cmd.append(os.path.join(wr_path, 'wordrank'))
         for option, value in wr_args.items():
             cmd.append('--%s' % option)
@@ -168,10 +166,9 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1,
 
         # use embeddings from max. iteration's dump
         max_iter_dump = iter - (iter % dump_period)
-        copyfile('model_word_%d.txt' % max_iter_dump, 'wordrank.words')
-        copyfile('model_context_%d.txt' % max_iter_dump, 'wordrank.contexts')
-        model = cls.load_wordrank_model('wordrank.words', os.path.join('meta', vocab_file), 'wordrank.contexts', sorted_vocab, ensemble)
-        os.chdir('../..')
+        os.rename('model_word_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.words'))
+        os.rename('model_context_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.contexts'))
+        model = cls.load_wordrank_model(os.path.join(model_dir, 'wordrank.words'), vocab_file, os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab, ensemble)
 
         if cleanup_files:
             rmtree(model_dir)

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
@@ -517,14 +517,14 @@ def test_non_trivial_structure(self):
 
         corpus = textcorpus.TextDirectoryCorpus(dirpath)
         filenames = list(corpus.iter_filepaths())
-        base_names = [name[len(dirpath) + 1:] for name in filenames]
-        expected = [
+        base_names = sorted([name[len(dirpath) + 1:] for name in filenames])
+        expected = sorted([
             '0.txt',
             'a_folder/1.txt',
             'b_folder/2.txt',
             'b_folder/3.txt',
             'b_folder/c_folder/4.txt'
-        ]
+        ])
         expected = [os.path.normpath(path) for path in expected]
         self.assertEqual(expected, base_names)