diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py
index aaceeac2b1..7620093e37 100644
--- a/gensim/models/wrappers/wordrank.py
+++ b/gensim/models/wrappers/wordrank.py
@@ -8,7 +8,7 @@
 `Word2Vec` for that.
 
 Example:
->>> model = gensim.models.wrappers.Wordrank('/Users/dummy/wordrank', corpus_file='text8', out_path='wr_model')
+>>> model = gensim.models.wrappers.Wordrank('/Users/dummy/wordrank', corpus_file='text8', out_name='wr_model')
 >>> print model[word]  # prints vector for given words
 
 .. [1] https://bitbucket.org/shihaoji/wordrank/
@@ -45,14 +45,14 @@ class Wordrank(KeyedVectors):
     """
 
     @classmethod
-    def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
+    def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
               sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100,
               beta=99, loss='hinge', memory=4.0, cleanup_files=True, sorted_vocab=1, ensemble=0):
         """
         `wr_path` is the path to the Wordrank directory.
         `corpus_file` is the filename of the text file to be used for training the Wordrank model.
         Expects file to contain space-separated tokens in a single line
-        `out_path` is the path to directory which will be created to save embeddings and training data.
+        `out_name` is name of the directory which will be created(in wordrank folder) to save embeddings and training data.
         `size` is the dimensionality of the feature vectors.
         `window` is the number of context words to the left (and to the right, if symmetric = 1).
         `symmetric` if 0, only use left context words, else use left and right both.
@@ -82,7 +82,7 @@ def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1
         meta_file = 'meta'
 
         # prepare training data (cooccurrence matrix and vocab)
-        model_dir = os.path.join(wr_path, out_path)
+        model_dir = os.path.join(wr_path, out_name)
         meta_dir = os.path.join(model_dir, 'meta')
         os.makedirs(meta_dir)
         logger.info("Dumped data will be stored in '%s'", model_dir)
@@ -95,14 +95,16 @@ def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1
         cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]
 
         commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
-        logger.info("Prepare training data using glove code '%s'", commands)
         input_fnames = [corpus_file.split('/')[-1], corpus_file.split('/')[-1], cooccurrence_file]
         output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]
 
+        logger.info("Prepare training data using glove code")
         for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
             with smart_open(input_fname, 'rb') as r:
                 with smart_open(output_fname, 'wb') as w:
                     utils.check_output(w, args=command, stdin=r)
+
+        logger.info("Delete frequencies from vocab file")
         with smart_open(vocab_file, 'wb') as w:
             utils.check_output(w, args=cmd_del_vocab_freq)
 
@@ -147,7 +149,7 @@ def train(cls, wr_path, corpus_file, out_path, size=100, window=15, symmetric=1
         for option, value in wr_args.items():
             cmd.append('--%s' % option)
             cmd.append(str(value))
-        logger.info("Running wordrank binary '%s'", cmd)
+        logger.info("Running wordrank binary")
        output = utils.check_output(args=cmd)
 
         # use embeddings from max. iteration's dump
diff --git a/gensim/test/test_wordrank_wrapper.py b/gensim/test/test_wordrank_wrapper.py
index dbface5e34..2b185c4839 100644
--- a/gensim/test/test_wordrank_wrapper.py
+++ b/gensim/test/test_wordrank_wrapper.py
@@ -30,11 +30,11 @@ def setUp(self):
         wr_home = os.environ.get('WR_HOME', None)
         self.wr_path = wr_home if wr_home else None
         self.corpus_file = datapath('lee.cor')
-        self.out_path = 'testmodel'
+        self.out_name = 'testmodel'
         self.wr_file = datapath('test_glove.txt')
         if not self.wr_path:
             return
-        self.test_model = wordrank.Wordrank.train(self.wr_path, self.corpus_file, self.out_path, iter=6, dump_period=5,period=5)
+        self.test_model = wordrank.Wordrank.train(self.wr_path, self.corpus_file, self.out_name, iter=6, dump_period=5, period=5)
 
     def testLoadWordrankFormat(self):
         """Test model successfully loaded from Wordrank format file"""
diff --git a/gensim/utils.py b/gensim/utils.py
index 8d5fdb7d7f..5fa91c5032 100644
--- a/gensim/utils.py
+++ b/gensim/utils.py
@@ -1164,6 +1164,7 @@ def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs):
     Added extra KeyboardInterrupt handling
     """
     try:
+        logger.debug("COMMAND: %s %s", str(popenargs), str(kwargs))
         process = subprocess.Popen(stdout=stdout, *popenargs, **kwargs)
         output, unused_err = process.communicate()
         retcode = process.poll()
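
Usage note (not part of the patch): a minimal sketch of how the renamed out_name argument and the new debug logging in check_output fit together. The Wordrank checkout path and the 'text8' corpus mirror the docstring example above; the query word 'king' and the iter/dump_period values are placeholders for a local setup, not values fixed by this change.

import logging

from gensim.models.wrappers import Wordrank

# DEBUG level surfaces the new "COMMAND: ..." line that gensim.utils.check_output
# emits for every glove/wordrank subprocess it launches.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

wr_path = '/Users/dummy/wordrank'   # assumed: a compiled Wordrank checkout
corpus_file = 'text8'               # assumed: space-separated tokens in a single line

# 'wr_model' is a directory *name* created inside the wordrank folder
# (i.e. wr_path/wr_model), not an arbitrary output path.
model = Wordrank.train(wr_path, corpus_file, out_name='wr_model', iter=20, dump_period=10)

print(model['king'])               # word vector, via the KeyedVectors interface
print(model.most_similar('king'))  # nearest neighbours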