LDA eta parameter auto-learning #479

Merged (5 commits) on Nov 3, 2015
134 changes: 91 additions & 43 deletions gensim/models/ldamodel.py
@@ -35,6 +35,7 @@

import logging
import numpy # for arrays, array broadcasting etc.
import numbers

from gensim import interfaces, utils, matutils
from itertools import chain
@@ -66,6 +67,29 @@ def dirichlet_expectation(alpha):
result = psi(alpha) - psi(numpy.sum(alpha, 1))[:, numpy.newaxis]
return result.astype(alpha.dtype) # keep the same precision as input

def update_dir_prior(prior, N, logphat, rho):
"""
Updates a given prior using Newton's method, described in
**Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters.**
http://jonathan-huang.org/research/dirichlet/dirichlet.pdf
"""
dprior = numpy.copy(prior)
gradf = N * (psi(numpy.sum(prior)) - psi(prior) + logphat)

c = N * polygamma(1, numpy.sum(prior))
q = -N * polygamma(1, prior)

b = numpy.sum(gradf / q) / (1 / c + numpy.sum(1 / q))

dprior = -(gradf - b) / q

if all(rho * dprior + prior > 0):
prior += rho * dprior
else:
logger.warning("updated prior not positive")

return prior


class LdaState(utils.SaveLoad):
"""
@@ -200,11 +224,12 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
prior directly from your data.

`eta` can be a scalar for a symmetric prior over topic/word
distributions, or a matrix of shape num_topics x num_words,
which can be used to impose asymmetric priors over the word
distribution on a per-topic basis. This may be useful if you
want to seed certain topics with particular words by boosting
the priors for those words.
distributions, or a matrix of shape num_topics x num_words, which can
be used to impose asymmetric priors over the word distribution on a
per-topic basis. This may be useful if you want to seed certain topics
with particular words by boosting the priors for those words. It also
supports the special value 'auto', which learns an asymmetric prior
directly from your data.

Turn on `distributed` to force distributed computing (see the `web tutorial <http://radimrehurek.com/gensim/distributed.html>`_
on how to set up a cluster of machines for gensim).
@@ -258,26 +283,16 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
self.eval_every = eval_every

self.optimize_alpha = alpha == 'auto'
if alpha == 'symmetric' or alpha is None:
logger.info("using symmetric alpha at %s", 1.0 / num_topics)
self.alpha = numpy.asarray([1.0 / num_topics for i in xrange(num_topics)])
elif alpha == 'asymmetric':
self.alpha = numpy.asarray([1.0 / (i + numpy.sqrt(num_topics)) for i in xrange(num_topics)])
self.alpha /= self.alpha.sum()
logger.info("using asymmetric alpha %s", list(self.alpha))
elif alpha == 'auto':
self.alpha = numpy.asarray([1.0 / num_topics for i in xrange(num_topics)])
logger.info("using autotuned alpha, starting with %s", list(self.alpha))
else:
# must be either float or an array of floats, of size num_topics
self.alpha = alpha if isinstance(alpha, numpy.ndarray) else numpy.asarray([alpha] * num_topics)
if len(self.alpha) != num_topics:
raise RuntimeError("invalid alpha shape (must match num_topics)")
self.alpha = self.init_dir_prior(alpha, 'alpha')

if eta is None:
self.eta = 1.0 / num_topics
else:
self.eta = eta
assert self.alpha.shape == (num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), num_topics)

self.optimize_eta = eta == 'auto'
self.eta = self.init_dir_prior(eta, 'eta')

assert (self.eta.shape == (num_topics, 1) or self.eta.shape == (num_topics, self.num_terms)), (
"Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
(str(self.eta.shape), num_topics, num_topics, self.num_terms))

# VB constants
self.iterations = iterations
@@ -314,6 +329,36 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
if corpus is not None:
self.update(corpus)

def init_dir_prior(self, prior, name):
if prior == 'symmetric' or prior is None:
logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
init_prior = numpy.asarray([1.0 / self.num_topics for i in xrange(self.num_topics)])
elif prior == 'asymmetric':
init_prior = numpy.asarray([1.0 / (i + numpy.sqrt(self.num_topics)) for i in xrange(self.num_topics)])
init_prior /= init_prior.sum()
logger.info("using asymmetric %s %s", name, list(init_prior))
elif prior == 'auto':
init_prior = numpy.asarray([1.0 / self.num_topics for i in xrange(self.num_topics)])
logger.info("using autotuned %s, starting with %s", name, list(init_prior))
elif isinstance(prior, list):
init_prior = numpy.asarray(prior)
elif isinstance(prior, numpy.ndarray):
init_prior = prior
elif isinstance(prior, numpy.number) or isinstance(prior, numbers.Real):
init_prior = numpy.asarray([prior] * self.num_topics)
else:
raise ValueError("%s must be either a numpy array of scalars, list of scalars, or scalar" % name)

if name == 'eta':
# please note the difference in shapes between alpha and eta:
# alpha is a row: [0.1, 0.1]
# eta is a column: [[0.1],
# [0.1]]
if init_prior.shape == (self.num_topics,) or init_prior.shape == (1, self.num_topics):
init_prior = init_prior.reshape((self.num_topics, 1)) # this statement throws ValueError if eta did not match self.num_topics

return init_prior

def __str__(self):
return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % \
(self.num_terms, self.num_topics, self.decay, self.chunksize)
@@ -425,34 +470,34 @@ def do_estep(self, chunk, state=None):
state.numdocs += gamma.shape[0] # avoids calling len(chunk) on a generator
return gamma


def update_alpha(self, gammat, rho):
"""
Update parameters for the Dirichlet prior on the per-document
topic weights `alpha` given the last `gammat`.

Uses Newton's method, described in **Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters.**
http://jonathan-huang.org/research/dirichlet/dirichlet.pdf

"""
N = float(len(gammat))
logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N
dalpha = numpy.copy(self.alpha)
gradf = N * (psi(numpy.sum(self.alpha)) - psi(self.alpha) + logphat)

c = N * polygamma(1, numpy.sum(self.alpha))
q = -N * polygamma(1, self.alpha)
self.alpha = update_dir_prior(self.alpha, N, logphat, rho)
logger.info("optimized alpha %s", list(self.alpha))

b = numpy.sum(gradf / q) / (1 / c + numpy.sum(1 / q))
return self.alpha

dalpha = -(gradf - b) / q
def update_eta(self, lambdat, rho):
"""
Update parameters for the Dirichlet prior on the per-topic
word weights `eta` given the last `lambdat`.
"""
if self.eta.shape[1] != 1:
raise ValueError("Can't use update_eta with eta matrices, only column vectors.")
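# each of the num_terms columns of lambdat is treated as one observation
# of a num_topics-dimensional Dirichlet, so N is the vocabulary size and
# logphat the average E[log p] per topic, shaped as a column like eta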
N = float(lambdat.shape[1])
logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat.transpose()) / N).reshape((self.num_topics,1))

if all(rho * dalpha + self.alpha > 0):
self.alpha += rho * dalpha
else:
logger.warning("updated alpha not positive")
logger.info("optimized alpha %s", list(self.alpha))
self.eta = update_dir_prior(self.eta, N, logphat, rho)
logger.info("optimized eta %s", list(self.eta.reshape((self.num_topics))))

return self.alpha
return self.eta

def log_perplexity(self, chunk, total_docs=None):
"""
@@ -629,6 +674,9 @@ def do_mstep(self, rho, other, extra_pass=False):
self.print_topics(5)
logger.info("topic diff=%f, rho=%f", numpy.mean(numpy.abs(diff)), rho)

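# mirror the existing alpha optimization: take one damped Newton step on
# eta from the current topic-word statistics (lambda) at the end of each M-step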
if self.optimize_eta:
self.update_eta(self.state.get_lambda(), rho)

if not extra_pass:
# only update if this isn't an additional pass
self.num_updates += other.numdocs
@@ -846,9 +894,9 @@ def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs):
Save the model to file.

Large internal arrays may be stored into separate files, with `fname` as prefix.

`separately` can be used to define which arrays should be stored in separate files.

`ignore` parameter can be used to define which variables should be ignored, i.e. left
out from the pickled lda model. By default the internal `state` is ignored as it uses
its own serialisation not the one provided by `LdaModel`. The `state` and `dispatcher
Expand All @@ -870,7 +918,7 @@ def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs):
"""
if self.state is not None:
self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)

# make sure 'state' and 'dispatcher' are ignored from the pickled object, even if
# someone sets the ignore list themselves
if ignore is not None and ignore:
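Taken together, these changes mean the topic-word prior can now be learned from data instead of fixed up front. A minimal usage sketch, assuming a `corpus` and `dictionary` built as usual (the `num_topics` and `passes` values are illustrative):

from gensim.models import LdaModel

model = LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=10,
    alpha='auto',  # learn an asymmetric document-topic prior (existing behaviour)
    eta='auto',    # learn the topic-word prior too (added by this PR)
    passes=10)

print(model.alpha)  # row vector, shape (num_topics,)
print(model.eta)    # column vector, shape (num_topics, 1)

Note that, per the test changes below, `alpha='auto'` remains unsupported in LdaMulticore and raises a RuntimeError there.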
131 changes: 131 additions & 0 deletions gensim/test/test_ldamodel.py
@@ -77,6 +77,132 @@ def testTransform(self):
(i, sorted(vec), sorted(expected)))
self.assertTrue(passed)

def testAlphaAuto(self):
model1 = self.class_(corpus, id2word=dictionary, alpha='symmetric', passes=10)
modelauto = self.class_(corpus, id2word=dictionary, alpha='auto', passes=10)

# did we learn something?
self.assertFalse(all(numpy.equal(model1.alpha, modelauto.alpha)))

def testAlpha(self):
kwargs = dict(
id2word=dictionary,
num_topics=2,
alpha=None
)
expected_shape = (2,)

# should not raise anything
self.class_(**kwargs)

kwargs['alpha'] = 'symmetric'
model = self.class_(**kwargs)
self.assertEqual(model.alpha.shape, expected_shape)
self.assertTrue(all(model.alpha == numpy.array([0.5, 0.5])))

kwargs['alpha'] = 'asymmetric'
model = self.class_(**kwargs)
self.assertEqual(model.alpha.shape, expected_shape)
self.assertTrue(numpy.allclose(model.alpha, [0.630602, 0.369398]))

kwargs['alpha'] = 0.3
model = self.class_(**kwargs)
self.assertEqual(model.alpha.shape, expected_shape)
self.assertTrue(all(model.alpha == numpy.array([0.3, 0.3])))

kwargs['alpha'] = 3
model = self.class_(**kwargs)
self.assertEqual(model.alpha.shape, expected_shape)
self.assertTrue(all(model.alpha == numpy.array([3, 3])))

kwargs['alpha'] = [0.3, 0.3]
model = self.class_(**kwargs)
self.assertEqual(model.alpha.shape, expected_shape)
self.assertTrue(all(model.alpha == numpy.array([0.3, 0.3])))

# all should raise an exception for being wrong shape
kwargs['alpha'] = [0.3, 0.3, 0.3]
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['alpha'] = [[0.3], [0.3]]
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['alpha'] = [0.3]
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['alpha'] = "gensim is cool"
self.assertRaises(ValueError, self.class_, **kwargs)


def testEtaAuto(self):
model1 = self.class_(corpus, id2word=dictionary, eta='symmetric', passes=10)
modelauto = self.class_(corpus, id2word=dictionary, eta='auto', passes=10)

# did we learn something?
self.assertFalse(all(numpy.equal(model1.eta, modelauto.eta)))

def testEta(self):
kwargs = dict(
id2word=dictionary,
num_topics=2,
eta=None
)
expected_shape = (2, 1)

# should not raise anything
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == numpy.array([[0.5], [0.5]])))

kwargs['eta'] = 'symmetric'
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == numpy.array([[0.5], [0.5]])))

kwargs['eta'] = 'asymmetric'
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(numpy.allclose(model.eta, [[0.630602], [0.369398]]))

kwargs['eta'] = 0.3
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == numpy.array([[0.3], [0.3]])))

kwargs['eta'] = 3
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == numpy.array([[3], [3]])))

kwargs['eta'] = [[0.3], [0.3]]
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == numpy.array([[0.3], [0.3]])))

kwargs['eta'] = [0.3, 0.3]
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == numpy.array([[0.3], [0.3]])))

# should be ok with num_topics x num_terms
testeta = numpy.array([[0.5] * len(dictionary)] * 2)
kwargs['eta'] = testeta
self.class_(**kwargs)

# all should raise an exception for being wrong shape
kwargs['eta'] = testeta.reshape(tuple(reversed(testeta.shape)))
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['eta'] = [0.3, 0.3, 0.3]
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['eta'] = [0.3]
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['eta'] = "gensim is cool"
self.assertRaises(ValueError, self.class_, **kwargs)


def testTopTopics(self):
top_topics = self.model.top_topics(self.corpus)

@@ -254,6 +380,11 @@ def setUp(self):
self.class_ = ldamulticore.LdaMulticore
self.model = self.class_(corpus, id2word=dictionary, num_topics=2, passes=100)

# override LdaModel because multicore does not allow alpha=auto
def testAlphaAuto(self):
self.assertRaises(RuntimeError, self.class_, alpha='auto')


#endclass TestLdaMulticore

