LDA eta parameter auto-learning #479

Merged (5 commits) on Nov 3, 2015
134 changes: 91 additions & 43 deletions gensim/models/ldamodel.py
@@ -35,6 +35,7 @@

import logging
import numpy # for arrays, array broadcasting etc.
import numbers

from gensim import interfaces, utils, matutils
from itertools import chain
@@ -66,6 +67,29 @@ def dirichlet_expectation(alpha):
result = psi(alpha) - psi(numpy.sum(alpha, 1))[:, numpy.newaxis]
return result.astype(alpha.dtype) # keep the same precision as input

def update_dir_prior(prior, N, logphat, rho):
"""
Updates a given prior using Newton's method, described in
**Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters.**
http://jonathan-huang.org/research/dirichlet/dirichlet.pdf
"""
dprior = numpy.copy(prior)
gradf = N * (psi(numpy.sum(prior)) - psi(prior) + logphat)

c = N * polygamma(1, numpy.sum(prior))
q = -N * polygamma(1, prior)

b = numpy.sum(gradf / q) / (1 / c + numpy.sum(1 / q))

dprior = -(gradf - b) / q

if all(rho * dprior + prior > 0):
prior += rho * dprior
else:
logger.warning("updated prior not positive")

return prior


class LdaState(utils.SaveLoad):
"""
@@ -200,11 +224,12 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
prior directly from your data.

`eta` can be a scalar for a symmetric prior over topic/word
distributions, or a matrix of shape num_topics x num_words,
which can be used to impose asymmetric priors over the word
distribution on a per-topic basis. This may be useful if you
want to seed certain topics with particular words by boosting
the priors for those words.
distributions, or a matrix of shape num_topics x num_words, which can
be used to impose asymmetric priors over the word distribution on a
per-topic basis. This may be useful if you want to seed certain topics
with particular words by boosting the priors for those words. It also
supports the special value 'auto', which learns an asymmetric prior
directly from your data.

Turn on `distributed` to force distributed computing (see the `web tutorial <http://radimrehurek.com/gensim/distributed.html>`_
on how to set up a cluster of machines for gensim).
@@ -258,26 +283,16 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
self.eval_every = eval_every

self.optimize_alpha = alpha == 'auto'
if alpha == 'symmetric' or alpha is None:
logger.info("using symmetric alpha at %s", 1.0 / num_topics)
self.alpha = numpy.asarray([1.0 / num_topics for i in xrange(num_topics)])
elif alpha == 'asymmetric':
self.alpha = numpy.asarray([1.0 / (i + numpy.sqrt(num_topics)) for i in xrange(num_topics)])
self.alpha /= self.alpha.sum()
logger.info("using asymmetric alpha %s", list(self.alpha))
elif alpha == 'auto':
self.alpha = numpy.asarray([1.0 / num_topics for i in xrange(num_topics)])
logger.info("using autotuned alpha, starting with %s", list(self.alpha))
else:
# must be either float or an array of floats, of size num_topics
self.alpha = alpha if isinstance(alpha, numpy.ndarray) else numpy.asarray([alpha] * num_topics)
if len(self.alpha) != num_topics:
raise RuntimeError("invalid alpha shape (must match num_topics)")
self.alpha = self.init_dir_prior(alpha, 'alpha')

if eta is None:
self.eta = 1.0 / num_topics
else:
self.eta = eta
assert self.alpha.shape == (num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), num_topics)

self.optimize_eta = eta == 'auto'
self.eta = self.init_dir_prior(eta, 'eta')

assert (self.eta.shape == (num_topics, 1) or self.eta.shape == (num_topics, self.num_terms)), (
"Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
(str(self.eta.shape), num_topics, num_topics, self.num_terms))

# VB constants
self.iterations = iterations
@@ -314,6 +329,36 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
if corpus is not None:
self.update(corpus)

def init_dir_prior(self, prior, name):
if prior == 'symmetric' or prior is None:
logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
init_prior = numpy.asarray([1.0 / self.num_topics for i in xrange(self.num_topics)])
elif prior == 'asymmetric':
init_prior = numpy.asarray([1.0 / (i + numpy.sqrt(self.num_topics)) for i in xrange(self.num_topics)])
init_prior /= init_prior.sum()
logger.info("using asymmetric %s %s", name, list(init_prior))
elif prior == 'auto':
init_prior = numpy.asarray([1.0 / self.num_topics for i in xrange(self.num_topics)])
logger.info("using autotuned %s, starting with %s", name, list(init_prior))
elif isinstance(prior, list):
init_prior = numpy.asarray(prior)
elif isinstance(prior, numpy.ndarray):
init_prior = prior
elif isinstance(prior, numpy.number) or isinstance(prior, numbers.Real):
init_prior = numpy.asarray([prior] * self.num_topics)
else:
raise ValueError("%s must be either a numpy array of scalars, list of scalars, or scalar" % name)

if name == 'eta':
# please note the difference in shapes between alpha and eta:
# alpha is a row: [0.1, 0.1]
# eta is a column: [[0.1],
# [0.1]]
if init_prior.shape == (self.num_topics,) or init_prior.shape == (1, self.num_topics):
init_prior = init_prior.reshape((self.num_topics, 1)) # this statement throws ValueError if eta did not match self.num_topics

return init_prior

def __str__(self):
return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % \
(self.num_terms, self.num_topics, self.decay, self.chunksize)
@@ -425,34 +470,34 @@ def do_estep(self, chunk, state=None):
state.numdocs += gamma.shape[0] # avoids calling len(chunk) on a generator
return gamma


def update_alpha(self, gammat, rho):
"""
Update parameters for the Dirichlet prior on the per-document
topic weights `alpha` given the last `gammat`.

Uses Newton's method, described in **Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters.**
http://jonathan-huang.org/research/dirichlet/dirichlet.pdf

"""
N = float(len(gammat))
logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N
dalpha = numpy.copy(self.alpha)
gradf = N * (psi(numpy.sum(self.alpha)) - psi(self.alpha) + logphat)

c = N * polygamma(1, numpy.sum(self.alpha))
q = -N * polygamma(1, self.alpha)
self.alpha = update_dir_prior(self.alpha, N, logphat, rho)
logger.info("optimized alpha %s", list(self.alpha))

b = numpy.sum(gradf / q) / (1 / c + numpy.sum(1 / q))
return self.alpha

dalpha = -(gradf - b) / q
def update_eta(self, lambdat, rho):
"""
Update parameters for the Dirichlet prior on the per-topic
word weights `eta` given the last `lambdat`.
"""
if self.eta.shape[1] != 1:
raise ValueError("Can't use update_eta with eta matrices, only column vectors.")
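# each of the num_terms columns of lambdat is treated as one observation
# of a num_topics-dimensional Dirichlet, so N is the vocabulary size and
# logphat the average E[log p] per topic, shaped as a column like eta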
N = float(lambdat.shape[1])
logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat.transpose()) / N).reshape((self.num_topics,1))

if all(rho * dalpha + self.alpha > 0):
self.alpha += rho * dalpha
else:
logger.warning("updated alpha not positive")
logger.info("optimized alpha %s", list(self.alpha))
self.eta = update_dir_prior(self.eta, N, logphat, rho)
logger.info("optimized eta %s", list(self.eta.reshape((self.num_topics))))

return self.alpha
return self.eta

def log_perplexity(self, chunk, total_docs=None):
"""
@@ -629,6 +674,9 @@ def do_mstep(self, rho, other, extra_pass=False):
self.print_topics(5)
logger.info("topic diff=%f, rho=%f", numpy.mean(numpy.abs(diff)), rho)

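# mirror the existing alpha optimization: take one damped Newton step on
# eta from the current topic-word statistics (lambda) at the end of each M-step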
if self.optimize_eta:
self.update_eta(self.state.get_lambda(), rho)

if not extra_pass:
# only update if this isn't an additional pass
self.num_updates += other.numdocs
@@ -846,9 +894,9 @@ def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs):
Save the model to file.

Large internal arrays may be stored into separate files, with `fname` as prefix.

`separately` can be used to define which arrays should be stored in separate files.

`ignore` parameter can be used to define which variables should be ignored, i.e. left
out from the pickled lda model. By default the internal `state` is ignored as it uses
its own serialisation not the one provided by `LdaModel`. The `state` and `dispatcher
Expand All @@ -870,7 +918,7 @@ def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs):
"""
if self.state is not None:
self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)

# make sure 'state' and 'dispatcher' are ignored from the pickled object, even if
# someone sets the ignore list themselves
if ignore is not None and ignore:
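Taken together, these changes mean the topic-word prior can now be learned from data instead of fixed up front. A minimal usage sketch, assuming a `corpus` and `dictionary` built as usual (the `num_topics` and `passes` values are illustrative):

from gensim.models import LdaModel

model = LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=10,
    alpha='auto',  # learn an asymmetric document-topic prior (existing behaviour)
    eta='auto',    # learn the topic-word prior too (added by this PR)
    passes=10)

print(model.alpha)  # row vector, shape (num_topics,)
print(model.eta)    # column vector, shape (num_topics, 1)

Note that, per the test changes below, `alpha='auto'` remains unsupported in LdaMulticore and raises a RuntimeError there.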
131 changes: 131 additions & 0 deletions gensim/test/test_ldamodel.py
@@ -77,6 +77,132 @@ def testTransform(self):
(i, sorted(vec), sorted(expected)))
self.assertTrue(passed)

def testAlphaAuto(self):
model1 = self.class_(corpus, id2word=dictionary, alpha='symmetric', passes=10)
modelauto = self.class_(corpus, id2word=dictionary, alpha='auto', passes=10)

# did we learn something?
self.assertFalse(all(numpy.equal(model1.alpha, modelauto.alpha)))

def testAlpha(self):
kwargs = dict(
id2word=dictionary,
num_topics=2,
alpha=None
)
expected_shape = (2,)

# should not raise anything
self.class_(**kwargs)

kwargs['alpha'] = 'symmetric'
model = self.class_(**kwargs)
self.assertEqual(model.alpha.shape, expected_shape)
self.assertTrue(all(model.alpha == numpy.array([0.5, 0.5])))

kwargs['alpha'] = 'asymmetric'
model = self.class_(**kwargs)
self.assertEqual(model.alpha.shape, expected_shape)
self.assertTrue(numpy.allclose(model.alpha, [0.630602, 0.369398]))

kwargs['alpha'] = 0.3
model = self.class_(**kwargs)
self.assertEqual(model.alpha.shape, expected_shape)
self.assertTrue(all(model.alpha == numpy.array([0.3, 0.3])))

kwargs['alpha'] = 3
model = self.class_(**kwargs)
self.assertEqual(model.alpha.shape, expected_shape)
self.assertTrue(all(model.alpha == numpy.array([3, 3])))

kwargs['alpha'] = [0.3, 0.3]
model = self.class_(**kwargs)
self.assertEqual(model.alpha.shape, expected_shape)
self.assertTrue(all(model.alpha == numpy.array([0.3, 0.3])))

# all should raise an exception for being wrong shape
kwargs['alpha'] = [0.3, 0.3, 0.3]
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['alpha'] = [[0.3], [0.3]]
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['alpha'] = [0.3]
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['alpha'] = "gensim is cool"
self.assertRaises(ValueError, self.class_, **kwargs)


def testEtaAuto(self):
model1 = self.class_(corpus, id2word=dictionary, eta='symmetric', passes=10)
modelauto = self.class_(corpus, id2word=dictionary, eta='auto', passes=10)

# did we learn something?
self.assertFalse(all(numpy.equal(model1.eta, modelauto.eta)))

def testEta(self):
kwargs = dict(
id2word=dictionary,
num_topics=2,
eta=None
)
expected_shape = (2, 1)

# should not raise anything
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == numpy.array([[0.5], [0.5]])))

kwargs['eta'] = 'symmetric'
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == numpy.array([[0.5], [0.5]])))

kwargs['eta'] = 'asymmetric'
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(numpy.allclose(model.eta, [[0.630602], [0.369398]]))

kwargs['eta'] = 0.3
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == numpy.array([[0.3], [0.3]])))

kwargs['eta'] = 3
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == numpy.array([[3], [3]])))

kwargs['eta'] = [[0.3], [0.3]]
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == numpy.array([[0.3], [0.3]])))

kwargs['eta'] = [0.3, 0.3]
model = self.class_(**kwargs)
self.assertEqual(model.eta.shape, expected_shape)
self.assertTrue(all(model.eta == numpy.array([[0.3], [0.3]])))

# should be ok with num_topics x num_terms
testeta = numpy.array([[0.5] * len(dictionary)] * 2)
kwargs['eta'] = testeta
self.class_(**kwargs)

# all should raise an exception for being wrong shape
kwargs['eta'] = testeta.reshape(tuple(reversed(testeta.shape)))
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['eta'] = [0.3, 0.3, 0.3]
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['eta'] = [0.3]
self.assertRaises(AssertionError, self.class_, **kwargs)

kwargs['eta'] = "gensim is cool"
self.assertRaises(ValueError, self.class_, **kwargs)


def testTopTopics(self):
top_topics = self.model.top_topics(self.corpus)

@@ -254,6 +380,11 @@ def setUp(self):
self.class_ = ldamulticore.LdaMulticore
self.model = self.class_(corpus, id2word=dictionary, num_topics=2, passes=100)

# override LdaModel because multicore does not allow alpha=auto
def testAlphaAuto(self):
self.assertRaises(RuntimeError, self.class_, alpha='auto')


#endclass TestLdaMulticore

