Fix light linting issues in LdaSeqModel #2360

Merged · 3 commits · Jan 29, 2019
Changes from all commits
120 changes: 58 additions & 62 deletions gensim/models/ldaseqmodel.py
@@ -58,6 +58,7 @@
from scipy.special import digamma, gammaln
from scipy import optimize
import logging
from six.moves import range, zip

logger = logging.getLogger(__name__)
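A quick aside on the new `from six.moves import range, zip` line above: on Python 2 these names resolve to the lazy `xrange` and `itertools.izip`, so the many loops below behave like their Python 3 counterparts. A minimal sketch, assuming `six` is installed:

```python
from six.moves import range, zip

# On Python 2 this range is xrange, so no million-element list is built;
# on Python 3 it is simply the builtin range.
total = 0
for i in range(10 ** 6):
    total += i

# Likewise, zip yields pairs lazily on both majors.
pairs = zip(range(3), "abc")
print(list(pairs))  # [(0, 'a'), (1, 'b'), (2, 'c')]
```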

@@ -126,7 +127,7 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
self.id2word = utils.dict_from_corpus(corpus)
self.vocab_len = len(self.id2word)
elif len(self.id2word) > 0:
elif self.id2word:
self.vocab_len = len(self.id2word)
else:
self.vocab_len = 0
@@ -142,12 +143,6 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
if self.time_slice is not None:
self.num_time_slices = len(time_slice)

max_doc_len = 0
for line_no, line in enumerate(corpus):
if len(line) > max_doc_len:
max_doc_len = len(line)
self.max_doc_len = max_doc_len

self.num_topics = num_topics
self.num_time_slices = len(time_slice)
self.alphas = np.full(num_topics, alphas)
@@ -157,7 +152,7 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
# the sslm class is described below and contains information
# on topic-word probabilities and doc-topic probabilities.
self.topic_chains = []
for topic in range(0, num_topics):
for topic in range(num_topics):
sslm_ = sslm(
num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics,
chain_variance=chain_variance, obs_variance=obs_variance
@@ -172,6 +167,8 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_

# if a corpus and time_slice is provided, depending on the user choice of initializing LDA, we start DTM.
if corpus is not None and time_slice is not None:
self.max_doc_len = max(len(line) for line in corpus)

if initialize == 'gensim':
lda_model = ldamodel.LdaModel(
corpus, id2word=self.id2word, num_topics=self.num_topics,
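For reference, a small sketch of the `max_doc_len` rewrite in this hunk, using an invented bag-of-words corpus (the contents are made up for illustration):

```python
# Each document is a list of (word_id, count) pairs; its length is the
# number of distinct terms, which is what max_doc_len tracks.
corpus = [[(0, 1)], [(0, 2), (1, 1), (2, 3)], [(1, 1), (2, 1)]]

# One generator expression replaces the old four-line counting loop.
max_doc_len = max(len(line) for line in corpus)
print(max_doc_len)  # 3
```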
@@ -268,12 +265,12 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter,

# initiate sufficient statistics
topic_suffstats = []
for topic in range(0, num_topics):
topic_suffstats.append(np.resize(np.zeros(vocab_len * data_len), (vocab_len, data_len)))
for topic in range(num_topics):
topic_suffstats.append(np.zeros((vocab_len, data_len)))

# set up variables
gammas = np.resize(np.zeros(corpus_len * num_topics), (corpus_len, num_topics))
lhoods = np.resize(np.zeros(corpus_len * num_topics + 1), (corpus_len, num_topics + 1))
gammas = np.zeros((corpus_len, num_topics))
lhoods = np.zeros((corpus_len, num_topics + 1))
# compute the likelihood of a sequential corpus under an LDA
# seq model and find the evidence lower bound. This is the E-step.
bound, gammas = \
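A sketch (not part of the PR) showing why dropping the `np.resize(np.zeros(...))` dance in this hunk is safe; the toy dimensions are arbitrary:

```python
import numpy as np

vocab_len, data_len = 4, 3

# Old idiom: allocate a flat zero vector, then reshape it via np.resize.
old = np.resize(np.zeros(vocab_len * data_len), (vocab_len, data_len))

# New idiom: request the 2-D zero array directly.
new = np.zeros((vocab_len, data_len))

assert old.shape == new.shape and (old == new).all()
```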
@@ -346,7 +343,7 @@ def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods,
bound = 0.0

lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64)
lda.topics = np.array(np.split(np.zeros(vocab_len * num_topics), vocab_len))
lda.topics = np.zeros((vocab_len, num_topics))
ldapost = LdaPost(max_doc_len=self.max_doc_len, num_topics=num_topics, lda=lda)

model = "DTM"
@@ -460,8 +457,8 @@ def make_lda_seq_slice(self, lda, time):
The stationary model updated to reflect the passed time slice.

"""
for k in range(0, self.num_topics):
lda.topics[:, k] = np.copy(self.topic_chains[k].e_log_prob[:, time])
for k in range(self.num_topics):
lda.topics[:, k] = self.topic_chains[k].e_log_prob[:, time]

lda.alpha = np.copy(self.alphas)
return lda
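The dropped `np.copy` above is safe because NumPy slice assignment already copies the values into `lda.topics`; a minimal demonstration, with made-up shapes:

```python
import numpy as np

topics = np.zeros((3, 2))
column = np.arange(3.0)

topics[:, 0] = column  # slice assignment copies the values
column[0] = 99.0       # later mutation of the source...

print(topics[:, 0])    # [0. 1. 2.]  ...does not leak into topics
```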
@@ -507,7 +504,7 @@ def print_topic_times(self, topic, top_terms=20):

"""
topics = []
for time in range(0, self.num_time_slices):
for time in range(self.num_time_slices):
topics.append(self.print_topic(topic, time, top_terms))

return topics
@@ -530,7 +527,7 @@ def print_topics(self, time=0, top_terms=20):
probability.

"""
return [self.print_topic(topic, time, top_terms) for topic in range(0, self.num_topics)]
return [self.print_topic(topic, time, top_terms) for topic in range(self.num_topics)]

def print_topic(self, topic, time=0, top_terms=20):
"""Get the list of words most relevant to the given topic.
@@ -578,8 +575,7 @@ def doc_topics(self, doc_number):
Probability for each topic in the mixture (essentially a point in the `self.num_topics - 1` simplex).

"""
doc_topic = np.copy(self.gammas)
doc_topic /= doc_topic.sum(axis=1)[:, np.newaxis]
doc_topic = self.gammas / self.gammas.sum(axis=1)[:, np.newaxis]
return doc_topic[doc_number]
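The one-liner above folds the copy and the in-place divide into a single broadcasted expression; a hedged sketch with toy gammas:

```python
import numpy as np

gammas = np.array([[2.0, 6.0],
                   [1.0, 3.0]])

# sum(axis=1)[:, np.newaxis] has shape (2, 1), so broadcasting divides
# each row by its own total without an intermediate np.copy.
doc_topic = gammas / gammas.sum(axis=1)[:, np.newaxis]

print(doc_topic.sum(axis=1))  # [1. 1.] -- each row is a distribution
```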

def dtm_vis(self, time, corpus):
Expand Down Expand Up @@ -608,22 +604,25 @@ def dtm_vis(self, time, corpus):
The set of unique terms existing in the corpus's vocabulary.

"""
doc_topic = np.copy(self.gammas)
doc_topic /= doc_topic.sum(axis=1)[:, np.newaxis]
doc_topic = self.gammas / self.gammas.sum(axis=1)[:, np.newaxis]

def normalize(x):
return x / x.sum()

topic_term = [
np.exp(np.transpose(chain.e_log_prob)[time]) / np.exp(np.transpose(chain.e_log_prob)[time]).sum()
normalize(np.exp(chain.e_log_prob.T[time]))
for k, chain in enumerate(self.topic_chains)
]

doc_lengths = [len(doc) for doc_no, doc in enumerate(corpus)]

doc_lengths = []
term_frequency = np.zeros(self.vocab_len)
for doc_no, doc in enumerate(corpus):
for pair in doc:
term_frequency[pair[0]] += pair[1]
doc_lengths.append(len(doc))

for term, freq in doc:
term_frequency[term] += freq

vocab = [self.id2word[i] for i in range(0, len(self.id2word))]
vocab = [self.id2word[i] for i in range(len(self.id2word))]

return doc_topic, np.array(topic_term), doc_lengths, term_frequency, vocab
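For the single-pass rewrite of `doc_lengths` and `term_frequency` above, a small sketch with an invented two-document corpus:

```python
import numpy as np

vocab_len = 5
corpus = [[(0, 2), (3, 1)], [(1, 4), (3, 2), (4, 1)]]

doc_lengths = []
term_frequency = np.zeros(vocab_len)
for doc in corpus:
    doc_lengths.append(len(doc))
    for term, freq in doc:        # tuple unpacking replaces pair[0]/pair[1]
        term_frequency[term] += freq

print(doc_lengths)     # [2, 3]
print(term_frequency)  # [2. 4. 0. 3. 1.]
```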

@@ -668,13 +667,13 @@ def __getitem__(self, doc):
Probabilities for each topic in the mixture. This is essentially a point in the `num_topics - 1` simplex.

"""
lda_model = \
ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64)
lda_model.topics = np.array(np.split(np.zeros(self.vocab_len * self.num_topics), self.vocab_len))
lda_model = ldamodel.LdaModel(
num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64)
lda_model.topics = np.zeros((self.vocab_len, self.num_topics))
ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc)

time_lhoods = []
for time in range(0, self.num_time_slices):
for time in range(self.num_time_slices):
lda_model = self.make_lda_seq_slice(lda_model, time) # create lda_seq slice
lhood = LdaPost.fit_lda_post(ldapost, 0, time, self)
time_lhoods.append(lhood)
@@ -706,12 +705,12 @@ def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_va
self.num_topics = num_topics

# setting up matrices
self.obs = np.array(np.split(np.zeros(num_time_slices * vocab_len), vocab_len))
self.e_log_prob = np.array(np.split(np.zeros(num_time_slices * vocab_len), vocab_len))
self.mean = np.array(np.split(np.zeros((num_time_slices + 1) * vocab_len), vocab_len))
self.fwd_mean = np.array(np.split(np.zeros((num_time_slices + 1) * vocab_len), vocab_len))
self.fwd_variance = np.array(np.split(np.zeros((num_time_slices + 1) * vocab_len), vocab_len))
self.variance = np.array(np.split(np.zeros((num_time_slices + 1) * vocab_len), vocab_len))
self.obs = np.zeros((vocab_len, num_time_slices))
self.e_log_prob = np.zeros((vocab_len, num_time_slices))
self.mean = np.zeros((vocab_len, num_time_slices + 1))
self.fwd_mean = np.zeros((vocab_len, num_time_slices + 1))
self.fwd_variance = np.zeros((vocab_len, num_time_slices + 1))
self.variance = np.zeros((vocab_len, num_time_slices + 1))
self.zeta = np.zeros(num_time_slices)

# the following are class variables which are to be integrated during Document Influence Model
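The matrix setup above replaces the `np.split` round-trip with direct allocation; a sketch confirming the shapes match, using arbitrary toy sizes:

```python
import numpy as np

vocab_len, num_time_slices = 6, 4

# Old: a flat zero vector split into vocab_len chunks, re-stacked as rows.
old = np.array(np.split(np.zeros(num_time_slices * vocab_len), vocab_len))

# New: the (vocab_len, num_time_slices) zero matrix, asked for directly.
new = np.zeros((vocab_len, num_time_slices))

assert old.shape == new.shape == (vocab_len, num_time_slices)
```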
@@ -896,9 +895,9 @@ def sslm_counts_init(self, obs_variance, chain_variance, sstats):
T = self.num_time_slices

log_norm_counts = np.copy(sstats)
log_norm_counts = log_norm_counts / sum(log_norm_counts)
log_norm_counts = log_norm_counts + 1.0 / W
log_norm_counts = log_norm_counts / sum(log_norm_counts)
log_norm_counts /= sum(log_norm_counts)
log_norm_counts += 1.0 / W
log_norm_counts /= sum(log_norm_counts)
log_norm_counts = np.log(log_norm_counts)

# setting variational observations to transformed counts
@@ -908,7 +907,7 @@ def sslm_counts_init(self, obs_variance, chain_variance, sstats):
self.chain_variance = chain_variance

# compute post variance, mean
for w in range(0, W):
for w in range(W):
self.variance[w], self.fwd_variance[w] = self.compute_post_variance(w, self.chain_variance)
self.mean[w], self.fwd_mean[w] = self.compute_post_mean(w, self.chain_variance)
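The augmented-assignment cleanup in this hunk keeps the same smooth-and-renormalize logic; a sketch with toy sufficient statistics:

```python
import numpy as np

W = 4
sstats = np.array([3.0, 1.0, 0.0, 0.0])

log_norm_counts = np.copy(sstats)
log_norm_counts /= sum(log_norm_counts)  # normalize counts to probabilities
log_norm_counts += 1.0 / W               # smooth away exact zeros
log_norm_counts /= sum(log_norm_counts)  # renormalize after smoothing
log_norm_counts = np.log(log_norm_counts)

print(np.exp(log_norm_counts).sum())  # ~1.0, still a valid distribution
```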

@@ -944,7 +943,7 @@ def fit_sslm(self, sstats):

# computing variance, fwd_variance
self.variance, self.fwd_variance = \
(np.array(x) for x in list(zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)])))
(np.array(x) for x in zip(*(self.compute_post_variance(w, self.chain_variance) for w in range(W))))

# column sum of sstats
totals = sstats.sum(axis=0)
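The `zip(*...)` idiom above transposes an iterable of `(variance, fwd_variance)` pairs into two sequences, one per quantity; a self-contained sketch with a hypothetical stand-in for `compute_post_variance`:

```python
import numpy as np

def compute_post_variance(w):
    # Hypothetical stand-in: returns a (variance, fwd_variance) pair per word.
    return np.full(3, float(w)), np.full(3, float(w) + 0.5)

W = 4
variance, fwd_variance = (
    np.array(x) for x in zip(*(compute_post_variance(w) for w in range(W)))
)

print(variance.shape, fwd_variance.shape)  # (4, 3) (4, 3)
```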
@@ -1006,19 +1005,18 @@ def compute_bound(self, sstats, totals):
chain_variance = self.chain_variance
# computing mean, fwd_mean
self.mean, self.fwd_mean = \
(np.array(x) for x in zip(*[self.compute_post_mean(w, self.chain_variance) for w in range(0, w)]))
(np.array(x) for x in zip(*(self.compute_post_mean(w, self.chain_variance) for w in range(w))))
self.zeta = self.update_zeta()

for w in range(0, w):
val += (self.variance[w][0] - self.variance[w][t]) / 2 * chain_variance
val = sum(self.variance[w][0] - self.variance[w][t] for w in range(w)) / 2 * chain_variance

logger.info("Computing bound, all times")

for t in range(1, t + 1):
term_1 = 0.0
term_2 = 0.0
ent = 0.0
for w in range(0, w):
for w in range(w):

m = self.mean[w][t]
prev_m = self.mean[w][t - 1]
@@ -1071,14 +1069,14 @@ def update_obs(self, sstats, totals):
T = self.num_time_slices

runs = 0
mean_deriv_mtx = np.resize(np.zeros(T * (T + 1)), (T, T + 1))
mean_deriv_mtx = np.zeros((T, T + 1))

norm_cutoff_obs = None
for w in range(0, W):
for w in range(W):
w_counts = sstats[w]
counts_norm = 0
# now we find L2 norm of w_counts
for i in range(0, len(w_counts)):
for i in range(len(w_counts)):
counts_norm += w_counts[i] * w_counts[i]

counts_norm = np.sqrt(counts_norm)
@@ -1091,10 +1089,8 @@ def update_obs(self, sstats, totals):
w_counts = np.zeros(len(w_counts))

# TODO: apply lambda function
for t in range(0, T):
mean_deriv = mean_deriv_mtx[t]
mean_deriv = self.compute_mean_deriv(w, t, mean_deriv)
mean_deriv_mtx[t] = mean_deriv
for t in range(T):
mean_deriv_mtx[t] = self.compute_mean_deriv(w, t, mean_deriv_mtx[t])

deriv = np.zeros(T)
args = self, w_counts, totals, mean_deriv_mtx, w, deriv
@@ -1207,10 +1203,10 @@ def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv):
# temp_vector holds temporary zeta values
self.temp_vect = np.zeros(T)

for u in range(0, T):
for u in range(T):
self.temp_vect[u] = np.exp(mean[u + 1] + variance[u + 1] / 2)

for t in range(0, T):
for t in range(T):
mean_deriv = mean_deriv_mtx[t]
term1 = 0
term2 = 0
@@ -1280,8 +1276,8 @@ def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma=
self.lhood = np.zeros(num_topics + 1)

if max_doc_len is not None and num_topics is not None:
self.phi = np.resize(np.zeros(max_doc_len * num_topics), (max_doc_len, num_topics))
self.log_phi = np.resize(np.zeros(max_doc_len * num_topics), (max_doc_len, num_topics))
self.phi = np.zeros((max_doc_len, num_topics))
self.log_phi = np.zeros((max_doc_len, num_topics))

# the following are class variables which are to be integrated during Document Influence Model

@@ -1314,12 +1310,12 @@ def update_phi(self, doc_number, time):
# digamma values
dig = np.zeros(num_topics)

for k in range(0, num_topics):
for k in range(num_topics):
dig[k] = digamma(self.gamma[k])

n = 0 # keep track of iterations for phi, log_phi
for word_id, count in self.doc:
for k in range(0, num_topics):
for k in range(num_topics):
self.log_phi[n][k] = dig[k] + self.lda.topics[word_id][k]

log_phi_row = self.log_phi[n]
@@ -1355,7 +1351,7 @@ def update_gamma(self):
n = 0 # keep track of number of iterations for phi, log_phi
for word_id, count in self.doc:
phi_row = self.phi[n]
for k in range(0, self.lda.num_topics):
for k in range(self.lda.num_topics):
self.gamma[k] += phi_row[k] * count
n += 1

@@ -1392,7 +1388,7 @@ def compute_lda_lhood(self):
digsum = digamma(gamma_sum)

model = "DTM" # noqa:F841
for k in range(0, num_topics):
for k in range(num_topics):
# below code only to be used in DIM mode
# if ldapost.doc_weight is not None and (model == "DIM" or model == "fixed"):
# influence_topic = ldapost.doc_weight[k]
@@ -1518,7 +1514,7 @@ def update_lda_seq_ss(self, time, doc, topic_suffstats):
"""
num_topics = self.lda.num_topics

for k in range(0, num_topics):
for k in range(num_topics):
topic_ss = topic_suffstats[k]
n = 0
for word_id, count in self.doc: