Skip to content

Commit

Permalink
Added ind_conf_m explanation, refactoring in prob_est, seg modules
Browse files Browse the repository at this point in the history
  • Loading branch information
devashishd12 committed Aug 7, 2016
1 parent 20e2d6d commit 0ca2672
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 16 deletions.
7 changes: 6 additions & 1 deletion gensim/topic_coherence/indirect_confirmation_measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
This module contains functions to compute confirmation on a pair of words or word subsets. The advantage of an indirect
confirmation measure is that it computes the similarity of words in W' and W* with respect to direct confirmations to all words.
E.g., suppose x and z are both competing brands of cars, which semantically support each other. However, both brands are
seldom mentioned together in documents in the reference corpus. But their confirmations to other words like “road”
or “speed” do strongly correlate. This would be reflected by an indirect confirmation measure. Thus, indirect confirmation
measures may capture semantic support that direct measures would miss.
The formula used to compute indirect confirmation measure is:
Expand Down
17 changes: 9 additions & 8 deletions gensim/topic_coherence/probability_estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,25 +78,26 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size):
window_id[0] : Total no of windows
"""
top_ids = _ret_top_ids(segmented_topics)
window_id = [0] # Each window assigned a window id.
window_id = 0 # Each window assigned a window id.
per_topic_postings = {}
token2id_dict = dictionary.token2id
def add_topic_posting():
def add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict):
for word in window:
word_id = token2id_dict[word]
if word_id in top_ids:
if word_id in per_topic_postings:
per_topic_postings[word_id].add(window_id[0])
per_topic_postings[word_id].add(window_id)
else:
per_topic_postings[word_id] = set([window_id[0]])
window_id[0] += 1
per_topic_postings[word_id] = set([window_id])
window_id += 1
return (window_id, per_topic_postings)
# Apply boolean sliding window to each document in texts.
for document in texts:
it = iter(document)
window = tuple(islice(it, window_size))
add_topic_posting()
window_id, per_topic_postings = add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict)
for elem in it:
window = window[1:] + (elem,)
add_topic_posting()
window_id, per_topic_postings = add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict)

return (per_topic_postings, window_id[0])
return per_topic_postings, window_id
11 changes: 4 additions & 7 deletions gensim/topic_coherence/segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,8 @@ def s_one_pre(topics):

for top_words in topics:
s_one_pre_t = []
for w_prime in top_words[1:]:
w_prime_index = int(np.where(top_words == w_prime)[0]) # To get index of w_prime in top_words
for w_star in top_words[:w_prime_index]:
for w_prime_index, w_prime in enumerate(top_words[1:]):
for w_star in top_words[:w_prime_index + 1]:
s_one_pre_t.append((w_prime, w_star))
s_one_pre.append(s_one_pre_t)

Expand Down Expand Up @@ -67,10 +66,8 @@ def s_one_one(topics):

for top_words in topics:
s_one_one_t = []
for w_prime in top_words:
w_prime_index = int(np.where(top_words == int(w_prime))[0]) # To get index of w_prime in top_words
for w_star in top_words:
w_star_index = int(np.where(top_words == int(w_star))[0])
for w_prime_index, w_prime in enumerate(top_words):
for w_star_index, w_star in enumerate(top_words):
if w_prime_index == w_star_index:
continue
else:
Expand Down

0 comments on commit 0ca2672

Please sign in to comment.