Skip to content

Commit

Permalink
Added ind_conf_m explanation, refactoring in prob_est, seg modules
Browse files Browse the repository at this point in the history
  • Loading branch information
devashishd12 committed Aug 7, 2016
1 parent 20e2d6d commit 0ca2672
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 16 deletions.
7 changes: 6 additions & 1 deletion gensim/topic_coherence/indirect_confirmation_measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
This module contains functions to compute confirmation on a pair of words or word subsets. The advantage of an indirect
confirmation measure is that it computes the similarity of words in W' and W* with respect to direct confirmations to all words.
E.g., suppose x and z are both competing brands of cars, which semantically support each other. However, both brands are
seldom mentioned together in documents in the reference corpus. But their confirmations to other words like “road”
or “speed” do strongly correlate. This would be reflected by an indirect confirmation measure. Thus, indirect confirmation
measures may capture semantic support that direct measures would miss.
The formula used to compute indirect confirmation measure is:
Expand Down
17 changes: 9 additions & 8 deletions gensim/topic_coherence/probability_estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,25 +78,26 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size):
window_id[0] : Total no of windows
"""
top_ids = _ret_top_ids(segmented_topics)
window_id = [0] # Each window assigned a window id.
window_id = 0 # Each window assigned a window id.
per_topic_postings = {}
token2id_dict = dictionary.token2id
def add_topic_posting():
def add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict):
for word in window:
word_id = token2id_dict[word]
if word_id in top_ids:
if word_id in per_topic_postings:
per_topic_postings[word_id].add(window_id[0])
per_topic_postings[word_id].add(window_id)
else:
per_topic_postings[word_id] = set([window_id[0]])
window_id[0] += 1
per_topic_postings[word_id] = set([window_id])
window_id += 1
return (window_id, per_topic_postings)
# Apply boolean sliding window to each document in texts.
for document in texts:
it = iter(document)
window = tuple(islice(it, window_size))
add_topic_posting()
window_id, per_topic_postings = add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict)
for elem in it:
window = window[1:] + (elem,)
add_topic_posting()
window_id, per_topic_postings = add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict)

return (per_topic_postings, window_id[0])
return per_topic_postings, window_id
11 changes: 4 additions & 7 deletions gensim/topic_coherence/segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,8 @@ def s_one_pre(topics):

for top_words in topics:
s_one_pre_t = []
for w_prime in top_words[1:]:
w_prime_index = int(np.where(top_words == w_prime)[0]) # To get index of w_prime in top_words
for w_star in top_words[:w_prime_index]:
for w_prime_index, w_prime in enumerate(top_words[1:]):
for w_star in top_words[:w_prime_index + 1]:
s_one_pre_t.append((w_prime, w_star))
s_one_pre.append(s_one_pre_t)

Expand Down Expand Up @@ -67,10 +66,8 @@ def s_one_one(topics):

for top_words in topics:
s_one_one_t = []
for w_prime in top_words:
w_prime_index = int(np.where(top_words == int(w_prime))[0]) # To get index of w_prime in top_words
for w_star in top_words:
w_star_index = int(np.where(top_words == int(w_star))[0])
for w_prime_index, w_prime in enumerate(top_words):
for w_star_index, w_star in enumerate(top_words):
if w_prime_index == w_star_index:
continue
else:
Expand Down

0 comments on commit 0ca2672

Please sign in to comment.