Skip to content

Commit

Permalink
piskvorky#1380: Add tests for with_std option for confirmation measures, and add test case to sanity check `word2vec_similarity`.
Browse files Browse the repository at this point in the history
  • Loading branch information
Sweeney, Mack committed Jun 14, 2017
1 parent 60a2130 commit 60096e1
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 12 deletions.
21 changes: 18 additions & 3 deletions gensim/test/test_direct_confirmation.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,23 +37,38 @@ def testLogConditionalProbability(self):
self.segmentation, self.accumulator)[0]
# Answer should be ~ ln(1 / 2) = -0.693147181
expected = -0.693147181
self.assertAlmostEqual(obtained, expected)
self.assertAlmostEqual(expected, obtained)

mean, std = direct_confirmation_measure.log_conditional_probability(
self.segmentation, self.accumulator, with_std=True)[0]
self.assertAlmostEqual(expected, mean)
self.assertEqual(0.0, std)

def testLogRatioMeasure(self):
"""Test log_ratio_measure()"""
obtained = direct_confirmation_measure.log_ratio_measure(
self.segmentation, self.accumulator)[0]
# Answer should be ~ ln{(1 / 5) / [(3 / 5) * (2 / 5)]} = -0.182321557
expected = -0.182321557
self.assertAlmostEqual(obtained, expected)
self.assertAlmostEqual(expected, obtained)

mean, std = direct_confirmation_measure.log_ratio_measure(
self.segmentation, self.accumulator, with_std=True)[0]
self.assertAlmostEqual(expected, mean)
self.assertEqual(0.0, std)

def testNormalizedLogRatioMeasure(self):
"""Test normalized_log_ratio_measure()"""
obtained = direct_confirmation_measure.log_ratio_measure(
self.segmentation, self.accumulator, normalize=True)[0]
# Answer should be ~ -0.182321557 / -ln(1 / 5) = -0.113282753
expected = -0.113282753
self.assertAlmostEqual(obtained, expected)
self.assertAlmostEqual(expected, obtained)

mean, std = direct_confirmation_measure.log_ratio_measure(
self.segmentation, self.accumulator, normalize=True, with_std=True)[0]
self.assertAlmostEqual(expected, mean)
self.assertEqual(0.0, std)


if __name__ == '__main__':
Expand Down
31 changes: 25 additions & 6 deletions gensim/test/test_indirect_confirmation.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,16 @@ def setUp(self):
self.gamma = 1
self.measure = 'nlr'

dictionary = Dictionary()
dictionary.id2token = {1: 'fake', 2: 'tokens'}
self.accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
self.accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
self.accumulator._num_docs = 5
self.dictionary = Dictionary()
self.dictionary.id2token = {1: 'fake', 2: 'tokens'}

def testCosineSimilarity(self):
"""Test cosine_similarity()"""
accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, self.dictionary)
accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
accumulator._num_docs = 5
obtained = indirect_confirmation_measure.cosine_similarity(
self.segmentation, self.accumulator, self.topics, self.measure, self.gamma)
self.segmentation, accumulator, self.topics, self.measure, self.gamma)

# The steps involved in this calculation are as follows:
# 1. Take (1, array([1, 2])). Take w' which is 1.
Expand All @@ -49,6 +49,25 @@ def testCosineSimilarity(self):
expected = (0.6230 + 0.6230) / 2. # To account for EPSILON approximation
self.assertAlmostEqual(expected, obtained[0], 4)

mean, std = indirect_confirmation_measure.cosine_similarity(
self.segmentation, accumulator, self.topics, self.measure, self.gamma,
with_std=True)[0]
self.assertAlmostEqual(expected, mean, 4)
self.assertAlmostEqual(0.0, std, 1)

def testWord2VecSimilarity(self):
"""Sanity check word2vec_similarity."""
accumulator = text_analysis.WordVectorsAccumulator({1, 2}, self.dictionary)
accumulator.accumulate([
['fake', 'tokens'],
['tokens', 'fake']
], 5)

mean, std = indirect_confirmation_measure.word2vec_similarity(
self.segmentation, accumulator, with_std=True)[0]
self.assertNotEqual(0.0, mean)
self.assertNotEqual(0.0, std)


if __name__ == '__main__':
logging.root.setLevel(logging.WARNING)
Expand Down
6 changes: 3 additions & 3 deletions gensim/topic_coherence/indirect_confirmation_measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def word2vec_similarity(segmented_topics, accumulator, with_std=False):
-------
topic_coherences : list of word2vec cosine similarities per topic, or of (mean, std) tuples per topic when `with_std` is True.
"""
topic_coherences = np.zeros(len(segmented_topics))
topic_coherences = []
for i, topic_segments in enumerate(segmented_topics):
segment_similarities = []
for w_prime, w_star in topic_segments:
Expand All @@ -67,9 +67,9 @@ def word2vec_similarity(segmented_topics, accumulator, with_std=False):
logger.warn("at least one topic word not in word2vec model")

if with_std:
topic_coherences[i] = (np.mean(segment_similarities), np.std(segment_similarities))
topic_coherences.append((np.mean(segment_similarities), np.std(segment_similarities)))
else:
topic_coherences[i] = np.mean(segment_similarities)
topic_coherences.append(np.mean(segment_similarities))

return topic_coherences

Expand Down

0 comments on commit 60096e1

Please sign in to comment.