diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 442e9ca07a..9954abe75f 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -159,6 +159,7 @@ from __future__ import division # py3 "true division" from collections import deque +from itertools import chain import logging try: @@ -1144,8 +1145,8 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi total = { 'section': 'Total accuracy', - 'correct': sum((s['correct'] for s in sections), []), - 'incorrect': sum((s['incorrect'] for s in sections), []), + 'correct': list(chain.from_iterable(s['correct'] for s in sections)), + 'incorrect': list(chain.from_iterable(s['incorrect'] for s in sections)), } oov_ratio = float(oov) / quadruplets_no * 100 @@ -1250,8 +1251,8 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c total = { 'section': 'total', - 'correct': sum((s['correct'] for s in sections), []), - 'incorrect': sum((s['incorrect'] for s in sections), []), + 'correct': list(chain.from_iterable(s['correct'] for s in sections)), + 'incorrect': list(chain.from_iterable(s['incorrect'] for s in sections)), } self.log_accuracy(total) sections.append(total) diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index 6c9487eb37..6639be5d8e 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -52,6 +52,7 @@ import tempfile import xml.etree.ElementTree as et import zipfile +from itertools import chain import numpy from smart_open import smart_open @@ -222,9 +223,9 @@ def corpus2mallet(self, corpus, file_like): """ for docno, doc in enumerate(corpus): if self.id2word: - tokens = sum(([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc), []) + tokens = chain.from_iterable([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc) else: - tokens = sum(([str(tokenid)] * int(cnt) for tokenid, cnt in doc), []) + tokens = 
chain.from_iterable([str(tokenid)] * int(cnt) for tokenid, cnt in doc) file_like.write(utils.to_utf8("%s 0 %s\n" % (docno, ' '.join(tokens)))) def convert_input(self, corpus, infer=False, serialize_corpus=True): diff --git a/gensim/test/test_corpora_dictionary.py b/gensim/test/test_corpora_dictionary.py index e0b8d1e426..13a16a3cd1 100644 --- a/gensim/test/test_corpora_dictionary.py +++ b/gensim/test/test_corpora_dictionary.py @@ -9,6 +9,7 @@ from collections import Mapping +from itertools import chain import logging import unittest import codecs @@ -258,7 +259,7 @@ def test_from_corpus(self): for document in documents] # remove words that appear only once - all_tokens = sum(texts, []) + all_tokens = list(chain.from_iterable(texts)) tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1) texts = [[word for word in text if word not in tokens_once] for text in texts] diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 9cce7e6fa9..7a7ef31262 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -593,12 +593,18 @@ def testLocking(self): self.assertFalse((unlocked1 == model.wv.vectors[1]).all()) # unlocked vector should vary self.assertTrue((locked0 == model.wv.vectors[0]).all()) # locked vector should not vary - def testAccuracy(self): - """Test Word2Vec accuracy and KeyedVectors accuracy give the same result""" + def testEvaluateWordAnalogies(self): + """Test that evaluating analogies on KeyedVectors gives sane results""" model = word2vec.Word2Vec(LeeCorpus()) - w2v_accuracy = model.wv.evaluate_word_analogies(datapath('questions-words.txt')) - kv_accuracy = model.wv.evaluate_word_analogies(datapath('questions-words.txt')) - self.assertEqual(w2v_accuracy, kv_accuracy) + score, sections = model.wv.evaluate_word_analogies(datapath('questions-words.txt')) + self.assertGreaterEqual(score, 0.0) + self.assertLessEqual(score, 1.0) + self.assertGreater(len(sections), 0) + # Check that the first section dict contains 
the expected keys + first_section = sections[0] + self.assertIn('section', first_section) + self.assertIn('correct', first_section) + self.assertIn('incorrect', first_section) def testEvaluateWordPairs(self): """Test Spearman and Pearson correlation coefficients give sane results on similarity datasets"""