Use itertools.chain instead of sum to concatenate lists (#2212)
* use itertools.chain to concatenate lists

* concatenate lists with chain instead of sum
Stigjb authored and menshikh-iv committed Oct 5, 2018
1 parent 2891861 commit 5934b13
Showing 4 changed files with 21 additions and 12 deletions.
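
For context, a minimal sketch of the pattern this commit swaps out (not part of the commit; the sections data below is invented for illustration). sum(iterable_of_lists, []) rebuilds its accumulator list on every addition, so it scales quadratically with the total number of elements, while itertools.chain.from_iterable walks each sub-list exactly once:

# Illustrative sketch only -- data invented, not taken from the diff below.
from itertools import chain

sections = [
    {'correct': ['a', 'b'], 'incorrect': ['c']},
    {'correct': ['d'], 'incorrect': []},
]

# Before: sum() with a [] start value copies the growing accumulator
# for every section, which is quadratic overall.
correct_before = sum((s['correct'] for s in sections), [])

# After: chain.from_iterable() visits each sub-list once (linear),
# and list() materializes the result.
correct_after = list(chain.from_iterable(s['correct'] for s in sections))

assert correct_before == correct_after == ['a', 'b', 'd']

Both forms produce the same list; only the running time differs, which is why the result is still wrapped in list() wherever a real list is needed.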
gensim/models/keyedvectors.py: 9 changes (5 additions, 4 deletions)
@@ -159,6 +159,7 @@
 from __future__ import division # py3 "true division"
 
 from collections import deque
+from itertools import chain
 import logging
 
 try:
@@ -1144,8 +1145,8 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi

         total = {
             'section': 'Total accuracy',
-            'correct': sum((s['correct'] for s in sections), []),
-            'incorrect': sum((s['incorrect'] for s in sections), []),
+            'correct': list(chain.from_iterable(s['correct'] for s in sections)),
+            'incorrect': list(chain.from_iterable(s['incorrect'] for s in sections)),
         }
 
         oov_ratio = float(oov) / quadruplets_no * 100
@@ -1250,8 +1251,8 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c

         total = {
             'section': 'total',
-            'correct': sum((s['correct'] for s in sections), []),
-            'incorrect': sum((s['incorrect'] for s in sections), []),
+            'correct': list(chain.from_iterable(s['correct'] for s in sections)),
+            'incorrect': list(chain.from_iterable(s['incorrect'] for s in sections)),
         }
         self.log_accuracy(total)
         sections.append(total)
gensim/models/wrappers/ldamallet.py: 5 changes (3 additions, 2 deletions)
@@ -52,6 +52,7 @@
 import tempfile
 import xml.etree.ElementTree as et
 import zipfile
+from itertools import chain
 
 import numpy
 from smart_open import smart_open
@@ -222,9 +223,9 @@ def corpus2mallet(self, corpus, file_like):
"""
for docno, doc in enumerate(corpus):
if self.id2word:
tokens = sum(([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc), [])
tokens = chain.from_iterable([self.id2word[tokenid]] * int(cnt) for tokenid, cnt in doc)
else:
tokens = sum(([str(tokenid)] * int(cnt) for tokenid, cnt in doc), [])
tokens = chain.from_iterable([str(tokenid)] * int(cnt) for tokenid, cnt in doc)
file_like.write(utils.to_utf8("%s 0 %s\n" % (docno, ' '.join(tokens))))

def convert_input(self, corpus, infer=False, serialize_corpus=True):
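
One nuance in the corpus2mallet change above: the chain object is not wrapped in list(); it stays a lazy iterator that ' '.join(tokens) consumes directly, so no intermediate token list is built. A small sketch under invented data (the doc and id2word values below are placeholders, not taken from gensim):

# Sketch with placeholder data; mirrors the corpus2mallet pattern above.
from itertools import chain

doc = [(0, 2), (3, 1)]                    # (token id, count) pairs
id2word = {0: 'human', 3: 'computer'}     # hypothetical id -> word mapping

tokens = chain.from_iterable([id2word[tokenid]] * int(cnt) for tokenid, cnt in doc)
line = ' '.join(tokens)                   # consumes the iterator lazily
assert line == 'human human computer'

# An iterator can only be traversed once; after the join it is exhausted.
assert list(tokens) == []

Because the joined string is built in the same statement as the iterator, the single-pass restriction is harmless here.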
gensim/test/test_corpora_dictionary.py: 3 changes (2 additions, 1 deletion)
@@ -9,6 +9,7 @@


 from collections import Mapping
+from itertools import chain
 import logging
 import unittest
 import codecs
@@ -258,7 +259,7 @@ def test_from_corpus(self):
                  for document in documents]
 
         # remove words that appear only once
-        all_tokens = sum(texts, [])
+        all_tokens = list(chain.from_iterable(texts))
         tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
         texts = [[word for word in text if word not in tokens_once] for text in texts]

gensim/test/test_word2vec.py: 16 changes (11 additions, 5 deletions)
@@ -593,12 +593,18 @@ def testLocking(self):
         self.assertFalse((unlocked1 == model.wv.vectors[1]).all()) # unlocked vector should vary
         self.assertTrue((locked0 == model.wv.vectors[0]).all()) # locked vector should not vary
 
-    def testAccuracy(self):
-        """Test Word2Vec accuracy and KeyedVectors accuracy give the same result"""
+    def testEvaluateWordAnalogies(self):
+        """Test that evaluating analogies on KeyedVectors give sane results"""
         model = word2vec.Word2Vec(LeeCorpus())
-        w2v_accuracy = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))
-        kv_accuracy = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))
-        self.assertEqual(w2v_accuracy, kv_accuracy)
+        score, sections = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))
+        self.assertGreaterEqual(score, 0.0)
+        self.assertLessEqual(score, 1.0)
+        self.assertGreater(len(sections), 0)
+        # Check that dict contains the right keys
+        first_section = sections[0]
+        self.assertIn('section', first_section)
+        self.assertIn('correct', first_section)
+        self.assertIn('incorrect', first_section)
 
     def testEvaluateWordPairs(self):
         """Test Spearman and Pearson correlation coefficients give sane results on similarity datasets"""
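
The rewritten test above checks the shape of the value returned by evaluate_word_analogies instead of comparing two identical calls. A hedged usage sketch of that return shape; the vector file and analogy file names below are placeholders, not the ones the test suite uses:

# Placeholder paths; illustrates only the (score, sections) return shape.
from gensim.models import KeyedVectors

kv = KeyedVectors.load('my_vectors.kv')                    # hypothetical saved vectors
score, sections = kv.evaluate_word_analogies('questions-words.txt')

print('overall accuracy: %.3f' % score)                    # float in [0.0, 1.0]
for section in sections:                                   # one dict per analogy section
    print(section['section'],
          len(section['correct']),                         # analogies answered correctly
          len(section['incorrect']))                       # analogies answered incorrectly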
