Skip to content

Commit

Permalink
Fix OOV pairs counter in `WordEmbeddingsKeyedVectors.evaluate_word_pairs` (#1934)
Browse files Browse the repository at this point in the history

The `evaluate_word_pairs` function returned an incorrect number of out-of-vocabulary pairs when the `dummy4unknown` parameter was set to True.
  • Loading branch information
akutuzov authored and menshikh-iv committed Feb 26, 2018
1 parent 9cff044 commit b000b4f
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions gensim/models/keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -996,23 +996,27 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000,
a, b, sim = [word for word in line.split(delimiter)]
sim = float(sim)
except (ValueError, TypeError):
logger.info('skipping invalid line #%d in %s', line_no, pairs)
logger.info('Skipping invalid line #%d in %s', line_no, pairs)
continue
if a not in ok_vocab or b not in ok_vocab:
oov += 1
if dummy4unknown:
logger.debug('Zero similarity for line #%d with OOV words: %s', line_no, line.strip())
similarity_model.append(0.0)
similarity_gold.append(sim)
continue
else:
logger.debug('skipping line #%d with OOV words: %s', line_no, line.strip())
logger.debug('Skipping line #%d with OOV words: %s', line_no, line.strip())
continue
similarity_gold.append(sim) # Similarity from the dataset
similarity_model.append(self.similarity(a, b)) # Similarity from the model
self.vocab = original_vocab
spearman = stats.spearmanr(similarity_gold, similarity_model)
pearson = stats.pearsonr(similarity_gold, similarity_model)
oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100
if dummy4unknown:
oov_ratio = float(oov) / len(similarity_gold) * 100
else:
oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100

logger.debug('Pearson correlation coefficient against %s: %f with p-value %f', pairs, pearson[0], pearson[1])
logger.debug(
Expand Down

0 comments on commit b000b4f

Please sign in to comment.