-
Notifications
You must be signed in to change notification settings - Fork 3
/
compute-embeddings-coverage.py
68 lines (55 loc) · 2.48 KB
/
compute-embeddings-coverage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import argparse
import gzip
import io
import re
import sys
import time
from collections import defaultdict
# Measure how much of a text's vocabulary is covered by an embeddings file.


def _open_maybe_gzip(path):
    """Open *path* for binary reading, transparently handling .gz files."""
    if path.endswith('.gz'):
        return gzip.open(path, mode='rb')
    return open(path, mode='rb')


def _decode_lines(stream, which):
    """Yield utf8-decoded lines from *stream* (an iterable of bytes lines).

    Lines that are not valid utf8 are skipped with a warning naming *which*
    file ('embeddings' or 'text') they came from.
    """
    for raw in stream:
        try:
            yield raw.decode('utf8')
        except UnicodeDecodeError:
            # best-effort: warn and keep going rather than abort the run
            print('WARNING: utf8 decoding error for the following line in the',
                  which, 'file:', raw, '\nWill skip this one.')


def compute_coverage(embeddings, text):
    """Compute token/type coverage of *text* by the *embeddings* vocabulary.

    Args:
        embeddings: iterable of bytes lines in word2vec text format; the
            first space-separated field of each line is taken as the word.
        text: iterable of bytes lines; tokens are whitespace-separated.

    Returns:
        A 5-tuple (tokens_covered, tokens_uncovered, types_covered,
        types_uncovered, frequency_of_uncovered_types), where the last item
        maps each uncovered word type to its token frequency in the text.
    """
    # vocabulary of the embeddings: first field of each line
    embeddings_vocab = set()
    for line in _decode_lines(embeddings, 'embeddings'):
        embeddings_vocab.add(line.strip().split(' ')[0])

    tokens_covered, tokens_uncovered = 0, 0
    types_covered, types_uncovered = 0, 0
    text_vocab = set()
    frequency_of_uncovered_types = defaultdict(int)
    for line in _decode_lines(text, 'text'):
        for token in line.strip().split():
            if token in embeddings_vocab:
                tokens_covered += 1
                # is it a new word type?
                if token not in text_vocab:
                    types_covered += 1
            else:
                tokens_uncovered += 1
                if token not in text_vocab:
                    types_uncovered += 1
                # count frequency of uncovered word types
                frequency_of_uncovered_types[token] += 1
            # this is no longer a new word type
            text_vocab.add(token)
    return (tokens_covered, tokens_uncovered, types_covered, types_uncovered,
            dict(frequency_of_uncovered_types))


def main():
    """Parse CLI arguments, stream both files, and print coverage stats."""
    argparser = argparse.ArgumentParser()
    argparser.add_argument("-e", "--embeddings", help=
                           " An embeddings file (word2vec format).")
    argparser.add_argument("-t", "--text", help=
                           " The text in this file will be tokenized on white spaces and used to measure coverage.")
    args = argparser.parse_args()

    # stream both files; .gz inputs are decompressed on the fly
    with _open_maybe_gzip(args.embeddings) as embeddings, \
         _open_maybe_gzip(args.text) as text:
        (tokens_covered, tokens_uncovered, types_covered, types_uncovered,
         frequency_of_uncovered_types) = compute_coverage(embeddings, text)

    total_tokens = tokens_covered + tokens_uncovered
    total_types = types_covered + types_uncovered
    # guard against empty input to avoid ZeroDivisionError
    token_ratio = 1.0 * tokens_covered / total_tokens if total_tokens else 0.0
    type_ratio = 1.0 * types_covered / total_types if total_types else 0.0
    print('token coverage =', tokens_covered, '/', total_tokens, '=', token_ratio)
    print('type coverage =', types_covered, '/', total_types, '=', type_ratio)
    print('most frequent uncovered word types =',
          sorted(frequency_of_uncovered_types.items(),
                 key=lambda kv: kv[1], reverse=True)[:30])


if __name__ == "__main__":
    main()