create_rouge_references.py
import config
import data
import os

# Root of the parsed Gigaword data; vocab files and summary directories live here.
path = '/home/sfua14/gigaword_parsed/'


def make_dirs(name):
    """Create <name>_summaries/{reference,system} and return the reference dir."""
    data.make_dir(os.path.join(path, name + '_summaries'))
    summary_path = os.path.join(path, name + '_summaries/reference')
    data.make_dir(summary_path)
    data.make_dir(os.path.join(path, name + '_summaries/system'))
    return summary_path
def write_data(dataset, name):
    """Write one reference headline per file, batched the same way the decoder will be."""
    write_path = make_dirs(name)
    count = 0
    iteration = 0
    bucket_index = 0
    while True:
        bucket = dataset[bucket_index]
        next_bucket = False
        start_i = iteration * config.BATCH_SIZE
        end_i = (iteration + 1) * config.BATCH_SIZE
        if end_i >= len(bucket['enc_input']):
            # Last (partial) batch of this bucket: take the final BATCH_SIZE
            # headlines so the slice matches what the decoder sees.
            next_bucket = True
            start_i = -config.BATCH_SIZE
            end_i = None
            dec_input = bucket['dec_input'][start_i:]
        else:
            dec_input = bucket['dec_input'][start_i:end_i]
        for headline in dec_input:
            with open(os.path.join(write_path, str(count) + '.txt'), 'w') as f:
                f.write(headline)
            if count % 10000 == 0:
                print('writing # %d' % count)
            count += 1
        if next_bucket:
            bucket_index += 1
            iteration = 0
        else:
            iteration += 1
        if bucket_index >= len(config.BUCKETS):
            break
print('loading data')
enc_vocab = data._read_and_split_file(os.path.join(path, 'enc_vocab.txt'))
dec_vocab = data._read_and_split_file(os.path.join(path, 'dec_vocab.txt'))
enc_dict = {enc_vocab[i]: i for i in range(len(enc_vocab))}
dec_dict = {dec_vocab[i]: i for i in range(len(dec_vocab))}

# Load the dev and test sets as text (vec=False) rather than as id vectors.
dev = data.load_one_set(path, 'dev', config.BUCKETS,
                        enc_dict, dec_dict, vec=False)
test = data.load_one_set(path, 'test', config.BUCKETS,
                         enc_dict, dec_dict, vec=False)

write_data(dev, 'dev')
write_data(test, 'test')
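
The reference/ and system/ directories created above follow the usual file-per-summary layout for ROUGE evaluation: one numbered .txt reference per headline, with system/ to be filled by the decoder's outputs elsewhere in the pipeline. As a minimal sketch of how the pairs could then be scored, assuming matching filenames in both directories and using the rouge-score package (an assumption, not part of this repository, which uses its own ROUGE tooling):

import os

from rouge_score import rouge_scorer


def average_rouge(summary_dir):
    # Score each system summary against the reference with the same filename
    # and report the mean F1 for ROUGE-1, ROUGE-2 and ROUGE-L.
    ref_dir = os.path.join(summary_dir, 'reference')
    sys_dir = os.path.join(summary_dir, 'system')
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'],
                                      use_stemmer=True)
    totals = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}
    names = sorted(os.listdir(ref_dir))
    for name in names:
        with open(os.path.join(ref_dir, name)) as f:
            reference = f.read()
        with open(os.path.join(sys_dir, name)) as f:
            system = f.read()
        scores = scorer.score(reference, system)
        for key in totals:
            totals[key] += scores[key].fmeasure
    return {key: total / len(names) for key, total in totals.items()}


print(average_rouge('/home/sfua14/gigaword_parsed/dev_summaries'))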