This repository has been archived by the owner on May 15, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
/
prepare_wow_data.py
51 lines (40 loc) · 2.31 KB
/
prepare_wow_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import sys
import json
import os
from tqdm import tqdm
from nltk.tokenize import WordPunctTokenizer
from nltk import word_tokenize
# Input split names as produced by the collection step, and the output
# names each one is saved under (the two lists are zipped pairwise).
file_names = ["test", "test_unseen", "valid", "valid_unseen", "train", ]
save_names = ["test_random_split", "test_topic_split", "valid_random_split", "valid_topic_split", "train", ]

# One shared tokenizer instance: the original lambda built a fresh
# WordPunctTokenizer for every sentence it processed.
_tokenizer = WordPunctTokenizer()


def f(sen):
    """Strip, WordPunct-tokenize, space-join, and lowercase a sentence."""
    return ' '.join(_tokenizer.tokenize(sen.strip())).lower()
# Alternative tokenization kept for reference:
#f = lambda sen: ' '.join(word_tokenize(sen)).lower()
# Convert each collected split into the prepared format:
# tokenize/lowercase posts and responses, strip the ParlAI knowledge
# sentinel, and dump one prepared JSON file per split.
for key, name in zip(file_names, save_names):
    print(key)
    total_data = []
    # Close the input handle deterministically (the original passed a bare
    # open() into json.load and leaked it).
    with open("./Wizard-of-Wikipedia/%s_collected.json" % key, 'r') as fin:
        d = json.load(fin)
    for data in tqdm(d, total=len(d)):
        new_data = {}
        new_data['chosen_topics'] = data['topics']
        new_data['posts'] = list(map(f, data['post']))
        new_data['responses'] = list(map(f, data['response']))
        # Every knowledge list must start with the ParlAI "no passages used"
        # sentinel; fail loudly if the collected data deviates.
        assert all(e[0] == 'no_passages_used __knowledge__ no_passages_used' for e in data['knowledge'])
        # Replace the sentinel with 'no_knowledge_used'; for real passages,
        # keep only the text after the '__knowledge__' marker, tokenized.
        new_data['knowledge'] = [['no_knowledge_used'] + list(map(lambda x: f(x.split('__knowledge__')[1]), e[1:])) for e in data['knowledge']]
        new_data['labels'] = data['labels']
        total_data.append(new_data)
    # Write via a context manager so the file is flushed and closed even if
    # a later iteration raises (the original relied on GC to close it).
    with open('./Wizard-of-Wikipedia/prepared_data/%s.json' % name, 'w') as fout:
        json.dump(total_data, fout, indent=4, ensure_ascii=False, sort_keys=True)
# Combine the two validation splits into a single dev set.
# Read handles are closed via `with` (the originals were leaked), and the
# output handle no longer shadows the tokenizer function `f`.
with open('./Wizard-of-Wikipedia/prepared_data/valid_random_split.json') as f_in:
    dev_data = json.load(f_in)
with open('./Wizard-of-Wikipedia/prepared_data/valid_topic_split.json') as f_in:
    dev_data += json.load(f_in)
with open('./Wizard-of-Wikipedia/prepared_data/dev.json', 'w') as f_out:
    json.dump(dev_data, f_out, ensure_ascii=False, indent=4)
# The intermediate split files are deliberately kept on disk:
#os.remove('valid_random_split.json')
#os.remove('valid_topic_split.json')
# Re-save the random-split test file under the conventional name
# 'test_seen.json' (JSON round-trip; content is unchanged).  Handles are
# managed with `with`, and `f` (the tokenizer function) is not shadowed.
with open('./Wizard-of-Wikipedia/prepared_data/test_random_split.json') as f_in:
    seen_data = json.load(f_in)
with open('./Wizard-of-Wikipedia/prepared_data/test_seen.json', 'w') as f_out:
    json.dump(seen_data, f_out, ensure_ascii=False, indent=4)
# The intermediate file is deliberately kept on disk:
#os.remove('test_random_split.json')
# Re-save the topic-split test file under the conventional name
# 'test_unseen.json' (JSON round-trip; content is unchanged).  Handles are
# managed with `with`, and `f` (the tokenizer function) is not shadowed.
with open('./Wizard-of-Wikipedia/prepared_data/test_topic_split.json') as f_in:
    unseen_data = json.load(f_in)
with open('./Wizard-of-Wikipedia/prepared_data/test_unseen.json', 'w') as f_out:
    json.dump(unseen_data, f_out, ensure_ascii=False, indent=4)
# The intermediate file is deliberately kept on disk:
#os.remove('test_topic_split.json')