-
Notifications
You must be signed in to change notification settings - Fork 2
/
subset_generator.py
92 lines (73 loc) · 2.89 KB
/
subset_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
###############################################
### Parameter Setting
ref_read_path = 'data/conll2003'          # directory holding train/valid/test CoNLL-2003 splits
ref_write_path = 'gen_data/copy/subset'   # directory the sampled subset files are written to
###############################################
from data_handling_for_heuristic import *
import os, sys
import random

# The single CLI argument is the fraction of sentences to keep, e.g. "0.1".
arg_str = ' '.join(sys.argv[1:])
percent_subset = arg_str


def _write_conll(path_write, pairs):
    """Write (sentence, label-sequence) pairs in 4-column CoNLL-2003 format.

    Each token line is "<token> NNP B-NP <label>"; the POS ("NNP") and
    chunk ("B-NP") columns are dummy placeholders, kept only so the file
    matches the CoNLL-2003 column layout. Sentences are separated by a
    blank line.
    """
    with open(path_write, 'w', encoding='UTF-8') as txt:
        for sent, labels in pairs:
            splited_sent = sent.split()
            splited_label = labels.split()
            for j, token in enumerate(splited_sent):
                # Index (not zip) so a token/label length mismatch fails
                # loudly instead of silently dropping tokens.
                txt.write(token + ' ' + 'NNP' + ' ' + 'B-NP' + ' ' + splited_label[j])
                txt.write('\n')
            txt.write('\n')


def _make_subset(split_name, fraction):
    """Randomly sample `fraction` of one split and write it out.

    Reads `<ref_read_path>/<split_name>.txt`, shuffles the sentences,
    keeps the first `int(n * fraction)` of them, and writes
    `<ref_write_path>/<split_name>_<fraction>_<count>.txt`.
    """
    read_path = ref_read_path + '/' + split_name + '.txt'
    raw_data, label_data = load_conll2003(read_path)
    temp_pair = list(zip(raw_data, label_data))
    random.shuffle(temp_pair)
    output = temp_pair[:int(len(temp_pair) * fraction)]
    print('before:', len(temp_pair), '--->', ' after:', len(output))
    path_write = (ref_write_path + '/' + split_name + '_'
                  + str(percent_subset) + '_' + str(len(output)) + '.txt')
    _write_conll(path_write, output)


if not percent_subset:
    # Fail with a clear message instead of an opaque float() ValueError.
    sys.exit('usage: subset_generator.py <fraction in (0, 1]>')

# The original script crashed on write if the output directory was missing.
os.makedirs(ref_write_path, exist_ok=True)

for _split in ('train', 'valid', 'test'):
    _make_subset(_split, float(percent_subset))