datasets.py (forked from Sachin19/adversarial-classify)

import torch
from torchtext import data

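# NOTE (editor's assumption): this module is written against the legacy
# torchtext API (torchtext < 0.9, or torchtext.legacy in 0.9-0.11). The
# `tensor_type` keyword on data.Field below is the old torchtext 0.2.x name
# for what later releases call `dtype`, and torch.cuda.FloatTensor pins the
# topic vectors to GPU memory regardless of the `device` argument.
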
def make_reddit(batch_size, device=-1, vectors=None, base_path="", suffix="", extrasuffix="", domain="", oodname="", topics=False):
    TEXT = data.Field(include_lengths=True, lower=True)
    LABEL = data.LabelField()
    TOPICS = data.Field(sequential=True, use_vocab=False, preprocessing=data.Pipeline(lambda x: float(x)), tensor_type=torch.cuda.FloatTensor, batch_first=True)
    train = data.TabularDataset(path=base_path+"/train"+suffix+extrasuffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None), ('topics', TOPICS)])
    val = data.TabularDataset(path=base_path+"/valid"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None)])
    test = data.TabularDataset(path=base_path+"/test"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None)])
    outdomain_test = data.TabularDataset(path=base_path+"/oodtest"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None)])
    TEXT.build_vocab(train, vectors=vectors, max_size=30000)
    LABEL.build_vocab(train)
    train_iter, val_iter, test_iter, outdomain_test_iter = data.BucketIterator.splits((train, val, test, outdomain_test), batch_sizes=(batch_size, batch_size, batch_size, batch_size), device=device, repeat=False, sort_key=lambda x: len(x.text))
    return (train_iter, val_iter, test_iter, outdomain_test_iter), TEXT, LABEL, TOPICS

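# A sketch of the expected train TSV layout, inferred from the field lists
# above (an assumption, not documented in the original repo): one example per
# line, tab-separated, with the topics column holding whitespace-separated
# floats that the TOPICS pipeline casts to float:
#
#   some lowercased post text<TAB>label<TAB>username<TAB>0.1 0.7 0.2
#
# The valid/test/oodtest files carry only the first three columns.
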
def make_reddit_with_indices(batch_size, device=-1, vectors=None, base_path="", suffix="", extrasuffix="", domain="", oodname="", topics=False):
    TEXT = data.Field(include_lengths=True, lower=True)
    LABEL = data.LabelField()
    TOPICS = data.Field(sequential=True, use_vocab=False, preprocessing=data.Pipeline(lambda x: float(x)), tensor_type=torch.cuda.FloatTensor, batch_first=True)
    INDEX = data.Field(sequential=False, use_vocab=False, batch_first=True)
    train = data.TabularDataset(path=base_path+"/train.tok.clean.index"+extrasuffix+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None), ('index', INDEX), ('topics', TOPICS)])
    val = data.TabularDataset(path=base_path+"/valid.tok.clean.index"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None), ('index', INDEX)])
    test = data.TabularDataset(path=base_path+"/test.tok.clean.index.loremoved200"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None), ('index', INDEX)])
    outdomain_test = data.TabularDataset(path=base_path+"/oodtest.tok.clean.index.loremoved200"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None), ('index', INDEX)])
    TEXT.build_vocab(train, vectors=vectors, max_size=30000)
    LABEL.build_vocab(train)
    train_iter, val_iter, test_iter, outdomain_test_iter = data.BucketIterator.splits((train, val, test, outdomain_test), batch_sizes=(batch_size, batch_size, batch_size, batch_size), device=device, repeat=False, sort_key=lambda x: len(x.text))
    return (train_iter, val_iter, test_iter, outdomain_test_iter), TEXT, LABEL, TOPICS, INDEX

def make_reddit_baseline(batch_size, device=-1, vectors=None, base_path="", suffix="", extrasuffix="", domain="", oodname="", topics=False):
    TEXT = data.Field(include_lengths=True, lower=True)
    LABEL = data.LabelField()
    TOPICS = data.Field(sequential=True, use_vocab=False, preprocessing=data.Pipeline(lambda x: float(x)), tensor_type=torch.cuda.FloatTensor, batch_first=True)
    train = data.TabularDataset(path=base_path+"/train"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None)])
    val = data.TabularDataset(path=base_path+"/valid"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None)])
    test = data.TabularDataset(path=base_path+"/test"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None)])
    outdomain_test = data.TabularDataset(path=base_path+"/oodtest"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None)])
    TEXT.build_vocab(train, vectors=vectors, max_size=30000)
    LABEL.build_vocab(train)
    train_iter, val_iter, test_iter, outdomain_test_iter = data.BucketIterator.splits((train, val, test, outdomain_test), batch_sizes=(batch_size, batch_size, batch_size, batch_size), device=device, repeat=False, sort_key=lambda x: len(x.text))
    return (train_iter, val_iter, test_iter, outdomain_test_iter), TEXT, LABEL, TOPICS

def make_reddit_baseline_with_indices(batch_size, device=-1, vectors=None, base_path="", suffix="", extrasuffix="", domain="", oodname="", topics=False):
    TEXT = data.Field(include_lengths=True, lower=True)
    LABEL = data.LabelField()
    TOPICS = data.Field(sequential=True, use_vocab=False, preprocessing=data.Pipeline(lambda x: float(x)), tensor_type=torch.cuda.FloatTensor, batch_first=True)
    INDEX = data.Field(sequential=False, use_vocab=False, batch_first=True)
    train = data.TabularDataset(path=base_path+"/train"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None), ('index', INDEX)])
    val = data.TabularDataset(path=base_path+"/valid"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None), ('index', INDEX)])
    test = data.TabularDataset(path=base_path+"/test"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None), ('index', INDEX)])
    outdomain_test = data.TabularDataset(path=base_path+"/oodtest"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None), ('index', INDEX)])
    TEXT.build_vocab(train, vectors=vectors, max_size=30000)
    LABEL.build_vocab(train)
    train_iter, val_iter, test_iter, outdomain_test_iter = data.BucketIterator.splits((train, val, test, outdomain_test), batch_sizes=(batch_size, batch_size, batch_size, batch_size), device=device, repeat=False, sort_key=lambda x: len(x.text))
    return (train_iter, val_iter, test_iter, outdomain_test_iter), TEXT, LABEL, TOPICS, INDEX

def make_reddit_ensemble(batch_size, device=-1, vectors=None, base_path="", suffix="", extrasuffix="", domain="", oodname="", topics=False):
    TEXT = data.Field(include_lengths=True, lower=True)
    LABEL = data.LabelField()
    TOPICS = data.Field(sequential=True, use_vocab=False, preprocessing=data.Pipeline(lambda x: float(x)), tensor_type=torch.cuda.FloatTensor, batch_first=True)
    train = data.TabularDataset(path=base_path+"/train"+suffix+extrasuffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None), ('topics', TOPICS)])
    val = data.TabularDataset(path=base_path+"/valid"+suffix+extrasuffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None), ('topics', TOPICS)])
    test = data.TabularDataset(path=base_path+"/test"+suffix+extrasuffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None), ('topics', TOPICS)])
    outdomain_test = data.TabularDataset(path=base_path+"/oodtest"+suffix+extrasuffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None), ('topics', TOPICS)])
    TEXT.build_vocab(train, vectors=vectors, max_size=30000)
    LABEL.build_vocab(train)
    train_iter, val_iter, test_iter, outdomain_test_iter = data.BucketIterator.splits((train, val, test, outdomain_test), batch_sizes=(batch_size, batch_size, batch_size, batch_size), device=device, repeat=False, sort_key=lambda x: len(x.text))
    return (train_iter, val_iter, test_iter, outdomain_test_iter), TEXT, LABEL, TOPICS

def make_reddit2(batch_size, device=-1, vectors=None, base_path="", suffix="", extrasuffix="", domain="", oodname="", topics=False):
    TEXT = data.Field(include_lengths=True, lower=True)
    LABEL = data.LabelField()
    TOPICS = data.Field(sequential=True, use_vocab=False, preprocessing=data.Pipeline(lambda x: float(x)), tensor_type=torch.cuda.FloatTensor, batch_first=True)
    train = data.TabularDataset(path=base_path+"/train"+suffix+extrasuffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None), ('topics', TOPICS)])
    # Note: relative to make_reddit, the valid and oodtest file assignments are
    # swapped here (the oodtest file feeds val and vice versa), which appears
    # intentional: the out-of-domain split serves as validation data.
    outdomain_test = data.TabularDataset(path=base_path+"/valid"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None)])
    test = data.TabularDataset(path=base_path+"/test"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None)])
    val = data.TabularDataset(path=base_path+"/oodtest"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('username', None)])
    TEXT.build_vocab(train, vectors=vectors, max_size=30000)
    LABEL.build_vocab(train)
    train_iter, val_iter, test_iter, outdomain_test_iter = data.BucketIterator.splits((train, val, test, outdomain_test), batch_sizes=(batch_size, 256, 256, 256), device=device, repeat=False, sort_key=lambda x: len(x.text))
    return (train_iter, val_iter, test_iter, outdomain_test_iter), TEXT, LABEL, TOPICS

def make_ted(batch_size, device=-1, vectors=None, base_path="", suffix="", extrasuffix="", domain="", oodname="", topics=False):
    TEXT = data.Field(include_lengths=True, lower=True)
    LABEL = data.LabelField()
    TOPICS = data.Field(sequential=True, use_vocab=False, preprocessing=data.Pipeline(lambda x: float(x)), tensor_type=torch.cuda.FloatTensor, batch_first=True)
    train = data.TabularDataset(path=base_path+"/train"+suffix+extrasuffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('topics', TOPICS)])
    val = data.TabularDataset(path=base_path+"/valid"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL)])
    test = data.TabularDataset(path=base_path+"/test"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL)])
    TEXT.build_vocab(train, vectors=vectors, max_size=30000)
    LABEL.build_vocab(train)
    print(LABEL.vocab.stoi)
    train_iter, val_iter, test_iter = data.BucketIterator.splits((train, val, test), batch_sizes=(batch_size, 256, 256), device=device, repeat=False, sort_key=lambda x: len(x.text))
    return (train_iter, val_iter, test_iter), TEXT, LABEL, TOPICS

def make_reddit_gender(batch_size, device=-1, vectors=None, base_path="", suffix="", extrasuffix="", domain="", oodname="", topics=False):
    TEXT = data.Field(include_lengths=True, lower=True)
    LABEL = data.LabelField()
    TOPICS = data.Field(sequential=True, use_vocab=False, preprocessing=data.Pipeline(lambda x: float(x)), tensor_type=torch.cuda.FloatTensor, batch_first=True)
    train = data.TabularDataset(path=base_path+"/train"+suffix+extrasuffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL)])
    val = data.TabularDataset(path=base_path+"/valid"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL)])
    test = data.TabularDataset(path=base_path+"/test"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL)])
    TEXT.build_vocab(train, vectors=vectors, max_size=30000)
    LABEL.build_vocab(train)
    print(LABEL.vocab.stoi)
    train_iter, val_iter, test_iter = data.BucketIterator.splits((train, val, test), batch_sizes=(batch_size, 256, 256), device=device, repeat=False, sort_key=lambda x: len(x.text))
    return (train_iter, val_iter, test_iter), TEXT, LABEL, TOPICS

def make_amazon(batch_size, device=-1, vectors=None, base_path="", suffix="", extrasuffix="", domain="", oodname="", topics=False):
    TEXT = data.Field(include_lengths=True, lower=True)
    LABEL = data.LabelField()
    TOPICS = data.Field(sequential=True, use_vocab=False, preprocessing=data.Pipeline(lambda x: float(x)), tensor_type=torch.cuda.FloatTensor, batch_first=True)
    if not topics:
        train = data.TabularDataset(path=base_path+"/"+domain+".train.lower.tok"+suffix+extrasuffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL)])
    else:
        train = data.TabularDataset(path=base_path+"/"+domain+".train.lower.tok"+suffix+extrasuffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('topics', TOPICS)])
    val = data.TabularDataset(path=base_path+"/"+domain+".valid.lower.tok"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL)])
    test = data.TabularDataset(path=base_path+"/"+domain+".test.lower.tok"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL)])
    oodnames = oodname.split(",")
    outdomain_test = []
    for name in oodnames:  # avoid shadowing the oodname argument
        outdomain_test.append(data.TabularDataset(path=base_path+"/"+name+".test.lower.tok"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL)]))
    TEXT.build_vocab(train, vectors=vectors, max_size=30000)
    LABEL.build_vocab(train)
    # all_iters: train_iter, val_iter, test_iter, then one iterator per OOD test set
    all_iters = data.BucketIterator.splits(tuple([train, val, test] + outdomain_test), batch_sizes=tuple([batch_size] * (3 + len(outdomain_test))), device=device, repeat=False, sort_key=lambda x: len(x.text))
    return all_iters, TEXT, LABEL, TOPICS

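# Hedged usage sketch (domain names and paths are illustrative assumptions):
# oodname may be a comma-separated list, so make_amazon returns one test
# iterator per out-of-domain name after the train/val/test iterators.
#
#   iters, TEXT, LABEL, TOPICS = make_amazon(
#       32, device=0, base_path="data/amazon", domain="books",
#       oodname="dvd,electronics")
#   train_iter, val_iter, test_iter = iters[:3]
#   ood_iters = iters[3:]  # one BucketIterator per OOD domain
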
def make_toefl(batch_size, device=-1, vectors=None, base_path="", suffix="", testsuffix=""):
    TEXT = data.Field(include_lengths=True, lower=True)
    LABEL = data.LabelField()
    PROMPT = data.Field(sequential=False, batch_first=True)  # categorical prompt id with its own vocab
    train = data.TabularDataset(path=base_path+"/train"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('takerid', None), ('prompt', PROMPT)])
    val = data.TabularDataset(path=base_path+"/valid"+suffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('takerid', None), ('prompt', PROMPT)])
    # Note: the test split reads the valid file with testsuffix, not a separate test file.
    test = data.TabularDataset(path=base_path+"/valid"+testsuffix+".txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('takerid', None), ('prompt', PROMPT)])
    TEXT.build_vocab(train, vectors=vectors, max_size=60000)
    LABEL.build_vocab(train)
    PROMPT.build_vocab(train)
    train_iter, val_iter, test_iter = data.BucketIterator.splits((train, val, test), batch_sizes=(batch_size, batch_size, batch_size), device=device, repeat=False, sort_key=lambda x: len(x.text))
    return (train_iter, val_iter, test_iter), TEXT, LABEL, PROMPT

def make_demog(batch_size, device=-1, vectors=None, base_path=""):
    TEXT = data.Field(include_lengths=True, lower=True)
    LABEL = data.LabelField()
    PROMPT = data.Field(sequential=False, batch_first=True)
    train = data.TabularDataset(path=base_path+"/train.txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('prompt', PROMPT)])
    val = data.TabularDataset(path=base_path+"/valid.txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('prompt', PROMPT)])
    test = data.TabularDataset(path=base_path+"/test.txt", format="tsv", fields=[('text', TEXT), ('label', LABEL), ('prompt', PROMPT)])
    TEXT.build_vocab(train, vectors=vectors, max_size=60000)
    LABEL.build_vocab(train)
    PROMPT.build_vocab(train)
    train_iter, val_iter, test_iter = data.BucketIterator.splits((train, val, test), batch_sizes=(batch_size, batch_size, batch_size), device=device, repeat=False, sort_key=lambda x: len(x.text))
    return (train_iter, val_iter, test_iter), TEXT, LABEL, PROMPT

dataset_map = {
    "REDDIT": make_reddit,
    "REDDITI": make_reddit_with_indices,
    "REDDIT_ENSEMBLE": make_reddit_ensemble,
    "REDDIT_BASELINE": make_reddit_baseline,
    "REDDIT_BASELINEI": make_reddit_baseline_with_indices,
    "REDDIT_GENDER": make_reddit_gender,
    "REDDIT2": make_reddit2,
    "TED": make_ted,
    "AMAZON": make_amazon,
    "TOEFL": make_toefl,
    "DEMOG": make_demog,
}

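# Hedged sketch of how a caller would dispatch on the map above (the argument
# values are illustrative assumptions, not paths from the original repo):
#
#   make_fn = dataset_map["REDDIT_BASELINE"]
#   iters, TEXT, LABEL, TOPICS = make_fn(
#       64, device=0, vectors="glove.6B.100d", base_path="data/reddit")
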
if __name__ == '__main__':
    # make_reddit returns four iterators plus the TEXT, LABEL, and TOPICS fields.
    (tr, va, te, ood), T, L, TP = make_reddit(64)
    print("[REDDIT] vocab: {} labels: {}".format(len(T.vocab), len(L.vocab)))
    print("[REDDIT] train: {} test: {}".format(len(tr.dataset), len(te.dataset)))