-
Notifications
You must be signed in to change notification settings - Fork 0
/
cu_dataset.py
90 lines (79 loc) · 3.92 KB
/
cu_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
import numpy as np
import torch.utils.data as data_utils
import torch
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
import logging
from scipy.sparse import csr_matrix
# Module-level logging setup: timestamped INFO-level messages for this module.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
logger = logging.getLogger(__name__)
def get_data(args, sample_level_da=None):
    """Build train/validation/test DataLoaders for the multi-label dataset.

    Args:
        args: namespace providing data_dir, train_texts, train_labels,
            test_texts, emb_init, valid_size, seed, labels_binarizer,
            batch_size and (optionally) sample_level_da attributes.
        sample_level_da: optional override for args.sample_level_da; when
            None, the flag on args decides whether slda() augmentation runs.

    Returns:
        (train_loader, val_loader, test_loader, emb_init, mlb, args) where
        args comes back with args.label_size set to the number of classes.
    """
    train_texts = np.load(os.path.join(args.data_dir, args.train_texts), allow_pickle=True)
    train_labels = np.load(os.path.join(args.data_dir, args.train_labels), allow_pickle=True)
    test_texts = np.load(os.path.join(args.data_dir, args.test_texts), allow_pickle=True)
    emb_init = get_word_emb(os.path.join(args.data_dir, args.emb_init))
    logger.info('Length of train_texts: %d, train_labels: %d',
                len(train_texts), len(train_labels))
    X_train, X_valid, train_y, valid_y = train_test_split(train_texts, train_labels,
                                                          test_size=args.valid_size,
                                                          random_state=args.seed)
    # Fit (or load a cached) binarizer on the combined train+valid labels so
    # both splits share one label space.
    mlb = get_mlb(os.path.join(args.data_dir, args.labels_binarizer), np.hstack((train_y, valid_y)))
    y_train, y_valid = mlb.transform(train_y), mlb.transform(valid_y)
    args.label_size = len(mlb.classes_)
    # Explicit argument wins; otherwise fall back to the flag carried on args.
    use_slda = sample_level_da if sample_level_da is not None else getattr(args, 'sample_level_da', False)
    if use_slda:
        X_train, y_train = slda(X_train, y_train)
    logger.info('Size of Training Set: %d', len(X_train))
    logger.info('Size of Validation Set: %d', len(X_valid))
    # .A densifies the sparse label matrices before tensor conversion.
    train_data = data_utils.TensorDataset(torch.from_numpy(X_train).type(torch.LongTensor),
                                          torch.from_numpy(y_train.A).type(torch.LongTensor))
    val_data = data_utils.TensorDataset(torch.from_numpy(X_valid).type(torch.LongTensor),
                                        torch.from_numpy(y_valid.A).type(torch.LongTensor))
    test_data = data_utils.TensorDataset(torch.from_numpy(test_texts).type(torch.LongTensor))
    train_loader = data_utils.DataLoader(train_data, args.batch_size, shuffle=True, drop_last=True, num_workers=4)
    val_loader = data_utils.DataLoader(val_data, args.batch_size, shuffle=True, drop_last=True, num_workers=4)
    test_loader = data_utils.DataLoader(test_data, args.batch_size, drop_last=False)
    return train_loader, val_loader, test_loader, emb_init, mlb, args
def get_word_emb(vec_path, vocab_path=None):
    """Load pretrained word embeddings from *vec_path*.

    When *vocab_path* is given, also build a word->index mapping from the
    file (one word per line) and return ``(embeddings, vocab)``; otherwise
    return just the embeddings array.

    NOTE(review): vocab keys keep their trailing newline because file lines
    are used verbatim — likely should be ``.strip()``-ed; confirm callers.
    """
    embeddings = np.load(vec_path, allow_pickle=True)
    if vocab_path is None:
        return embeddings
    with open(vocab_path) as fp:
        vocab = {line: index for index, line in enumerate(fp)}
    return embeddings, vocab
def get_mlb(mlb_path, labels=None) -> MultiLabelBinarizer:
    """Return a sparse-output MultiLabelBinarizer, cached at *mlb_path*.

    If a pickled binarizer already exists at *mlb_path* it is loaded and
    returned; otherwise a new one is fitted on *labels*, persisted to
    *mlb_path* via joblib, and returned.
    """
    if not os.path.exists(mlb_path):
        binarizer = MultiLabelBinarizer(sparse_output=True)
        binarizer.fit(labels)
        joblib.dump(binarizer, mlb_path)
        return binarizer
    return joblib.load(mlb_path)
def slda(X_train, y_train, n_copies=10):
    """Sample-level data augmentation: repeat the whole training set.

    Args:
        X_train: 2-D array of inputs, shape (n_samples, seq_len).
        y_train: scipy sparse label matrix, shape (n_samples, n_labels).
        n_copies: how many times the dataset is stacked (default 10 keeps
            the original hard-coded behavior).

    Returns:
        (x, y): x is a dense array with n_copies * n_samples rows (copy 0's
        rows first, then copy 1's, ...); y is the matching csr_matrix.
    """
    # np.tile stacks whole copies along axis 0, matching the original
    # append-then-reshape ordering.
    x = np.tile(X_train, (n_copies, 1))
    y = csr_matrix(np.tile(y_train.A, (n_copies, 1)))
    return x, y