-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathdataset.py
227 lines (179 loc) · 6.67 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# coding: UTF-8
"""This module defines an example Torch dataset from the Oze datachallenge.
Example
-------
$ dataloader = DataLoader(OzeNPZDataset(DATASET_PATH),
batch_size=BATCH_SIZE,
shuffle=True,
num_workers=NUM_WORKERS)
"""
import json
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
class OzeEvaluationDataset(Dataset):
    """Torch dataset serving Oze datachallenge inputs for evaluation.

    Inputs are read from one csv file; the columns to pick are listed in a
    json labels file.

    Attributes
    ----------
    labels: Dict
        Content of the json labels file. The "R" and "Z" entries select the
        input columns.
    m: np.array
        Per-feature minimum of the inputs, used as normalization constant.
    M: np.array
        Per-feature maximum of the inputs, used as normalization constant.
    """

    def __init__(self, dataset_x_path, labels_path="labels.json", **kwargs):
        """Build the dataset from a csv of inputs.

        Parameters
        ---------
        dataset_x_path: str or Path
            Location of the csv file holding the inputs.
        labels_path: str or Path, optional
            Location of the json file listing the R, Z and X labels.
            Default is "labels.json".
        """
        super().__init__(**kwargs)
        self._load_x_from_csv(dataset_x_path, labels_path)

    def _load_x_from_csv(self, dataset_x_path, labels_path):
        """Read the input csv and store the normalized inputs in ``_x``."""
        frame = pd.read_csv(dataset_x_path)

        # The labels file ships with the challenge description.
        with open(labels_path, "r") as stream_json:
            self.labels = json.load(stream_json)

        n_samples = frame.shape[0]
        seq_len = 672  # Can be found through csv

        # R columns hold one value per sample: repeat them along the time axis.
        static = frame[self.labels["R"]].values
        static = np.tile(static[:, np.newaxis, :], (1, seq_len, 1))

        # Z columns are named "<label>_<step>", grouped by label.
        z_columns = []
        for var_name in self.labels["Z"]:
            for step in range(seq_len):
                z_columns.append(f"{var_name}_{step}")
        series = frame[z_columns].values.reshape((n_samples, -1, seq_len))
        series = series.transpose((0, 2, 1))

        # Time series features first, then the repeated R features.
        stacked = np.concatenate([series, static], axis=-1)

        # Min-max normalization per feature; eps guards constant features.
        self.M = np.max(stacked, axis=(0, 1))
        self.m = np.min(stacked, axis=(0, 1))
        scaled = (stacked - self.m) / (self.M - self.m + np.finfo(float).eps)

        self._x = scaled.astype(np.float32)

    def __getitem__(self, idx):
        """Return the normalized input(s) at ``idx``."""
        index = idx.tolist() if torch.is_tensor(idx) else idx
        return self._x[index]

    def __len__(self):
        """Return the number of samples."""
        return len(self._x)
class OzeDataset(OzeEvaluationDataset):
    """Torch dataset serving Oze datachallenge inputs and targets for training.

    Attributes
    ----------
    m: np.array
        Per-feature minimum of the targets. Replaces the value computed on
        the inputs by the parent class.
    M: np.array
        Per-feature maximum of the targets. Replaces the value computed on
        the inputs by the parent class.
    """

    def __init__(self, dataset_x_path, dataset_y_path, labels_path="labels.json", **kwargs):
        """Build the dataset from a csv of inputs and a csv of targets.

        Parameters
        ---------
        dataset_x_path: str or Path
            Location of the csv file holding the inputs.
        dataset_y_path: str or Path
            Location of the csv file holding the targets.
        labels_path: str or Path, optional
            Location of the json file listing the R, Z and X labels.
            Default is "labels.json".
        """
        super().__init__(dataset_x_path=dataset_x_path, labels_path=labels_path, **kwargs)
        self._load_y_from_csv(dataset_y_path)

    def _load_y_from_csv(self, dataset_y_path):
        """Read the target csv and store the normalized targets in ``_y``."""
        frame = pd.read_csv(dataset_y_path)

        n_samples = frame.shape[0]
        seq_len = 672  # Can be found through csv

        # X columns are named "<label>_<step>", grouped by label.
        x_columns = []
        for var_name in self.labels["X"]:
            for step in range(seq_len):
                x_columns.append(f"{var_name}_{step}")
        targets = frame[x_columns].values.reshape((n_samples, -1, seq_len))
        targets = targets.transpose((0, 2, 1))

        # Min-max normalization per feature; eps guards constant features.
        # NOTE: M and m now describe the targets, not the inputs.
        self.M = np.max(targets, axis=(0, 1))
        self.m = np.min(targets, axis=(0, 1))
        scaled = (targets - self.m) / (self.M - self.m + np.finfo(float).eps)

        self._y = scaled.astype(np.float32)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair at ``idx``."""
        index = idx.tolist() if torch.is_tensor(idx) else idx
        return (self._x[index], self._y[index])
class OzeNPZDataset(Dataset):
    """Torch dataset for the Oze datachallenge, loaded from a single npz file.

    The npz archive must contain the arrays "R", "Z" and "X". Inputs are the
    Z time series concatenated with R repeated along the time axis; targets
    are the X time series. Inputs and targets are min-max normalized
    separately, each with its own per-feature constants.

    Attributes
    ----------
    labels: Dict
        Content of the json labels file.
    original_y: np.array
        Targets before normalization, as float32.
    m: np.array
        Per-feature minimum of the targets (normalization constant).
    M: np.array
        Per-feature maximum of the targets (normalization constant).
    """

    def __init__(self, dataset_path, labels_path="labels.json", **kwargs):
        """Load dataset from npz.

        Parameters
        ---------
        dataset_path: str or Path
            Path to the npz archive holding the R, Z and X arrays.
        labels_path: str or Path, optional
            Path to the labels, divided in R, Z and X, in json format.
            Default is "labels.json".
        """
        super().__init__(**kwargs)
        self._load_npz(dataset_path, labels_path)

    def _load_npz(self, dataset_path, labels_path):
        """Load the npz archive and build the normalized ``_x`` and ``_y``."""
        # Close the archive once the arrays are read instead of leaking the
        # underlying file handle.
        with np.load(dataset_path) as dataset:
            R, X, Z = dataset['R'], dataset['X'], dataset['Z']

        # Load labels, can be found through csv or challenge description
        with open(labels_path, "r") as stream_json:
            self.labels = json.load(stream_json)

        K = Z.shape[-1]  # Time series length

        # Repeat R along the time axis and move time to the second axis.
        R = np.tile(R[:, np.newaxis, :], (1, K, 1))
        Z = Z.transpose((0, 2, 1))
        X = X.transpose((0, 2, 1))

        # Inputs: normalize with the inputs' own per-feature min and max.
        eps = np.finfo(float).eps
        x = np.concatenate([Z, R], axis=-1)
        x_max = np.max(x, axis=(0, 1))
        x_min = np.min(x, axis=(0, 1))
        self._x = ((x - x_min) / (x_max - x_min + eps)).astype(np.float32)

        # Targets: keep an un-normalized copy, then normalize with the
        # targets' own constants. Using the input constants here would
        # mis-scale the targets or fail to broadcast.
        self.original_y = np.array(X).astype(np.float32)
        self.M = np.max(X, axis=(0, 1))
        self.m = np.min(X, axis=(0, 1))
        self._y = ((X - self.m) / (self.M - self.m + eps)).astype(np.float32)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair at ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return (self._x[idx], self._y[idx])

    def __len__(self):
        """Return the number of samples."""
        return self._x.shape[0]