-
Notifications
You must be signed in to change notification settings - Fork 2
/
bootstrap.py
107 lines (80 loc) · 3.05 KB
/
bootstrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
Methods for bootstrapping model performance
"""
import random
import numpy as np
import pandas as pd
from ast import literal_eval
from sklearn.utils import resample
def get_seeds(seed, num_seeds):
    """Derive a reproducible list of distinct integer seeds from one master seed.

    Args:
        seed: master seed used to initialise the generator
        num_seeds: int number of seeds to draw
    Returns: list of num_seeds distinct ints drawn from range(0, 2 ** 32)
    Raises:
        ValueError: if num_seeds is negative (raised by the sampler)
    """
    # A private generator produces the same sequence as seeding the
    # module-level one, but leaves the global `random` state untouched so
    # unrelated code is not silently reseeded by calling this helper.
    rng = random.Random(seed)
    return rng.sample(range(0, 2 ** 32), num_seeds)
def get_resample_index(seed, indices):
    """Draw a seeded resample of `indices` that has the same length as the input.

    Args:
        seed: random state handed to sklearn's resample
        indices: sized collection of indices to draw from
    Returns: whatever sklearn's resample returns for `indices`
        (sampling is with replacement per the sklearn defaults)
    """
    sample_size = len(indices)
    return resample(indices, n_samples=sample_size, random_state=seed)
def get_resample(genes, y_labels, ids, seed, num_seeds, bs_iter):
    """Build the resampled features and labels for one bootstrap iteration.

    Args:
        genes: object supporting .loc lookup by the resampled ids
            (presumably a pandas DataFrame of features - confirm with caller)
        y_labels: object supporting .loc lookup by the resampled ids
        ids: index values to resample from
        seed: master seed from which the per-iteration seeds are derived
        num_seeds: int number of per-iteration seeds to derive
        bs_iter: int position of this iteration's seed in the derived list
    Returns: tuple (resampled genes, resampled y_labels, resampled ids)
    Raises:
        IndexError: if bs_iter is outside the derived seed list
    """
    # The full seed list is regenerated from the master seed on every call,
    # so iteration k always maps to the same seed.
    iteration_seed = get_seeds(seed=seed, num_seeds=num_seeds)[bs_iter]
    indices = get_resample_index(seed=iteration_seed, indices=ids)
    genes_bs = genes.loc[indices]
    labels_bs = y_labels.loc[indices]
    return genes_bs, labels_bs, indices
def get_selected_params(file, model, label):
    """Load the parameters stored for one (model, label) pair in a results file.

    Args:
        file: path or buffer of a comma-separated file with a header row, an
            index in the first column and 'params', 'label' and 'model' columns
        model: value to match in the 'model' column
        label: value to match in the 'label' column
    Returns: dict mapping each parameter name to a one-element list holding
        its value; the first matching row is used if several match
    Raises:
        IndexError: if no row matches the requested model and label
    """
    cv = pd.read_csv(file, sep=',', header=0, index_col=0)
    wanted = (cv['label'] == label) & (cv['model'] == model)
    matches = cv['params'].loc[wanted].values
    if len(matches) == 0:
        # Same exception type as the bare `[0]` lookup would raise, but with a
        # message that says what was being looked up.
        raise IndexError(
            "no row in {!r} with model={!r} and label={!r}".format(file, model, label)
        )
    # The 'params' cell holds the repr of a dict; literal_eval parses it
    # without executing arbitrary code.
    params = literal_eval(matches[0])
    # Wrap every value in a list (build a new dict instead of reassigning
    # entries while iterating over the original).
    return {name: [value] for name, value in params.items()}
def bootstap_gen_cv(cv_split, seed, y, classes):
    """For each cv split, resample from the train and validation partitions.

    Args:
        cv_split: fold indices generator yielding (train_index, val_index)
        seed: int seed
        y: int ndarray y class labels
        classes: str ndarray y class names; only checked against None here.
            When None the validation partition is resampled once; otherwise it
            is redrawn with increasing seeds until every class present in that
            fold's validation partition appears in the resample
    Returns: a bootstrap generator, for input to gridsearchcv
    """
    for train_index, val_index in cv_split:
        train = resample(train_index, n_samples=len(train_index), random_state=seed)
        if classes is None:
            val = resample(val_index, n_samples=len(val_index), random_state=seed)
        else:
            # If classes are highly imbalanced, make sure they still appear in
            # validation. Only classes that exist in this fold can be required:
            # demanding every class of `y` would never terminate for a fold
            # that is missing one.
            n_required = len(np.unique(y[val_index]))
            s = seed
            while True:
                val = resample(val_index, n_samples=len(val_index), random_state=s)
                if len(np.unique(y[val])) >= n_required:
                    break
                s += 1
        yield train, val
def bootstap_gen_cv_class(cv_split, seed, y, folds):
    """Class-wise bootstrap of every cv split, one derived seed per fold.

    Args:
        cv_split: fold indices generator yielding (train_index, val_index)
        seed: int master seed from which the per-fold seeds are derived
        y: int ndarray y class labels
        folds: int number of folds, i.e. how many per-fold seeds to derive
    Returns: a bootstrap generator, for input to gridsearchcv
    Raises:
        IndexError: if cv_split yields more than `folds` splits
    """
    fold_seeds = get_seeds(seed=seed, num_seeds=folds)
    for fold_no, (train_index, val_index) in enumerate(cv_split):
        # Train and validation partitions of a fold share that fold's seed.
        fold_seed = fold_seeds[fold_no]
        train = get_boot_idx(train_index, y, fold_seed)
        val = get_boot_idx(val_index, y, fold_seed)
        yield train, val
def get_boot_idx(index, y, seed):
    """Resample a subset of indices class by class, keeping each class's count.

    Args:
        index: array of positions into y; must support boolean-mask indexing
        y: class labels, indexable by `index`
        seed: int seed, reused for the draw of every class
    Returns: ndarray of the resampled indices, grouped by class in the order
        np.unique reports the classes
    """
    labels_here = y[index]
    found_classes, class_counts = np.unique(labels_here, return_counts=True)
    # One draw per class, each the same size as that class's share of the subset.
    draws = [
        resample(index[labels_here == cls], n_samples=count, random_state=seed)
        for cls, count in zip(found_classes, class_counts)
    ]
    return np.array([idx for drawn in draws for idx in drawn])