-
Notifications
You must be signed in to change notification settings - Fork 1
/
linear_evaluation.py
111 lines (95 loc) · 4.21 KB
/
linear_evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import sys
import pandas as pd
import numpy as np
from glob import glob
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from argparse import ArgumentParser
def main(csv=None, dataset=None, splitdir=None):
    """
    Evaluates precomputed embeddings with a cross-validated logistic regression and prints the AUC.

    The csv must contain an 'ID' and a 'label' column. The dataset directory must contain
    'ids.npy' and 'embeddings.npy'; row i of the embeddings is used for the sample ids[i].
    Rows of the csv whose ID is not in 'ids.npy' are dropped (their count is reported).

    args:
        csv: path to the csv file. If None, csv, dataset and splitdir are all read from the
            command line (--csv, --dataset, --splitdir) instead.
        dataset: path to the directory containing 'ids.npy' and 'embeddings.npy'
        splitdir: path to the folder containing the split files, or None to use random
            stratified folds
    raises:
        ValueError: if no ID of the csv is found in the dataset, or if there is no split to evaluate
    """
    if csv is None:
        parser = ArgumentParser()
        parser.add_argument('--csv', type=str, required=True, help='Path to csv file')
        parser.add_argument('--dataset', type=str, required=True, help='Dataset name')
        parser.add_argument('--splitdir', type=str, default=None, help='Path to split directory')
        args = parser.parse_args()
        csv, dataset, splitdir = args.csv, args.dataset, args.splitdir

    df = pd.read_csv(csv)
    ids = list(np.load(os.path.join(dataset, 'ids.npy')))
    embeddings = np.load(os.path.join(dataset, 'embeddings.npy'))

    # Maps each ID to its first position in ids: same answer as ids.index(), without the
    # linear scan per lookup.
    row_of = {}
    for row, slide_id in enumerate(ids):
        row_of.setdefault(slide_id, row)

    # Cleans the csv from slides not in DB and keeps track of the number of slides removed.
    in_db = np.array([slide_id in row_of for slide_id in df['ID']], dtype=bool)
    n_removed = int((~in_db).sum())
    df = df[in_db]
    if df.empty:
        raise ValueError('None of the IDs in {} were found in {}'.format(csv, dataset))

    # Reorders X according to the order of the csv.
    X = embeddings[[row_of[slide_id] for slide_id in df['ID']]]
    y = df['label'].values

    # Gets the test/train splits (random stratified folds, or the ones stored in splitdir).
    splits = split_data(X, y, list(df['ID'].values), splitdir)

    # The binary and the multi-class cases need different inputs for roc_auc_score.
    is_binary = len(np.unique(y)) == 2

    # Performs the cross-validation.
    scores = []
    train_size = test_size = None
    for train, test in splits:
        # Only the sizes of the last split are reported below.
        train_size, test_size = len(train), len(test)
        logreg = LogisticRegression(C=10, max_iter=1000, class_weight='balanced')
        logreg.fit(X[train], y[train])
        if is_binary:
            y_pred = logreg.predict_proba(X[test])[:, 1]
            score = roc_auc_score(y[test], y_pred)
        else:
            y_pred = logreg.predict_proba(X[test])
            score = roc_auc_score(y[test], y_pred, multi_class="ovr")
        scores.append(score)

    # Without this check an empty split iterator would only fail later, on an unbound train_size.
    if not scores:
        raise ValueError('No train/test split to evaluate (splitdir: {})'.format(splitdir))

    # Nice printing of the results reporting the name of the csv used and the split strategy.
    split_str = 'random' if splitdir is None else 'fixed'
    # Prints spaces before and after the results
    print('\n')
    header = '--- Results for {} using {} split: ---'.format(csv, split_str)
    print(header)
    if splitdir:
        print('Split directory: {}'.format(splitdir))
    print('train / test size: {}/{}'.format(train_size, test_size))
    print('Number of slides removed: {}'.format(n_removed))
    print('Mean AUC: {}'.format(np.mean(scores)))
    print('Std AUC: {}'.format(np.std(scores)))
    print('-' * len(header))
    print('\n')
def split_data(X, y, ids, splitdir, n_splits=5, random_state=None):
    """
    Returns an iterator over (train, test) pairs of index arrays.
    If splitdir is None, the data is split randomly into n_splits stratified folds. Otherwise, the
    data is split according to the split files in splitdir (n_splits and random_state are then
    ignored: there is one split per file).

    args:
        X: samples, one row per sample
        y: labels of the samples, used to stratify the random folds
        ids: list of names of the samples, in the same order as X and y
        splitdir: path to the folder containing the splits, or None for random splits
        n_splits: number of random folds
        random_state: seed of the random shuffling; the default None keeps the splits
            different from one call to the next
    returns:
        iterator of tuples (train, test) where train and test are the indices of the samples in
        the train and test set
    """
    if splitdir is None:
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        return skf.split(X, y)
    return get_index_iterator_from_splitdir(splitdir, ids)
def get_index_iterator_from_splitdir(splitdir, names):
    """
    yield one tuple (train, test) per split file, where train and test are the indices of the
    samples in the train and test set
    takes them in the splitdir folder corresponding to the current experiment (must coincide with
    the good data table)
    the split files are the csv files matching 'split_[0-9].csv'; each one must have a 'train' and
    a 'test' column listing sample names. They are visited in sorted filename order, so the order
    of the splits does not depend on the filesystem.

    args:
        splitdir: path to the folder containing the splits
        names: list of names of the samples
    yields:
        tuples (train, test) where train and test are arrays with the positions, in names, of the
        samples in the train and test set. Nothing is yielded if no split file is found.
    """
    # glob gives no ordering guarantee; sorting makes split i the same on every machine.
    split_files = sorted(glob(f'{splitdir}/split_[0-9].csv'))
    for split_file in split_files:
        split = pd.read_csv(split_file)
        # A shorter column is padded with NaN when the csv is read; NaN is not a sample name.
        train_names = split['train'].dropna().to_numpy()
        test_names = split['test'].dropna().to_numpy()
        train = np.where(np.isin(names, train_names))[0]
        test = np.where(np.isin(names, test_names))[0]
        yield train, test
# Script entry point: with no arguments, main() reads its options from the command line.
if __name__ == '__main__':
    main()