preprocess.py
import argparse

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import GroupShuffleSplit
from sklearn.utils import resample

def undersample_negatives(df, labels, ratio=0.3):
    """Undersample rows with no positive findings so that negatives make up
    roughly `ratio` of the returned frame (negatives = positives * ratio / (1 - ratio))."""
    negative = df[df[labels].sum(axis=1) == 0]
    positive = df[df[labels].sum(axis=1) > 0]
    print(f'Negatives before undersampling: {len(negative)}')
    negative = resample(negative,
                        replace=False,
                        n_samples=int(len(positive) * (ratio / (1 - ratio))),
                        random_state=42)
    print(f'Negatives after undersampling: {len(negative)}')
    return pd.concat([positive, negative])
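
# For example, with ratio=0.3 and 10,000 positive rows, the call keeps
# int(10000 * 0.3 / 0.7) = 4,285 negative rows, so negatives end up as
# roughly 30% of the returned frame.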

def main(ratio):
    # Get the labels and read the original metadata
    labels = ['Atelectasis',
              'Cardiomegaly',
              'Consolidation',
              'Edema',
              'Effusion',
              'Emphysema',
              'Fibrosis',
              'Hernia',
              'Infiltration',
              'Mass',
              'Nodule',
              'Pleural_Thickening',
              'Pneumonia',
              'Pneumothorax']
    metadata = pd.read_csv('./labels/Data_Entry_2017_v2020.csv', delimiter=',')
    # Encode the labels with a multi-label friendly encoding (one binary column per finding)
    for label in labels:
        metadata[label] = metadata['Finding Labels'].apply(lambda x: 1 if label in x else 0)
    metadata = metadata.drop(columns=['Finding Labels', 'Follow-up #', 'Patient Age', 'Patient Gender', 'View Position',
                                      'OriginalImage[Width', 'Height]', 'OriginalImagePixelSpacing[x', 'y]'])
    # Split into train/val/test grouped by patient ID so no patient ends up in more than one split
    gss_test = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
    train_val_idx, test_idx = next(gss_test.split(metadata, groups=metadata['Patient ID']))
    train_val_metadata = metadata.iloc[train_val_idx]
    test_metadata = metadata.iloc[test_idx]
    # 0.125 of the remaining 80% gives roughly a 70/10/20 train/val/test split
    gss_train_val = GroupShuffleSplit(test_size=0.125, n_splits=1, random_state=42)
    train_idx, val_idx = next(gss_train_val.split(train_val_metadata, groups=train_val_metadata['Patient ID']))
    train_metadata = train_val_metadata.iloc[train_idx]
    val_metadata = train_val_metadata.iloc[val_idx]
    # Drop the patient ID column now that the splits are made
    train_metadata = train_metadata.drop(columns=['Patient ID'])
    val_metadata = val_metadata.drop(columns=['Patient ID'])
    test_metadata = test_metadata.drop(columns=['Patient ID'])
    # Undersample the "No Finding" (all-negative) rows
    print(f'Applying an undersampling ratio of: {ratio}')
    train_metadata = undersample_negatives(train_metadata, labels, ratio)
    val_metadata = undersample_negatives(val_metadata, labels, ratio)
    test_metadata = undersample_negatives(test_metadata, labels, ratio)
    # Write the new metadata to csv so it is easier to load later
    train_metadata.to_csv('./labels/train_metadata.csv', index=False)
    val_metadata.to_csv('./labels/val_metadata.csv', index=False)
    test_metadata.to_csv('./labels/test_metadata.csv', index=False)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Preprocess the NIH dataset metadata for the model")
    parser.add_argument('--ratio', type=float, default=0.3,
                        help='Target fraction of negative (no-finding) samples after undersampling (default: 0.3)')
    args = parser.parse_args()
    main(args.ratio)
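
# Example invocation (a sketch; it assumes Data_Entry_2017_v2020.csv sits in
# ./labels/ relative to the working directory, as the hard-coded path above expects):
#   python preprocess.py --ratio 0.3
# This writes train_metadata.csv, val_metadata.csv and test_metadata.csv into ./labels/.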