analysis_nlst.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Analyze NLST
Created on Thu Dec 12 22:41:52 2019
@author: Yesh
"""
import csv
import numpy as np
import pandas as pd
import time
import os
#os.chdir('/home/bdrad1/chest_ct_projects/pytorch-retinanet')
#os.chdir('/Users/Yesh/Documents/BDRAD/chest_ct_projects/pytorch-retinanet')
import argparse
import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from dataloader_NLST import NLSTDataset, Normalizer, Resizer
print('CUDA available: {}'.format(torch.cuda.is_available()))
def listdir_nohidden(path):
    """Yield the entries of `path`, skipping hidden files."""
    for f in os.listdir(path):
        if not f.startswith('.'):
            yield f
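# Illustrative only (hypothetical paths/values): in an NLST layout where each PID
# directory holds one subdirectory per study year, this yields the visible entries:
#   sorted(listdir_nohidden('/data/NLST/100012'))  ->  ['T0', 'T1', 'T2']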
def load_csv(results_dir: str, csv_path: str, data_dir: str, pids):
    """
    Loads the sctabn annotation csv given its path and the path where results csvs are written.
    Removes examples from the sctabn csv which aren't in the local dataset, and removes
    examples for which inference has already been done.
    Arguments:
        1) results_dir: Path to where results csvs are written
        2) csv_path: Path to sctabn csv
        3) data_dir: Path to directory containing all data
        4) pids: Number of examples (int) or list of PIDs to perform inference on;
           None to run inference on all locally stored examples.
    Returns:
        anns: pandas.DataFrame object with annotations
    """
    # Location the filtered csv will be saved; avoids repeating the filtering below
    save_path = os.path.join(os.path.dirname(csv_path), 'sctabn_massive_annotation.csv')
    if not os.path.exists(save_path):
        anns = pd.read_csv(csv_path)
        remove_idx = []
        for index in range(len(anns)):
            ann = anns.iloc[index]
            pid = ann['pid']
            study_yr = ann['STUDY_YR']
            pid_dir = os.path.join(data_dir, str(pid))
            if not os.path.isdir(pid_dir):  # we don't have the PID locally
                remove_idx.append(index)
                continue
            timestamps = sorted(listdir_nohidden(pid_dir))
            study_yr = 'T' + str(study_yr)
            if study_yr not in timestamps:  # we have the PID but not this study year
                remove_idx.append(index)
        anns.drop(remove_idx, axis=0, inplace=True)
        anns.to_csv(save_path, index=False)
    else:
        anns = pd.read_csv(save_path)

    # If the results directory exists, continue annotations from there
    # by removing already-inferenced anns from the dataloader.
    if os.path.exists(results_dir):
        # collect all pid-study_yr pairs already inferenced; used to drop them from `anns`
        df_preds = pd.DataFrame(columns=['pid', 'study_yr'])
        last_f = None
        for f in sorted(os.listdir(results_dir)):
            if f.endswith('.csv'):
                df_temp = pd.read_csv(os.path.join(results_dir, f), header=0)
                # keep only unique pid-study_yr pairs to prevent the dataframe from growing too large
                df_temp = df_temp.drop_duplicates(subset=['pid', 'study_yr'])
                df_preds = pd.concat([df_preds, df_temp[['pid', 'study_yr']]], ignore_index=True)
                last_f = f
        if last_f is not None:
            # the last pid-study_yr may be incomplete: drop it from BOTH the last csv
            # and df_preds so this pid gets re-inferenced
            last_pid = df_preds.tail(1)['pid'].values[0]
            last_yr = df_preds.tail(1)['study_yr'].values[0]
            df_last_preds = pd.read_csv(os.path.join(results_dir, last_f), header=0)
            # drop the last pid-study_yr and re-save the last results csv
            df_last_preds = df_last_preds[~((df_last_preds['pid'] == last_pid) & (df_last_preds['study_yr'] == last_yr))]
            df_last_preds.to_csv(os.path.join(results_dir, last_f), index=False)
            # drop it from df_preds so it won't be dropped from anns
            df_preds = df_preds[~((df_preds['pid'] == last_pid) & (df_preds['study_yr'] == last_yr))]
            # drop pid-study_yr *pairs* already in preds (matching pid and year
            # together, not each column independently)
            done = set(zip(df_preds['pid'], df_preds['study_yr']))
            keep = [(p, y) not in done for p, y in zip(anns['pid'], anns['STUDY_YR'])]
            anns = anns[keep]
    else:
        os.mkdir(results_dir)

    if isinstance(pids, int):
        anns = anns[:pids]
    elif isinstance(pids, list):
        anns = anns[anns['pid'].isin(pids)]
    return anns
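# Illustrative call (hypothetical paths; the PIDs are the large-nodule test PIDs
# noted in main() below):
#   anns = load_csv(results_dir='./results',
#                   csv_path='/data/NLST/sctabn.csv',
#                   data_dir='/data/NLST',
#                   pids=[207375, 210581])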
def main(args=None):
    parser = argparse.ArgumentParser(description='NLST inference script for retinanet.')
    parser.add_argument('--datadir', help='Path to NLST folder')
    parser.add_argument('--anns', help='Path to NLST sctabn.csv annotations')
    parser.add_argument('--model', help='Path to retinanet model')
    parser.add_argument('--pids', help='load an array of specific PIDs (optional)', type=int, nargs='+', default=None)  # test pids with large nodules: [207375, 210581]
    parser.add_argument('--mip', help='apply maximum intensity projection', type=int, default=25)
    parser.add_argument('--nodule_cts_only', help='apply inference only to CT scans with a known nodule', action='store_true')
    parser.add_argument('--no_voxel_transform', help='do not apply the 1mm transformation to voxels', action='store_true')
    parser.add_argument('--results_dir', help='directory to save results csvs to, or directory from which to continue')
    parser.add_argument('--no_mask', help='do NOT apply the lung mask (i.e. do not segment out the lungs with manual thresholding)', action='store_true')
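    # Example invocation (paths are hypothetical):
    #   python analysis_nlst.py --datadir /data/NLST --anns /data/NLST/sctabn.csv \
    #       --model ./retinanet_model.pt --results_dir ./results --mip 25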
    args = parser.parse_args(args)
    np.random.seed(10)
    results_dir = args.results_dir
    anns = load_csv(results_dir=results_dir, csv_path=args.anns,
                    data_dir=args.datadir, pids=args.pids)

    # Load dataloader
    nlst = NLSTDataset(fp_anns=anns,
                       data_dir=args.datadir,
                       nodule_cts_only=args.nodule_cts_only,
                       transform=transforms.Compose([Normalizer(), Resizer()]),
                       mip=args.mip,
                       voxel_transform=not args.no_voxel_transform,
                       mask=not args.no_mask)
    dataloader = DataLoader(nlst, shuffle=False)
    print('Remaining PID-YRs: {}'.format(nlst.anns.shape[0]))
    if args.pids:
        print('Inferencing: {}'.format(np.unique(nlst.anns['pid'])))

    # Load model
    assert torch.cuda.is_available()
    retinanet = torch.load(args.model)
    retinanet = retinanet.cuda()
    retinanet.eval()

    # index for the results files
    results_file_index = round(time.time())
    cts_per_results_file = 100  # how many pid-year combos per results file
    results_csv_path = None
    total_time0 = time.time()
    preprocess_time_0 = time.time()
    for idx, data in enumerate(dataloader):
        if not data:
            continue
        preprocess_time = time.time() - preprocess_time_0
        print('preprocessing time: {:.2f}s'.format(preprocess_time))

        # initialize the csv filename; roll over to a new results file every
        # `cts_per_results_file` scans (the `None` check covers the case where
        # earlier iterations were skipped and no file has been opened yet)
        if idx % cts_per_results_file == 0 or results_csv_path is None:
            results_file_index = round(time.time())
            results_filename = 'NLST_inferences_' + str(results_file_index) + '.csv'
            results_csv_path = os.path.join(results_dir, results_filename)

        for kernel, kernel_data in data.items():
            # iterate through axial and coronal views
            for view, item in kernel_data.items():
                ct = item['img_array']
                file_path = item['file_path'][0]
                scale = float(item['scale'])
                axial_conversion = float(item['axial_conversion'])
                saggital_conversion = float(item['saggital_conversion'])
                coronal_conversion = float(item['coronal_conversion'])
                pid = int(item['pid'])
                study_yr = int(item['study_yr'])
                print('Inferencing {}_{}_{}_{}'.format(pid, study_yr, kernel, view))
                inference_time_list = []
                for slice_idx, ct_slice in enumerate(ct[0]):  # NOTE: this loop currently only works with a batch size of 1
                    with torch.no_grad():
                        h, w = ct_slice.shape[:2]
                        # add a batch dim, move channels first, and repeat the single
                        # channel 3x to match the 3-channel input the model expects
                        ct_input = ct_slice.unsqueeze(0)
                        ct_input = ct_input.permute(0, 3, 1, 2)
                        ct_input = torch.cat((ct_input, ct_input, ct_input), dim=1)
                        inference_time_0 = time.time()
                        scores, classifications, transformed_anchors = retinanet(ct_input.cuda().float())
                        inference_time = time.time() - inference_time_0
                        inference_time_list.append(inference_time)
                        for score, classification, transformed_anchor in zip(scores, classifications, transformed_anchors):
                            bbox = transformed_anchor.cpu().numpy()
                            x1 = int(bbox[0])
                            y1 = int(bbox[1])
                            x2 = int(bbox[2])
                            y2 = int(bbox[3])
                            confidence = float(score.cpu())
                            inference_time_per_prediction = inference_time / len(scores)
                            # initialize csv headers
                            if not os.path.exists(results_csv_path):
                                with open(results_csv_path, 'w', newline='') as csvfile:
                                    writer = csv.writer(csvfile, delimiter=',')
                                    writer.writerow(['pid', 'study_yr', 'kernel', 'view', 'pred_slice_num',
                                                     'scale', 'axial_conversion', 'saggital_conversion', 'coronal_conversion',
                                                     'x1', 'y1', 'x2', 'y2', 'confidence', 'file_path', 'mip',
                                                     'inference_time', 'preprocess_time', 'model_fp', 'height', 'width', 'timestamp',
                                                     'voxel_transform'])
                            # write to csv
                            with open(results_csv_path, 'a', newline='') as csvfile:
                                csv_array = [pid, study_yr, kernel, view, slice_idx,
                                             scale, axial_conversion, saggital_conversion, coronal_conversion,
                                             x1, y1, x2, y2, confidence, file_path, args.mip,
                                             inference_time_per_prediction, preprocess_time, args.model, h, w, time.time(),
                                             not args.no_voxel_transform]
                                writer = csv.writer(csvfile, delimiter=',')
                                writer.writerow(csv_array)
                print('- inference time: {:.2f}s'.format(sum(inference_time_list)))

        # reset preprocessing timer
        preprocess_time_0 = time.time()

        # estimate total inference time remaining = (number of PID-YRs remaining) * (total time / PID-YRs inferenced)
        remaining_pidyrs = len(nlst.anns) - (idx + 1)
        time_remaining_estimate_seconds = remaining_pidyrs * ((time.time() - total_time0) / (idx + 1))
        print('PID-YRs Remaining: {}'.format(remaining_pidyrs))
        print('- time remaining estimate: {:.2f} hours'.format(time_remaining_estimate_seconds / 60 / 60))

        # clean up big items to reduce memory usage
        del data, kernel_data, item, ct, ct_input
if __name__ == '__main__':
    main()
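# Illustrative follow-up (an assumption, not part of this script's pipeline): the
# results csvs written above can be concatenated for downstream analysis, e.g.
#   import glob
#   df = pd.concat((pd.read_csv(f) for f in sorted(glob.glob('./results/NLST_inferences_*.csv'))),
#                  ignore_index=True)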