#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 26 11:17:08 2020
@author: Yesh

Script to cluster slice inferences into discrete nodules.
- saves nodule predictions into a CSV
- creates a mesh file to visualize the nodules using 3D Slicer
"""
import os
import time
import argparse

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from stl import mesh
from torchvision import transforms

from dataloader_NLST import NLSTDataset, Normalizer, Resizer

plt.rcParams['figure.figsize'] = [12, 6]
def createMeshCube(corner, dx, dy, dz):
    """
    Returns a mesh cube with edge lengths dx, dy, dz along the x, y, z axes,
    with respect to an axial slice orientation.
    - corner is the posterior, inferior, left corner - I think
    """
    # the 8 vertices of the cube
    vertices = np.array([
        corner,
        corner + np.array([dx,  0,  0]),
        corner + np.array([dx, dy,  0]),
        corner + np.array([ 0, dy,  0]),
        corner + np.array([ 0,  0, dz]),
        corner + np.array([dx,  0, dz]),
        corner + np.array([dx, dy, dz]),
        corner + np.array([ 0, dy, dz]),
    ])

    # the 12 triangles composing the cube (two per face)
    faces = np.array([
        [0, 3, 1],
        [1, 3, 2],
        [0, 4, 7],
        [0, 7, 3],
        [4, 5, 6],
        [4, 6, 7],
        [5, 1, 2],
        [5, 2, 6],
        [2, 3, 6],
        [3, 7, 6],
        [0, 1, 5],
        [0, 5, 4],
    ])

    # create the mesh from the triangle list
    cube = mesh.Mesh(np.zeros(faces.shape[0], dtype=mesh.Mesh.dtype))
    for i, f in enumerate(faces):
        for j in range(3):
            cube.vectors[i][j] = vertices[f[j], :]
    return cube
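# A minimal usage sketch (hypothetical values; 'demo.stl' is an arbitrary output path):
#
#   cube = createMeshCube(corner=np.array([0.0, 0.0, 0.0]), dx=10.0, dy=10.0, dz=10.0)
#   cube.save('demo.stl')  # numpy-stl's Mesh.save writes an STL file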
def convertToWorldCoords(df_pid_yr, dcms, origin, mip):
    # - axial inferences
    df_axial = df_pid_yr[df_pid_yr['view'] == 'axial'].copy()
    df_axial['axial_slice'] = (df_axial['pred_slice_num'] - mip/2) / df_axial['axial_conversion']
    df_axial['axial_world_coord'] = [dcms[int(slice_idx)].ImagePositionPatient[2]
                                     for slice_idx in df_axial['axial_slice'].values]
    df_axial['saggital_world_coord'] = -origin[0] - (df_axial['x'] / df_axial['scale'])
    df_axial['coronal_world_coord'] = -origin[1] - (df_axial['y'] / df_axial['scale'])

    # - coronal inferences
    df_cor = df_pid_yr[df_pid_yr['view'] == 'coronal'].copy()
    df_cor['axial_slice'] = df_cor['y'] / df_cor['scale'] / df_cor['axial_conversion']
    df_cor['axial_world_coord'] = [dcms[int(slice_idx)].ImagePositionPatient[2]
                                   for slice_idx in df_cor['axial_slice'].values]
    df_cor['saggital_world_coord'] = -origin[0] - (df_cor['x'] / df_cor['scale'])
    df_cor['coronal_world_coord'] = -origin[1] - (df_cor['pred_slice_num'] - mip/2)

    # - combine axial and coronal inferences
    df_cluster = pd.concat([df_cor, df_axial], axis=0)
    return df_cluster
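# Worked example of the axial mapping above (hypothetical numbers, assuming
# axial_conversion scales MIP slice indices back to source dcm indices): a detection
# at pred_slice_num=40 with mip=8 and axial_conversion=2.0 maps to source slice
# (40 - 8/2) / 2.0 = 18, whose world z-coordinate is dcms[18].ImagePositionPatient[2].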
def main(args=None):
    parser = argparse.ArgumentParser(description='Clustering on NLST Inference by pid and study_yr')
    parser.add_argument('--inference_dir', help='Path to directory containing NLST inferences', default='')
    parser.add_argument('--save_dir', help='Where to save the nodule candidates CSV & mesh segmentation files', default='')
    parser.add_argument('--conf_thresh', help='Confidence threshold below which inferences are dropped', default=0.10, type=float)
    parser.add_argument('--nlst_dir', help='Directory of NLST data', default='../data/nlst-ct')
    parser.add_argument('--nlst_anns', help='Path to NLST annotations', default='../data/nlst_annotations/all_14_datasets/cdas_delivery_csv/sctabn.csv')
    parser.add_argument('--plot', help='Plot predictions for debugging', action='store_true')
    parser.add_argument('--fp_reducer', help='Path to false positive reducer', default=None)
    args = parser.parse_args(args)

    nodules_fn = 'nodules.csv'

    if args.fp_reducer:
        # load the false positive reducer model
        fp_reducer = joblib.load(args.fp_reducer)

    # initialize df_nodules, or resume from an already existing nodules CSV
    nodules_path = os.path.join(args.save_dir, nodules_fn)
    if os.path.exists(nodules_path):
        df_nodules = pd.read_csv(nodules_path)
        completed_pid_yrs = df_nodules['pid'].astype(str) + '_' + df_nodules['study_yr'].astype(str)
        completed_pid_yrs = completed_pid_yrs.values
    else:
        os.makedirs(args.save_dir, exist_ok=True)
        df_nodules = pd.DataFrame()
        completed_pid_yrs = []

    # get all csvs & sort in ascending timestamp order
    csvs = [f for f in os.listdir(args.inference_dir) if f.endswith('.csv')]
    csvs.sort()

    # loop through csvs
    for csv in csvs:
        time_csv0 = time.time()
        print('-'*30)
        print(csv)
        csv_base = csv.split('.')[0]

        # read csv
        df_csv = pd.read_csv(os.path.join(args.inference_dir, csv), header=0)

        # create a pid_study_yr convenience index
        df_csv['pid_yr'] = df_csv['pid'].astype(str) + '_' + df_csv['study_yr'].astype(str)

        # loop through unique pid_study_yr values
        for pid_yr in np.unique(df_csv['pid_yr']):
            # skip if already clustered
            if pid_yr in completed_pid_yrs:
                continue
            time_pid_yr0 = time.time()

            # get the subset matching pid_yr
            df_pid_yr = df_csv[df_csv['pid_yr'] == pid_yr].copy()
            pid = df_pid_yr['pid'].values[0]
            study_yr = df_pid_yr['study_yr'].values[0]
            mip = df_pid_yr['mip'].values[0]

            # apply conf_thresh
            conf_thresh = args.conf_thresh
            df_pid_yr = df_pid_yr[df_pid_yr['confidence'] > conf_thresh]

            # if no rows remain, save an empty row to the final dataframe
            # (pandas removed DataFrame.append in 2.x; pd.concat does the same job here)
            if len(df_pid_yr) == 0:
                df_nodules = pd.concat([df_nodules, pd.DataFrame({'pid': [pid], 'study_yr': [study_yr]})], sort=False)
                df_nodules = df_nodules.reset_index(drop=True)
                continue

            # use only the kernel with the highest maximum confidence score
            kernel = df_pid_yr.groupby('kernel').confidence.max().idxmax()
            df_pid_yr = df_pid_yr[df_pid_yr['kernel'] == kernel]

            # get the scan origin from NLSTDataset
            try:
                nlst = NLSTDataset(fp_anns=args.nlst_anns,
                                   data_dir=args.nlst_dir,
                                   transform=transforms.Compose([Normalizer(), Resizer()]))
                res = nlst.load_NLST_images(pid, study_yr, load_dcms=True)
                dcms = res[kernel]['dcms']
                origin = res[kernel]['image_position_patient']
            except Exception:
                print('Unable to load NLST data for {}. Check if you have the dicom data locally'.format(pid_yr))
                continue

            # calculate nodule diameter in world units
            df_pid_yr['nod_width'] = np.abs(df_pid_yr['x1'] - df_pid_yr['x2']) / df_pid_yr['scale']
            df_pid_yr['nod_height'] = np.abs(df_pid_yr['y1'] - df_pid_yr['y2']) / df_pid_yr['scale']
            df_pid_yr['diameter'] = np.max(df_pid_yr[['nod_width', 'nod_height']], axis=1)
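            # Worked example (hypothetical values, assuming `scale` converts pixels to
            # world units): x1=100, x2=140 at scale=2.0 gives nod_width = |100-140|/2.0 = 20,
            # and `diameter` takes the larger of nod_width and nod_height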
            # calculate the mean nodule position
            df_pid_yr['x'] = (df_pid_yr['x1'] + df_pid_yr['x2'])/2
            df_pid_yr['y'] = (df_pid_yr['y1'] + df_pid_yr['y2'])/2

            # convert to world coordinates
            try:
                df_cluster = convertToWorldCoords(df_pid_yr, dcms, origin, mip)
            except IndexError as e:
                print(' - {} Error: {}. May be missing some dcm files if you transferred dicoms to another computer'.format(pid_yr, e))
                continue

            # convenience: binary view indicator to simplify calculations
            df_cluster['is_coronal'] = np.array(df_cluster['view'] == 'coronal').astype(int)

            # plot axial vs coronal inferences
            if args.plot:
                f, ax = plt.subplots()
                points = ax.scatter(df_cluster['saggital_world_coord'], df_cluster['axial_world_coord'],
                                    s=(df_cluster['confidence']*20)**3.5,
                                    c=df_cluster['is_coronal'],
                                    alpha=0.1)
                f.colorbar(points)

            # 3d plot
            if args.plot:
                fig = plt.figure()
                ax = fig.add_subplot(111, projection='3d')

                # plot coronal inferences
                df_sub_cor = df_cluster[df_cluster['is_coronal'] == 1]
                ax.scatter(df_sub_cor['saggital_world_coord'], df_sub_cor['axial_world_coord'], df_sub_cor['coronal_world_coord'],
                           s=(df_sub_cor['confidence']*25)**3,
                           label='coronal inferences',
                           c='#003f5c',
                           alpha=0.1)

                # plot axial inferences
                df_sub_ax = df_cluster[df_cluster['is_coronal'] == 0]
                ax.scatter(df_sub_ax['saggital_world_coord'], df_sub_ax['axial_world_coord'], df_sub_ax['coronal_world_coord'],
                           s=(df_sub_ax['confidence']*25)**3,
                           label='axial inferences',
                           c='#ff6361',
                           alpha=0.1)

                ax.set_xlabel('Sagittal Position')
                ax.set_ylabel('Axial Position')
                ax.set_zlabel('Coronal Position')
                ax.legend()
                plt.savefig('inferences_3d_scatter.pdf', dpi=300)

            # CLUSTERING: group nearby points in world coordinates into nodule candidates
            df_cluster['cluster'] = DBSCAN(eps=10, min_samples=4).fit_predict(
                X=df_cluster[['axial_world_coord', 'saggital_world_coord', 'coronal_world_coord']])
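            # A minimal sketch of DBSCAN's labeling (hypothetical points; sklearn counts
            # the point itself toward min_samples):
            #   DBSCAN(eps=10, min_samples=4).fit_predict(np.zeros((4, 3)))  # -> [0, 0, 0, 0]
            #   DBSCAN(eps=10, min_samples=4).fit_predict(np.zeros((3, 3)))  # -> [-1, -1, -1]
            # points that never fall in a core point's eps-neighborhood are labeled -1 (noise)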
            # remove all unclustered (noise) inferences
            df_cluster = df_cluster[df_cluster['cluster'] != -1]

            # if no rows remain, save an empty row to the final dataframe
            if len(df_cluster) == 0:
                df_nodules = pd.concat([df_nodules, pd.DataFrame({'pid': [pid], 'study_yr': [study_yr]})], sort=False)
                df_nodules = df_nodules.reset_index(drop=True)
                continue

            # plot clusters
            if args.plot:
                fig = plt.figure()
                ax = fig.add_subplot(111, projection='3d')
                points = ax.scatter(df_cluster['saggital_world_coord'], df_cluster['axial_world_coord'], df_cluster['coronal_world_coord'],
                                    s=(df_cluster['confidence']*20)**4,
                                    c=df_cluster['cluster'],
                                    alpha=0.1)
                fig.colorbar(points)

            # plot clusters in 3d
            if args.plot:
                plt.style.use('seaborn')  # note: renamed to 'seaborn-v0_8' in matplotlib >= 3.6
                fig = plt.figure()
                ax = fig.add_subplot(111, projection='3d')
                df_sub = df_cluster
                ax.scatter(df_sub['saggital_world_coord'], df_sub['axial_world_coord'], df_sub['coronal_world_coord'],
                           s=(df_sub['confidence']*25)**3,
                           c=df_sub['cluster'].values,
                           label=df_sub['cluster'].values,
                           alpha=0.1)
                ax.set_xlabel('Sagittal Position')
                ax.set_ylabel('Axial Position')
                ax.set_zlabel('Coronal Position')
                plt.savefig('inferences_clustered_3d_scatter.pdf', dpi=300)

            # group inferences by cluster
            df_cluster = df_cluster.groupby('cluster').agg({
                'confidence': ['max', 'mean', 'std'],
                'axial_world_coord': ['mean'],
                'saggital_world_coord': ['mean'],
                'coronal_world_coord': ['mean'],
                'pred_slice_num': ['count'],
                'is_coronal': ['mean'],
                'diameter': ['max']})

            # collapse the multiindex columns b/c flat names are easier to work with
            df_cluster.columns = list(map('_'.join, df_cluster.columns.values))
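            # After the collapse, column names become '<col>_<agg>' strings
            # (e.g. 'confidence_max', 'axial_world_coord_mean', 'pred_slice_num_count'),
            # which is what the feature names used below refer to.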
            # an is_coronal_mean of exactly 0 or 1 means the cluster was seen in only one view
            df_cluster['is_seen_axial_and_coronal'] = np.where(df_cluster['is_coronal_mean'].isin([0, 1]), 0, 1)
            df_cluster['top_world_coord'] = dcms[0].ImagePositionPatient[2]
            df_cluster['bottom_world_coord'] = dcms[-1].ImagePositionPatient[2]
            df_cluster['dist_to_top'] = np.abs(df_cluster['axial_world_coord_mean'] - df_cluster['top_world_coord'])
            df_cluster['dist_to_bottom'] = np.abs(df_cluster['axial_world_coord_mean'] - df_cluster['bottom_world_coord'])

            if args.fp_reducer:
                # apply the false positive reducer (assumed to be a classifier exposing predict_proba)
                variables = ['is_seen_axial_and_coronal', 'pred_slice_num_count', 'confidence_max',
                             'confidence_mean', 'diameter_max', 'dist_to_top', 'dist_to_bottom']
                df_cluster['nodule_proba'] = fp_reducer.predict_proba(df_cluster[variables])[:, 1]
                df_cluster = df_cluster[df_cluster['nodule_proba'] >= 0.2]

            df_cluster = df_cluster.reset_index(drop=True)

            # if no rows remain, save an empty row to the final dataframe
            if len(df_cluster) == 0:
                df_nodules = pd.concat([df_nodules, pd.DataFrame({'pid': [pid], 'study_yr': [study_yr]})], sort=False)
                df_nodules = df_nodules.reset_index(drop=True)
                continue
            # create a cube mesh for each nodule candidate & combine them
            cubes = []
            for nod, row in df_cluster.iterrows():
                dx, dy, dz = row['diameter_max'], row['diameter_max'], row['diameter_max']
                corner = np.zeros(3)  # float array; an int array here would truncate the coordinates
                corner[0] = row['saggital_world_coord_mean'] - dx/2
                corner[1] = row['coronal_world_coord_mean'] - dy/2
                corner[2] = row['axial_world_coord_mean'] - dz/2
                cube = createMeshCube(corner, dx, dy, dz)
                cubes.append(cube)
            cubes = mesh.Mesh(np.concatenate([c.data for c in cubes]))

            # save the mesh to the save directory, grouped into folders named after the csv
            cube_dir = os.path.join(args.save_dir, csv_base)
            if not os.path.exists(cube_dir):
                os.mkdir(cube_dir)
            cubes.save(os.path.join(cube_dir, '{}_{}_{}_{}_{}.stl'.format(pid, study_yr, kernel, len(df_cluster), mip)))

            # add in metadata
            df_cluster['pid'] = pid
            df_cluster['study_yr'] = study_yr
            df_cluster['kernel'] = kernel
            df_cluster['cluster'] = df_cluster.index.values
            df_cluster['conf_thresh'] = conf_thresh

            df_nodules = pd.concat([df_nodules, df_cluster], sort=False)
            df_nodules = df_nodules.reset_index(drop=True)

            # save on each loop in case the script crashes
            df_nodules.to_csv(os.path.join(args.save_dir, nodules_fn), index=False)
            print(' - {} clusters: {}. time: {} secs'.format(pid_yr, len(df_cluster), round((time.time()-time_pid_yr0), 2)))
        print('CSV total time: {} min'.format(round((time.time()-time_csv0)/60, 2)))
if __name__ == '__main__':
    time_total0 = time.time()
    main()
    print('-'*30)
    print('Total Time: {} min'.format(round((time.time()-time_total0)/60, 2)))