-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathload_data.py
117 lines (97 loc) · 3.86 KB
/
load_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import scipy
import SimpleITK
from sklearn.cluster import KMeans
from skimage import morphology, measure
import scipy
# Define function to import scans for one patient
def import_img_series(path):
    """Read the DICOM series found under *path* and return it as one image.

    The series file names are discovered with
    ``ImageSeriesReader.GetGDCMSeriesFileNames`` and handed back to the same
    reader; the value returned is whatever ``Execute()`` produces.
    """
    series_reader = SimpleITK.ImageSeriesReader()
    dicom_files = series_reader.GetGDCMSeriesFileNames(path)
    series_reader.SetFileNames(dicom_files)
    image = series_reader.Execute()
    return image
# Define a function to load the DICOM images of one patient
def load_images(patient_id, path=''):
    """Load the image series belonging to *patient_id*.

    ``path`` is treated as a plain string prefix: the series is read from
    ``path + patient_id``, so ``path`` has to end with a path separator
    already when it names a directory.
    """
    return import_img_series(path + patient_id)
# Define a function to remove image noise
def remove_noise(img):
    """Return *img* smoothed by SimpleITK's ``CurvatureFlow`` filter.

    The filter is run for 5 iterations with a time step of 0.125.
    """
    return SimpleITK.CurvatureFlow(
        image1=img,
        timeStep=0.125,
        numberOfIterations=5,
    )
# Define a function to reshape images
def resample(img, target_shape=(84, 64, 64), crop=10):
    """Interpolate an image volume to a fixed shape and trim its first axis.

    Parameters
    ----------
    img : SimpleITK image
        Volume to resample.  It is converted to a NumPy array with
        ``SimpleITK.GetArrayFromImage``.
    target_shape : sequence of three numbers, optional
        Shape of the array right after interpolation, given in the axis
        order of the converted array.  Defaults to ``(84, 64, 64)``.
    crop : int, optional
        Number of slices removed from *each* end of the first axis after
        interpolation.  Defaults to 10 (84 - 2 * 10 = 64 slices with the
        default ``target_shape``).  Values ``<= 0`` disable cropping.

    Returns
    -------
    numpy.ndarray
        The interpolated and cropped volume.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.ndimage`` is loaded, and the old
    # ``scipy.ndimage.interpolation`` namespace is deprecated.
    from scipy import ndimage

    # Current size of the volume (not its pixel spacing).  GetSize() is read
    # in reverse index order so the entries line up with the axes of the
    # array returned by GetArrayFromImage.
    img_size = img.GetSize()
    current_shape = np.array([img_size[2], img_size[1], img_size[0]])
    resize_factor = np.asarray(target_shape, dtype=float) / current_shape

    arr = SimpleITK.GetArrayFromImage(img)
    arr_resampled = ndimage.zoom(arr, resize_factor)

    # ``arr[0:-0]`` would be empty, so only slice when there is something
    # to cut off.
    if crop > 0:
        arr_resampled = arr_resampled[crop:-crop]
    return arr_resampled
# Segment the lungs
def segment_lungs(arr):
    """Mask out and normalise the lung region of a scan volume.

    The procedure is:

    1. Replace the voxels equal to the global maximum / minimum by the mean
       of a central sub-volume (removes the saturated "halo" values).
    2. Cluster the central sub-volume into two intensity groups with
       k-means and threshold the whole volume halfway between the two
       cluster centres.
    3. Clean the thresholded volume with one erosion and one dilation,
       label the connected regions and keep those whose bounding box passes
       a set of empirical cuts.
    4. Dilate the union of the kept regions once more to obtain the mask.
    5. Standardise the intensities with the mean / standard deviation
       measured inside the mask and return ``normalised * mask``.

    Parameters
    ----------
    arr : numpy.ndarray
        3-D volume.  The slice bounds and bounding-box cuts are hard-coded;
        presumably they are tuned for the 64-voxel volumes produced by
        ``resample`` -- confirm before using other shapes.

    Returns
    -------
    numpy.ndarray
        Normalised intensities inside the lung mask, zero elsewhere.  If no
        region passes the cuts the mask is empty and the statistics (and
        therefore the result) are NaN.

    Notes
    -----
    The input array is not modified; all in-place edits happen on a copy.
    ``KMeans`` is run without a fixed ``random_state``.
    """
    # Work on a private copy so the caller's array is left untouched.
    volume = np.array(arr, copy=True)

    # ``middle`` is a view into ``volume``: the value replacements below are
    # visible through it, which is what the clustering step relies on.
    middle = volume[5:60, 10:45, 10:45]
    middle_mean = np.mean(middle)
    global_max = np.max(volume)
    global_min = np.min(volume)
    # Move the under-/overflow bins to the central mean.
    volume[volume == global_max] = middle_mean
    volume[volume == global_min] = middle_mean

    # Two-cluster k-means on the central intensities; the threshold is the
    # midpoint between the two cluster centres.
    kmeans = KMeans(n_clusters=2).fit(middle.reshape(-1, 1))
    threshold = np.mean(kmeans.cluster_centers_)
    thresh_arr = np.where(volume < threshold, 1.0, 0.0)

    # Erosion followed by dilation removes small incursions of radio-opaque
    # tissue into the lung region before the regions are labelled.
    eroded = morphology.erosion(thresh_arr)
    dilated = morphology.dilation(eroded)
    labels = measure.label(dilated)

    # Keep only regions whose bounding box passes the empirical cuts.  These
    # thresholds were determined empirically and may not be generally
    # applicable.  The indexing assumes a six-entry bounding box (lower
    # bounds of the three axes followed by the upper bounds).
    # NOTE(review): the last cut tests box[2], the lower bound on axis 2,
    # while the position cut before it refers to axis 1 -- confirm that this
    # is the intended condition.
    good_labels = []
    for prop in measure.regionprops(labels):
        box = prop.bbox
        if (
            box[4] - box[1] < 60
            and box[5] - box[2] < 60
            and box[1] > 5
            and box[2] < 60
        ):
            good_labels.append(prop.label)

    # Lung mask (not a nodule mask): union of the accepted regions, dilated
    # once more to fill in and out the lung outline.
    mask = np.where(np.isin(labels, good_labels), 1, 0)
    mask = morphology.dilation(mask)

    # Statistics of the voxels inside the mask, used for renormalisation.
    inside = volume[mask > 0]
    new_mean = np.mean(inside)
    new_std = np.std(inside)

    # Push the background value up to the lower end of the lung intensity
    # range before standardising.
    old_min = np.min(volume)
    volume[volume == old_min] = new_mean - 1.2 * new_std
    volume = (volume - new_mean) / new_std
    return volume * mask
# Final function to get segmented lungs out of folder path
# Denoising hasn't been included so far
def get_lungs_arr(patient_id, path=''):
    """Return the segmented lung array for one patient.

    Pipeline: ``load_images`` -> ``resample`` -> ``segment_lungs``.
    ``remove_noise`` is not part of the pipeline.
    """
    return segment_lungs(resample(load_images(patient_id, path)))
# Get patient IDs from CSV file
# Preprocess every patient listed in the labels file and write one .npy file
# per patient.
train_labels = pd.read_csv('stage1_labels.csv')

# Create the output directory up front so the first np.save() cannot fail
# after an expensive preprocessing run.
output_dir = "../64x64x64_Images"
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# The loop variable is deliberately not called ``id`` so the builtin of that
# name is not shadowed at module scope.
for patient_id in train_labels.loc[:, 'id']:
    # NOTE(review): the array is wrapped in a SimpleITK image before being
    # handed to np.save(); saving the NumPy array returned by
    # get_lungs_arr() directly looks like the intent -- confirm what the
    # code that loads these files expects before changing it.
    patient = SimpleITK.GetImageFromArray(get_lungs_arr(patient_id, '../stage1/'))
    np.save("../64x64x64_Images/{}.npy".format(patient_id), patient)