-
Notifications
You must be signed in to change notification settings - Fork 26
/
pixie_preprocessing.py
410 lines (338 loc) · 16.6 KB
/
pixie_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
import multiprocessing
import os
import random
from shutil import rmtree
import feather
from functools import partial
import numpy as np
import pandas as pd
import scipy.ndimage as ndimage
from skimage.io import imread
from alpineer import io_utils, load_utils, misc_utils
from ark.phenotyping import pixel_cluster_utils
multiprocessing.set_start_method('spawn', force=True)
def create_fov_pixel_data(fov, channels, img_data, seg_labels, pixel_thresh_val,
blur_factor=2, subset_proportion=0.1):
"""Preprocess pixel data for one fov
Args:
fov (str):
Name of the fov to index
channels (list):
List of channels to subset over
img_data (numpy.ndarray):
Array representing image data for one fov
seg_labels (numpy.ndarray):
Array representing segmentation labels for one fov
pixel_thresh_val (float):
value used to determine per-pixel cutoff for total signal inclusion
blur_factor (int):
The sigma to set for the Gaussian blur
subset_proportion (float):
The proportion of pixels to take from each fov
Returns:
tuple:
Contains the following:
- `pandas.DataFrame`: Gaussian blurred and channel sum normalized pixel data for a fov
- `pandas.DataFrame`: subset of the preprocessed pixel dataset for a fov
"""
# for each marker, compute the Gaussian blur
for marker in range(len(channels)):
img_data[:, :, marker] = ndimage.gaussian_filter(img_data[:, :, marker],
sigma=blur_factor)
# flatten each image, make sure to subset only on channels
pixel_mat = img_data.reshape(-1, len(channels))
# convert into a dataframe
pixel_mat = pd.DataFrame(pixel_mat, columns=channels)
# assign metadata about each entry
pixel_mat['fov'] = fov
pixel_mat['row_index'] = np.repeat(range(img_data.shape[0]), img_data.shape[1])
pixel_mat['column_index'] = np.tile(range(img_data.shape[1]), img_data.shape[0])
# assign segmentation labels if it is not None
if seg_labels is not None:
seg_labels_flat = seg_labels.flatten()
pixel_mat['segmentation_label'] = seg_labels_flat
# remove any rows with channels with a sum below the threshold
rowsums = pixel_mat[channels].sum(axis=1)
pixel_mat = pixel_mat.loc[rowsums > pixel_thresh_val, :].reset_index(drop=True)
# normalize the row sums of pixel mat
pixel_mat = pixel_cluster_utils.normalize_rows(pixel_mat, channels, seg_labels is not None)
# subset the pixel matrix for training
pixel_mat_subset = pixel_mat.sample(frac=subset_proportion)
return pixel_mat, pixel_mat_subset
def preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix,
img_sub_folder, is_mibitiff, channels, blur_factor,
subset_proportion, pixel_thresh_val, seed, channel_norm_df, fov):
"""Helper function to read in the FOV-level pixel data, run `create_fov_pixel_data`,
and save the preprocessed data.
Args:
base_dir (str):
The path to the data directories
tiff_dir (str):
Name of the directory containing the tiff files
data_dir (str):
Name of the directory which contains the full preprocessed pixel data
subset_dir (str):
The name of the directory containing the subsetted pixel data
seg_dir (str):
Name of the directory containing the segmented files.
Set to `None` if no segmentation directory is available or desired.
seg_suffix (str):
The suffix that the segmentation images use.
Ignored if `seg_dir` is `None`.
img_sub_folder (str):
Name of the subdirectory inside `tiff_dir` containing the tiff files.
Set to `None` if there isn't any.
is_mibitiff (bool):
Whether to load the images from MIBITiff
channels (list):
List of channels to subset over, applies only to `pixel_mat_subset`
blur_factor (int):
The sigma to set for the Gaussian blur
subset_proportion (float):
The proportion of pixels to take from each fov
pixel_thresh_val (float):
The value to normalize the pixels by
seed (int):
The random seed to set for subsetting
channel_norm_df (pandas.DataFrame):
The channel normalization values to use
fov (str):
The name of the FOV to preprocess
Returns:
pandas.DataFrame:
The full preprocessed pixel dataset, needed for computing
99.9% normalized values in `create_pixel_matrix`
"""
# load img_xr from MIBITiff or directory with the fov
if is_mibitiff:
img_xr = load_utils.load_imgs_from_mibitiff(
tiff_dir, mibitiff_files=[fov])
else:
img_xr = load_utils.load_imgs_from_tree(
tiff_dir, img_sub_folder=img_sub_folder, fovs=[fov])
# ensure the provided channels will actually exist in img_xr
misc_utils.verify_in_list(
provided_chans=channels,
pixel_mat_chans=img_xr.channels.values
)
# if seg_dir is None, leave seg_labels as None
seg_labels = None
# otherwise, load segmentation labels in for fov
if seg_dir is not None:
seg_labels = imread(os.path.join(seg_dir, fov + seg_suffix))
# subset for the channel data
img_data = img_xr.loc[fov, :, :, channels].values.astype(np.float32)
# create vector for normalizing image data
norm_vect = channel_norm_df.iloc[0].values
norm_vect = np.array(norm_vect).reshape([1, 1, len(norm_vect)])
# normalize image data
img_data = img_data / norm_vect
# set seed for subsetting
np.random.seed(seed)
# create the full and subsetted fov matrices
pixel_mat, pixel_mat_subset = create_fov_pixel_data(
fov=fov, channels=channels, img_data=img_data, seg_labels=seg_labels,
pixel_thresh_val=pixel_thresh_val, blur_factor=blur_factor,
subset_proportion=subset_proportion
)
# write complete dataset to feather, needed for cluster assignment
feather.write_dataframe(pixel_mat,
os.path.join(base_dir,
data_dir,
fov + ".feather"),
compression='uncompressed')
# write subseted dataset to feather, needed for training
feather.write_dataframe(pixel_mat_subset,
os.path.join(base_dir,
subset_dir,
fov + ".feather"),
compression='uncompressed')
return pixel_mat
def create_pixel_matrix(fovs, channels, base_dir, tiff_dir, seg_dir,
img_sub_folder="TIFs", seg_suffix='_whole_cell.tiff',
pixel_output_dir='pixel_output_dir',
data_dir='pixel_mat_data',
subset_dir='pixel_mat_subsetted',
norm_vals_name='channel_norm_post_rowsum.feather', is_mibitiff=False,
blur_factor=2, subset_proportion=0.1, seed=42,
channel_percentile=0.99, multiprocess=False, batch_size=5):
"""For each fov, add a Gaussian blur to each channel and normalize channel sums for each pixel
Saves data to `data_dir` and subsetted data to `subset_dir`
Args:
fovs (list):
List of fovs to subset over
channels (list):
List of channels to subset over, applies only to `pixel_mat_subset`
base_dir (str):
The path to the data directories
tiff_dir (str):
Name of the directory containing the tiff files
seg_dir (str):
Name of the directory containing the segmented files.
Set to `None` if no segmentation directory is available or desired.
img_sub_folder (str):
Name of the subdirectory inside `tiff_dir` containing the tiff files.
Set to `None` if there isn't any.
seg_suffix (str):
The suffix that the segmentation images use.
Ignored if `seg_dir` is `None`.
pixel_output_dir (str):
The name of the data directory containing the pixel data to use for the
clustering pipeline. `data_dir` and `subset_dir` should be placed here.
data_dir (str):
Name of the directory which contains the full preprocessed pixel data.
Should be placed in `pixel_output_dir`.
subset_dir (str):
The name of the directory containing the subsetted pixel data.
Should be placed in `pixel_output_dir`.
norm_vals_name (str):
The name of the file to store the 99.9% normalization values
is_mibitiff (bool):
Whether to load the images from MIBITiff
blur_factor (int):
The sigma to set for the Gaussian blur
subset_proportion (float):
The proportion of pixels to take from each fov
seed (int):
The random seed to set for subsetting
channel_percentile (float):
Percentile used to normalize channels to same range
multiprocess (bool):
Whether to use multiprocessing or not
batch_size (int):
The number of FOVs to process in parallel, ignored if `multiprocess` is `False`
"""
# if the subset_proportion specified is out of range
if subset_proportion <= 0 or subset_proportion > 1:
raise ValueError('Invalid subset percentage entered: must be in (0, 1]')
# path validation
io_utils.validate_paths([base_dir, tiff_dir, os.path.join(base_dir, pixel_output_dir)])
# create data_dir if it doesn't already exist
if not os.path.exists(os.path.join(base_dir, data_dir)):
os.mkdir(os.path.join(base_dir, data_dir))
# create subset_dir if it doesn't already exist
if not os.path.exists(os.path.join(base_dir, subset_dir)):
os.mkdir(os.path.join(base_dir, subset_dir))
# define path to channel normalization values
channel_norm_path = os.path.join(
base_dir, pixel_output_dir, 'channel_norm.feather'
)
# define path to pixel normalization values
pixel_thresh_path = os.path.join(
base_dir, pixel_output_dir, 'pixel_thresh.feather'
)
# reset entire cohort if channels provided are different from ones in existing channel_norm
if os.path.exists(channel_norm_path):
channel_norm_df = feather.read_dataframe(channel_norm_path)
if set(channel_norm_df.columns.values) != set(channels):
print("New channels provided: overwriting whole cohort")
# delete the existing data in data_dir and subset_dir
rmtree(os.path.join(base_dir, data_dir))
os.mkdir(os.path.join(base_dir, data_dir))
rmtree(os.path.join(base_dir, subset_dir))
os.mkdir(os.path.join(base_dir, subset_dir))
# delete the existing channel_norm.feather and pixel_thresh.feather
os.remove(channel_norm_path)
os.remove(pixel_thresh_path)
# create variable for storing 99.9% values
quant_dat = pd.DataFrame()
# find all the FOV files in the full data and subsetted directories
# NOTE: this handles the case where the data file was written, but not the subset file
fovs_sub = io_utils.list_files(os.path.join(base_dir, subset_dir), substrs='.feather')
fovs_data = io_utils.list_files(os.path.join(base_dir, data_dir), substrs='.feather')
# intersect the two fovs lists together (if a FOV appears in one but not the other, regenerate)
fovs_full = list(set(fovs_sub).intersection(fovs_data))
# trim the .feather suffix from the fovs in the subsetted directory
fovs_full = io_utils.remove_file_extensions(fovs_full)
# define the list of FOVs for preprocessing
# NOTE: if an existing FOV is already corrupted, future steps will discard it
fovs_list = list(set(fovs).difference(set(fovs_full)))
# if there are no FOVs left to preprocess don't run function
if len(fovs_list) == 0:
print("There are no more FOVs to preprocess, skipping")
return
# if the process is only partially complete, inform the user of restart
if len(fovs_list) < len(fovs):
print("Restarting preprocessing from FOV %s, "
"%d fovs left to process" % (fovs_list[0], len(fovs_list)))
# check to make sure correct channels were specified
pixel_cluster_utils.check_for_modified_channels(
tiff_dir=tiff_dir,
test_fov=fovs[0],
img_sub_folder=img_sub_folder,
channels=channels
)
# load existing channel_norm_path if exists, otherwise generate
if not os.path.exists(channel_norm_path):
# compute channel percentiles
channel_norm_df = pixel_cluster_utils.calculate_channel_percentiles(
tiff_dir=tiff_dir,
fovs=fovs,
channels=channels,
img_sub_folder=img_sub_folder,
percentile=channel_percentile
)
else:
# load previously generated output
channel_norm_df = feather.read_dataframe(channel_norm_path)
# load existing pixel_thresh_path if exists, otherwise generate
if not os.path.exists(pixel_thresh_path):
# compute pixel percentiles
pixel_thresh_val = pixel_cluster_utils.calculate_pixel_intensity_percentile(
tiff_dir=tiff_dir, fovs=fovs, channels=channels,
img_sub_folder=img_sub_folder, channel_percentiles=channel_norm_df
)
pixel_thresh_df = pd.DataFrame({'pixel_thresh_val': [pixel_thresh_val]})
else:
pixel_thresh_df = feather.read_dataframe(pixel_thresh_path)
pixel_thresh_val = pixel_thresh_df['pixel_thresh_val'].values[0]
# define the partial function to iterate over
fov_data_func = partial(
preprocess_fov, base_dir, tiff_dir, data_dir, subset_dir,
seg_dir, seg_suffix, img_sub_folder, is_mibitiff, channels, blur_factor,
subset_proportion, pixel_thresh_val, seed, channel_norm_df
)
# define variable to keep track of number of fovs processed
fovs_processed = 0
# define the columns to drop for 99.9% normalization
cols_to_drop = ['fov', 'row_index', 'column_index']
# account for segmentation_label if seg_dir is set
if seg_dir:
cols_to_drop.append('segmentation_label')
if multiprocess:
# define the multiprocessing context
with multiprocessing.get_context('spawn').Pool(batch_size) as fov_data_pool:
# asynchronously generate and save the pixel matrices per FOV
# NOTE: fov_data_pool should NOT operate on quant_dat since that is a shared resource
for fov_batch in [fovs_list[i:(i + batch_size)]
for i in range(0, len(fovs_list), batch_size)]:
fov_data_batch = fov_data_pool.map(fov_data_func, fov_batch)
# compute the 99.9% quantile values for each FOV
for pixel_mat_data in fov_data_batch:
# retrieve the FOV name, note that there will only be one per FOV DataFrame
fov = pixel_mat_data['fov'].unique()[0]
# drop the metadata columns and generate the 99.9% quantile values for the FOV
fov_full_pixel_data = pixel_mat_data.drop(columns=cols_to_drop)
quant_dat[fov] = fov_full_pixel_data.replace(
0, np.nan
).quantile(q=0.999, axis=0)
# update number of fovs processed
fovs_processed += len(fov_batch)
print("Processed %d fovs" % fovs_processed)
else:
for fov in fovs_list:
pixel_mat_data = fov_data_func(fov)
# drop the metadata columns and generate the 99.9% quantile values for the FOV
fov_full_pixel_data = pixel_mat_data.drop(columns=cols_to_drop)
quant_dat[fov] = fov_full_pixel_data.replace(0, np.nan).quantile(q=0.999, axis=0)
# update number of fovs processed
fovs_processed += 1
# update every 10 FOVs, or at the very end
if fovs_processed % 10 == 0 or fovs_processed == len(fovs_list):
print("Processed %d fovs" % fovs_processed)
# get mean 99.9% across all fovs for all markers
mean_quant = pd.DataFrame(quant_dat.mean(axis=1))
# save 99.9% normalization values
feather.write_dataframe(mean_quant.T,
os.path.join(base_dir, norm_vals_name),
compression='uncompressed')