Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Temporarily remove pixel preprocessing normalization for Candace's paper #913

Merged
merged 23 commits into main from temp_norm_purge
Mar 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
8cc13f0
Temporarily purge certain normalization steps for Pixie pixel preproc…
alex-l-kong Feb 14, 2023
f109a43
Pin pyFlowSOM to 0.1.14
alex-l-kong Feb 14, 2023
beedc79
Move back to 0.1.12 (this will be addressed by a different PR)
alex-l-kong Feb 14, 2023
e6fdd1e
Never mind, pin to 0.1.14
alex-l-kong Feb 14, 2023
6bdac58
Remove deterministic flag for testing
alex-l-kong Feb 14, 2023
dfde9c0
Pin pyFlowSOM at 0.1.13
alex-l-kong Feb 15, 2023
bf38338
pyFlowSOM.som requires dtype float64, explicitly cast
alex-l-kong Feb 15, 2023
fdc232e
Merge branch 'main' of https://github.com/angelolab/ark-analysis into…
alex-l-kong Feb 15, 2023
499b03a
Explicitly cast weights and external data to np.float64 before passin…
alex-l-kong Feb 16, 2023
8a06816
Remove saving the channel and pixel feather files
alex-l-kong Feb 16, 2023
3c47ed5
Adjust tests so they don't need to handle channel_norm and pixel_thre…
alex-l-kong Feb 16, 2023
52a6eb3
Merge branch 'main' into temp_norm_purge
alex-l-kong Feb 16, 2023
20cff9c
Remove more commented code
alex-l-kong Feb 16, 2023
79ac387
Merge branch 'main' into temp_norm_purge
alex-l-kong Feb 16, 2023
562b49f
Merge branch 'main' into temp_norm_purge
alex-l-kong Feb 16, 2023
207e8ca
Merge branch 'main' into temp_norm_purge
alex-l-kong Feb 17, 2023
c685d66
Merge branch 'main' into temp_norm_purge
alex-l-kong Feb 17, 2023
bdfc47c
Merge branch 'main' into temp_norm_purge
alex-l-kong Feb 23, 2023
78cc23c
Merge branch 'main' into temp_norm_purge
alex-l-kong Mar 2, 2023
7e5efde
Merge branch 'main' into temp_norm_purge
alex-l-kong Mar 13, 2023
8efc712
Fix errors caused by GitHub merge tool
alex-l-kong Mar 13, 2023
c6751c3
Remove channel_norm_df and pixel_thresh_val
alex-l-kong Mar 14, 2023
96aa412
Remove more references in pixie_preprocessing_test.py
alex-l-kong Mar 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions src/ark/phenotyping/cluster_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ def train_som(self, data: pd.DataFrame):
data (pandas.DataFrame):
The input data to train the SOM on.
"""

# pyFlowSOM.som requires data in np.float64, add type cast for safety purposes
som_weights = som(
data=data.values, xdim=self.xdim, ydim=self.ydim, rlen=self.num_passes,
alpha_range=(self.lr_start, self.lr_end), seed=self.seed
data=data.values.astype(np.float64), xdim=self.xdim, ydim=self.ydim,
rlen=self.num_passes, alpha_range=(self.lr_start, self.lr_end), seed=self.seed
)

# ensure dimensions of weights are flattened
Expand Down Expand Up @@ -110,8 +110,10 @@ def generate_som_clusters(self, external_data: pd.DataFrame) -> np.ndarray:
for i in np.arange(0, external_data.shape[0], 100):
# NOTE: this also orders the columns of external_data_sub the same as self.weights
cluster_labels.append(map_data_to_nodes(
self.weights.values,
external_data.loc[i:min(i + 99, external_data.shape[0]), weights_cols].values
self.weights.values.astype(np.float64),
external_data.loc[
i:min(i + 99, external_data.shape[0]), weights_cols
].values.astype(np.float64)
)[0])

# concat all the results together and return
Expand Down
83 changes: 4 additions & 79 deletions src/ark/phenotyping/pixie_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
multiprocessing.set_start_method('spawn', force=True)


def create_fov_pixel_data(fov, channels, img_data, seg_labels, pixel_thresh_val,
def create_fov_pixel_data(fov, channels, img_data, seg_labels,
blur_factor=2, subset_proportion=0.1):
"""Preprocess pixel data for one fov

Expand All @@ -29,8 +29,6 @@ def create_fov_pixel_data(fov, channels, img_data, seg_labels, pixel_thresh_val,
Array representing image data for one fov
seg_labels (numpy.ndarray):
Array representing segmentation labels for one fov
pixel_thresh_val (float):
value used to determine per-pixel cutoff for total signal inclusion
blur_factor (int):
The sigma to set for the Gaussian blur
subset_proportion (float):
Expand Down Expand Up @@ -65,10 +63,6 @@ def create_fov_pixel_data(fov, channels, img_data, seg_labels, pixel_thresh_val,
seg_labels_flat = seg_labels.flatten()
pixel_mat['segmentation_label'] = seg_labels_flat

# remove any rows with channels with a sum below the threshold
rowsums = pixel_mat[channels].sum(axis=1)
pixel_mat = pixel_mat.loc[rowsums > pixel_thresh_val, :].reset_index(drop=True)

# normalize the row sums of pixel mat
pixel_mat = pixel_cluster_utils.normalize_rows(pixel_mat, channels, seg_labels is not None)

Expand All @@ -80,7 +74,7 @@ def create_fov_pixel_data(fov, channels, img_data, seg_labels, pixel_thresh_val,

def preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix,
img_sub_folder, is_mibitiff, channels, blur_factor,
subset_proportion, pixel_thresh_val, seed, channel_norm_df, fov):
subset_proportion, seed, fov):
"""Helper function to read in the FOV-level pixel data, run `create_fov_pixel_data`,
and save the preprocessed data.

Expand Down Expand Up @@ -110,12 +104,8 @@ def preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix
The sigma to set for the Gaussian blur
subset_proportion (float):
The proportion of pixels to take from each fov
pixel_thresh_val (float):
The value to normalize the pixels by
seed (int):
The random seed to set for subsetting
channel_norm_df (pandas.DataFrame):
The channel normalization values to use
fov (str):
The name of the FOV to preprocess

Expand Down Expand Up @@ -149,20 +139,13 @@ def preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix
# subset for the channel data
img_data = img_xr.loc[fov, :, :, channels].values.astype(np.float32)

# create vector for normalizing image data
norm_vect = channel_norm_df.iloc[0].values
norm_vect = np.array(norm_vect).reshape([1, 1, len(norm_vect)])

# normalize image data
img_data = img_data / norm_vect

# set seed for subsetting
np.random.seed(seed)

# create the full and subsetted fov matrices
pixel_mat, pixel_mat_subset = create_fov_pixel_data(
fov=fov, channels=channels, img_data=img_data, seg_labels=seg_labels,
pixel_thresh_val=pixel_thresh_val, blur_factor=blur_factor,
blur_factor=blur_factor,
subset_proportion=subset_proportion
)

Expand Down Expand Up @@ -255,34 +238,6 @@ def create_pixel_matrix(fovs, channels, base_dir, tiff_dir, seg_dir,
if not os.path.exists(os.path.join(base_dir, subset_dir)):
os.mkdir(os.path.join(base_dir, subset_dir))

# define path to channel normalization values
channel_norm_path = os.path.join(
base_dir, pixel_output_dir, 'channel_norm.feather'
)

# define path to pixel normalization values
pixel_thresh_path = os.path.join(
base_dir, pixel_output_dir, 'pixel_thresh.feather'
)

# reset entire cohort if channels provided are different from ones in existing channel_norm
if os.path.exists(channel_norm_path):
channel_norm_df = feather.read_dataframe(channel_norm_path)

if set(channel_norm_df.columns.values) != set(channels):
print("New channels provided: overwriting whole cohort")

# delete the existing data in data_dir and subset_dir
rmtree(os.path.join(base_dir, data_dir))
os.mkdir(os.path.join(base_dir, data_dir))

rmtree(os.path.join(base_dir, subset_dir))
os.mkdir(os.path.join(base_dir, subset_dir))

# delete the existing channel_norm.feather and pixel_thresh.feather
os.remove(channel_norm_path)
os.remove(pixel_thresh_path)

# create variable for storing 99.9% values
quant_dat = pd.DataFrame()

Expand Down Expand Up @@ -319,41 +274,11 @@ def create_pixel_matrix(fovs, channels, base_dir, tiff_dir, seg_dir,
channels=channels
)

# load existing channel_norm_path if exists, otherwise generate
if not os.path.exists(channel_norm_path):
# compute channel percentiles
channel_norm_df = pixel_cluster_utils.calculate_channel_percentiles(
tiff_dir=tiff_dir,
fovs=fovs,
channels=channels,
img_sub_folder=img_sub_folder,
percentile=channel_percentile
)
# save output
feather.write_dataframe(channel_norm_df, channel_norm_path, compression='uncompressed')
else:
# load previously generated output
channel_norm_df = feather.read_dataframe(channel_norm_path)

# load existing pixel_thresh_path if exists, otherwise generate
if not os.path.exists(pixel_thresh_path):
# compute pixel percentiles
pixel_thresh_val = pixel_cluster_utils.calculate_pixel_intensity_percentile(
tiff_dir=tiff_dir, fovs=fovs, channels=channels,
img_sub_folder=img_sub_folder, channel_percentiles=channel_norm_df
)

pixel_thresh_df = pd.DataFrame({'pixel_thresh_val': [pixel_thresh_val]})
feather.write_dataframe(pixel_thresh_df, pixel_thresh_path, compression='uncompressed')
else:
pixel_thresh_df = feather.read_dataframe(pixel_thresh_path)
pixel_thresh_val = pixel_thresh_df['pixel_thresh_val'].values[0]

# define the partial function to iterate over
fov_data_func = partial(
preprocess_fov, base_dir, tiff_dir, data_dir, subset_dir,
seg_dir, seg_suffix, img_sub_folder, is_mibitiff, channels, blur_factor,
subset_proportion, pixel_thresh_val, seed, channel_norm_df
subset_proportion, seed
)

# define variable to keep track of number of fovs processed
Expand Down
101 changes: 6 additions & 95 deletions tests/phenotyping/pixie_preprocessing_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,7 @@ def test_create_fov_pixel_data():

# TEST 1: run fov preprocessing for one fov with seg_labels and no blank pixels
sample_pixel_mat, sample_pixel_mat_subset = pixie_preprocessing.create_fov_pixel_data(
fov=fov, channels=chans, img_data=sample_img_data, seg_labels=seg_labels,
pixel_thresh_val=1
fov=fov, channels=chans, img_data=sample_img_data, seg_labels=seg_labels
)

# assert the channel names are the same
Expand All @@ -87,7 +86,7 @@ def test_create_fov_pixel_data():

# TEST 2: run fov preprocessing for one fov without seg_labels and no blank pixels
sample_pixel_mat, sample_pixel_mat_subset = pixie_preprocessing.create_fov_pixel_data(
fov=fov, channels=chans, img_data=sample_img_data, seg_labels=None, pixel_thresh_val=1
fov=fov, channels=chans, img_data=sample_img_data, seg_labels=None
)

# assert the channel names are the same
Expand All @@ -106,28 +105,6 @@ def test_create_fov_pixel_data():
# NOTE: need to account for rounding if multiplying by 0.1 leads to non-int
assert round(sample_pixel_mat.shape[0] * 0.1) == sample_pixel_mat_subset.shape[0]

# TEST 3: run fov preprocessing with a pixel_thresh_val to ensure rows get removed
sample_pixel_mat, sample_pixel_mat_subset = pixie_preprocessing.create_fov_pixel_data(
fov=fov, channels=chans, img_data=sample_img_data / 1000, seg_labels=seg_labels,
pixel_thresh_val=0.5
)

# assert the channel names are the same
misc_utils.verify_same_elements(flowsom_chans=sample_pixel_mat.columns.values[:-4],
provided_chans=chans)
misc_utils.verify_same_elements(flowsom_chans=sample_pixel_mat_subset.columns.values[:-4],
provided_chans=chans)

# assert all rows sum to 1 (within tolerance because of floating-point errors)
assert np.all(np.allclose(sample_pixel_mat.loc[:, chans].sum(axis=1).values, 1))

# assert we successfully filtered out pixels below pixel_thresh_val
assert sample_pixel_mat.shape[0] < (sample_img_data.shape[0] * sample_img_data.shape[1])

# assert the size of the subsetted DataFrame is less than 0.1 of the preprocessed DataFrame
# NOTE: need to account for rounding if multiplying by 0.1 leads to non-int
assert round(sample_pixel_mat.shape[0] * 0.1) == sample_pixel_mat_subset.shape[0]

# TODO: add a test where after Gaussian blurring one or more rows in sample_pixel_mat
# are all 0 after, tested successfully via hard-coding values in create_fov_pixel_data

Expand Down Expand Up @@ -163,17 +140,12 @@ def test_preprocess_fov(mocker):
file_name = fov + "_whole_cell.tiff"
image_utils.save_image(os.path.join(seg_dir, file_name), rand_img)

channel_norm_df = pd.DataFrame(
np.expand_dims(np.repeat(10, repeats=len(chans)), axis=0),
columns=chans
)

# run the preprocessing for fov0
# NOTE: don't test the return value, leave that for test_create_pixel_matrix
pixie_preprocessing.preprocess_fov(
temp_dir, tiff_dir, 'pixel_mat_data', 'pixel_mat_subsetted',
seg_dir, '_whole_cell.tiff', 'TIFs', False, ['chan0', 'chan1', 'chan2'],
2, 0.1, 1, 42, channel_norm_df, 'fov0'
2, 0.1, 42, 'fov0'
)

fov_data_path = os.path.join(
Expand Down Expand Up @@ -212,7 +184,7 @@ def test_preprocess_fov(mocker):


def mocked_create_fov_pixel_data(fov, channels, img_data, seg_labels, blur_factor,
subset_proportion, pixel_thresh_val):
subset_proportion):
# create fake data to be compatible with downstream functions
data = np.random.rand(len(channels) * 5).reshape(5, len(channels))
df = pd.DataFrame(data, columns=channels)
Expand All @@ -233,7 +205,7 @@ def mocked_create_fov_pixel_data(fov, channels, img_data, seg_labels, blur_facto

def mocked_preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix,
img_sub_folder, is_mibitiff, channels, blur_factor,
subset_proportion, pixel_thresh_val, seed, channel_norm_df, fov):
subset_proportion, seed, fov):
# load img_xr from MIBITiff or directory with the fov
if is_mibitiff:
img_xr = load_utils.load_imgs_from_mibitiff(
Expand All @@ -258,20 +230,13 @@ def mocked_preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg
# subset for the channel data
img_data = img_xr.loc[fov, :, :, channels].values.astype(np.float32)

# create vector for normalizing image data
norm_vect = channel_norm_df['norm_val'].values
norm_vect = np.array(norm_vect).reshape([1, 1, len(norm_vect)])

# normalize image data
img_data = img_data / norm_vect

# set seed for subsetting
np.random.seed(seed)

# create the full and subsetted fov matrices
pixel_mat, pixel_mat_subset = mocked_create_fov_pixel_data(
fov=fov, channels=channels, img_data=img_data, seg_labels=seg_labels,
pixel_thresh_val=pixel_thresh_val, blur_factor=blur_factor,
blur_factor=blur_factor,
subset_proportion=subset_proportion
)

Expand Down Expand Up @@ -394,31 +359,6 @@ def test_create_pixel_matrix_base(fovs, chans, sub_dir, seg_dir_include,
seg_dir=seg_dir
)

# make the channel_norm.feather file if the test requires it
# NOTE: pixel_mat_data already created in the previous validation tests
if channel_norm_include:
# helps test if channel_norm.feather contains a different set of channels
norm_chans = [chans[0]] if norm_diff_chan else chans
sample_channel_norm_df = pd.DataFrame(
np.expand_dims(np.random.rand(len(norm_chans)), axis=0),
columns=norm_chans
)

feather.write_dataframe(
sample_channel_norm_df,
os.path.join(temp_dir, sample_pixel_output_dir, 'channel_norm.feather'),
compression='uncompressed'
)

# make the pixel_thresh.feather file if the test requires it
if pixel_thresh_include:
sample_pixel_thresh_df = pd.DataFrame({'pixel_thresh_val': np.random.rand(1)})
feather.write_dataframe(
sample_pixel_thresh_df,
os.path.join(temp_dir, sample_pixel_output_dir, 'pixel_thresh.feather'),
compression='uncompressed'
)

# create the pixel matrices
pixie_preprocessing.create_pixel_matrix(
fovs=fovs,
Expand All @@ -430,30 +370,12 @@ def test_create_pixel_matrix_base(fovs, chans, sub_dir, seg_dir_include,
multiprocess=multiprocess
)

# assert we overwrote the original channel_norm and pixel_thresh files
# if new set of channels provided
if norm_diff_chan:
output_capture = capsys.readouterr().out
assert 'New channels provided: overwriting whole cohort' in output_capture

# check that we actually created a data directory
assert os.path.exists(os.path.join(temp_dir, 'pixel_mat_data'))

# check that we actually created a subsetted directory
assert os.path.exists(os.path.join(temp_dir, 'pixel_mat_subsetted'))

# if there wasn't originally a channel_norm.feather or if overwritten, assert one created
if not channel_norm_include or norm_diff_chan:
assert os.path.exists(
os.path.join(temp_dir, sample_pixel_output_dir, 'channel_norm.feather')
)

# if there wasn't originally a pixel_thresh.feather or if overwritten, assert one created
if not pixel_thresh_include or norm_diff_chan:
assert os.path.exists(
os.path.join(temp_dir, sample_pixel_output_dir, 'pixel_thresh.feather')
)

# check that we created a norm vals file
assert os.path.exists(os.path.join(temp_dir, 'channel_norm_post_rowsum.feather'))

Expand Down Expand Up @@ -520,17 +442,6 @@ def test_create_pixel_matrix_base(fovs, chans, sub_dir, seg_dir_include,
# generate the data
mults = [(1 / 2) ** i for i in range(len(chans))]

sample_channel_norm_df = pd.DataFrame(
np.expand_dims(mults, axis=0),
columns=chans
)

feather.write_dataframe(
sample_channel_norm_df,
os.path.join(temp_dir, sample_pixel_output_dir, 'channel_norm.feather'),
compression='uncompressed'
)

pixie_preprocessing.create_pixel_matrix(
fovs=fovs,
channels=chans,
Expand Down