angelolab · alex-l-kong · Mar 14, 2023 · Feb 14, 2023 · Feb 14, 2023 · Feb 14, 2023
diff --git a/src/ark/phenotyping/cluster_helpers.py b/src/ark/phenotyping/cluster_helpers.py
@@ -69,10 +69,10 @@ def train_som(self, data: pd.DataFrame):
             data (pandas.DataFrame):
                 The input data to train the SOM on.
         """
-
+        # pyFlowSOM.som requires np.float64 input, so cast explicitly for safety
         som_weights = som(
-            data=data.values, xdim=self.xdim, ydim=self.ydim, rlen=self.num_passes,
-            alpha_range=(self.lr_start, self.lr_end), seed=self.seed
+            data=data.values.astype(np.float64), xdim=self.xdim, ydim=self.ydim,
+            rlen=self.num_passes, alpha_range=(self.lr_start, self.lr_end), seed=self.seed
         )
 
         # ensure dimensions of weights are flattened
@@ -110,8 +110,10 @@ def generate_som_clusters(self, external_data: pd.DataFrame) -> np.ndarray:
         for i in np.arange(0, external_data.shape[0], 100):
             # NOTE: this also orders the columns of external_data_sub the same as self.weights
             cluster_labels.append(map_data_to_nodes(
-                self.weights.values,
-                external_data.loc[i:min(i + 99, external_data.shape[0]), weights_cols].values
+                self.weights.values.astype(np.float64),
+                external_data.loc[
+                    i:min(i + 99, external_data.shape[0]), weights_cols
+                ].values.astype(np.float64)
             )[0])
 
         # concat all the results together and return

diff --git a/src/ark/phenotyping/pixie_preprocessing.py b/src/ark/phenotyping/pixie_preprocessing.py
@@ -16,7 +16,7 @@
 multiprocessing.set_start_method('spawn', force=True)
 
 
-def create_fov_pixel_data(fov, channels, img_data, seg_labels, pixel_thresh_val,
+def create_fov_pixel_data(fov, channels, img_data, seg_labels,
                           blur_factor=2, subset_proportion=0.1):
     """Preprocess pixel data for one fov
 
@@ -29,8 +29,6 @@ def create_fov_pixel_data(fov, channels, img_data, seg_labels, pixel_thresh_val,
             Array representing image data for one fov
         seg_labels (numpy.ndarray):
             Array representing segmentation labels for one fov
-        pixel_thresh_val (float):
-            value used to determine per-pixel cutoff for total signal inclusion
         blur_factor (int):
             The sigma to set for the Gaussian blur
         subset_proportion (float):
@@ -65,10 +63,6 @@ def create_fov_pixel_data(fov, channels, img_data, seg_labels, pixel_thresh_val,
         seg_labels_flat = seg_labels.flatten()
         pixel_mat['segmentation_label'] = seg_labels_flat
 
-    # remove any rows with channels with a sum below the threshold
-    rowsums = pixel_mat[channels].sum(axis=1)
-    pixel_mat = pixel_mat.loc[rowsums > pixel_thresh_val, :].reset_index(drop=True)
-
     # normalize the row sums of pixel mat
     pixel_mat = pixel_cluster_utils.normalize_rows(pixel_mat, channels, seg_labels is not None)
 
@@ -80,7 +74,7 @@ def create_fov_pixel_data(fov, channels, img_data, seg_labels, pixel_thresh_val,
 
 def preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix,
                    img_sub_folder, is_mibitiff, channels, blur_factor,
-                   subset_proportion, pixel_thresh_val, seed, channel_norm_df, fov):
+                   subset_proportion, seed, fov):
     """Helper function to read in the FOV-level pixel data, run `create_fov_pixel_data`,
     and save the preprocessed data.
 
@@ -110,12 +104,8 @@ def preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix
             The sigma to set for the Gaussian blur
         subset_proportion (float):
             The proportion of pixels to take from each fov
-        pixel_thresh_val (float):
-            The value to normalize the pixels by
         seed (int):
             The random seed to set for subsetting
-        channel_norm_df (pandas.DataFrame):
-            The channel normalization values to use
         fov (str):
             The name of the FOV to preprocess
 
@@ -149,20 +139,13 @@ def preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix
     # subset for the channel data
     img_data = img_xr.loc[fov, :, :, channels].values.astype(np.float32)
 
-    # create vector for normalizing image data
-    norm_vect = channel_norm_df.iloc[0].values
-    norm_vect = np.array(norm_vect).reshape([1, 1, len(norm_vect)])
-
-    # normalize image data
-    img_data = img_data / norm_vect
-
     # set seed for subsetting
     np.random.seed(seed)
 
     # create the full and subsetted fov matrices
     pixel_mat, pixel_mat_subset = create_fov_pixel_data(
         fov=fov, channels=channels, img_data=img_data, seg_labels=seg_labels,
-        pixel_thresh_val=pixel_thresh_val, blur_factor=blur_factor,
+        blur_factor=blur_factor,
         subset_proportion=subset_proportion
     )
 
@@ -255,34 +238,6 @@ def create_pixel_matrix(fovs, channels, base_dir, tiff_dir, seg_dir,
     if not os.path.exists(os.path.join(base_dir, subset_dir)):
         os.mkdir(os.path.join(base_dir, subset_dir))
 
-    # define path to channel normalization values
-    channel_norm_path = os.path.join(
-        base_dir, pixel_output_dir, 'channel_norm.feather'
-    )
-
-    # define path to pixel normalization values
-    pixel_thresh_path = os.path.join(
-        base_dir, pixel_output_dir, 'pixel_thresh.feather'
-    )
-
-    # reset entire cohort if channels provided are different from ones in existing channel_norm
-    if os.path.exists(channel_norm_path):
-        channel_norm_df = feather.read_dataframe(channel_norm_path)
-
-        if set(channel_norm_df.columns.values) != set(channels):
-            print("New channels provided: overwriting whole cohort")
-
-            # delete the existing data in data_dir and subset_dir
-            rmtree(os.path.join(base_dir, data_dir))
-            os.mkdir(os.path.join(base_dir, data_dir))
-
-            rmtree(os.path.join(base_dir, subset_dir))
-            os.mkdir(os.path.join(base_dir, subset_dir))
-
-            # delete the existing channel_norm.feather and pixel_thresh.feather
-            os.remove(channel_norm_path)
-            os.remove(pixel_thresh_path)
-
     # create variable for storing 99.9% values
     quant_dat = pd.DataFrame()
 
@@ -319,41 +274,11 @@ def create_pixel_matrix(fovs, channels, base_dir, tiff_dir, seg_dir,
         channels=channels
     )
 
-    # load existing channel_norm_path if exists, otherwise generate
-    if not os.path.exists(channel_norm_path):
-        # compute channel percentiles
-        channel_norm_df = pixel_cluster_utils.calculate_channel_percentiles(
-            tiff_dir=tiff_dir,
-            fovs=fovs,
-            channels=channels,
-            img_sub_folder=img_sub_folder,
-            percentile=channel_percentile
-        )
-        # save output
-        feather.write_dataframe(channel_norm_df, channel_norm_path, compression='uncompressed')
-    else:
-        # load previously generated output
-        channel_norm_df = feather.read_dataframe(channel_norm_path)
-
-    # load existing pixel_thresh_path if exists, otherwise generate
-    if not os.path.exists(pixel_thresh_path):
-        # compute pixel percentiles
-        pixel_thresh_val = pixel_cluster_utils.calculate_pixel_intensity_percentile(
-            tiff_dir=tiff_dir, fovs=fovs, channels=channels,
-            img_sub_folder=img_sub_folder, channel_percentiles=channel_norm_df
-        )
-
-        pixel_thresh_df = pd.DataFrame({'pixel_thresh_val': [pixel_thresh_val]})
-        feather.write_dataframe(pixel_thresh_df, pixel_thresh_path, compression='uncompressed')
-    else:
-        pixel_thresh_df = feather.read_dataframe(pixel_thresh_path)
-        pixel_thresh_val = pixel_thresh_df['pixel_thresh_val'].values[0]
-
     # define the partial function to iterate over
     fov_data_func = partial(
         preprocess_fov, base_dir, tiff_dir, data_dir, subset_dir,
         seg_dir, seg_suffix, img_sub_folder, is_mibitiff, channels, blur_factor,
-        subset_proportion, pixel_thresh_val, seed, channel_norm_df
+        subset_proportion, seed
     )
 
     # define variable to keep track of number of fovs processed

diff --git a/tests/phenotyping/pixie_preprocessing_test.py b/tests/phenotyping/pixie_preprocessing_test.py
@@ -65,8 +65,7 @@ def test_create_fov_pixel_data():
 
         # TEST 1: run fov preprocessing for one fov with seg_labels and no blank pixels
         sample_pixel_mat, sample_pixel_mat_subset = pixie_preprocessing.create_fov_pixel_data(
-            fov=fov, channels=chans, img_data=sample_img_data, seg_labels=seg_labels,
-            pixel_thresh_val=1
+            fov=fov, channels=chans, img_data=sample_img_data, seg_labels=seg_labels
         )
 
         # assert the channel names are the same
@@ -87,7 +86,7 @@ def test_create_fov_pixel_data():
 
         # TEST 2: run fov preprocessing for one fov without seg_labels and no blank pixels
         sample_pixel_mat, sample_pixel_mat_subset = pixie_preprocessing.create_fov_pixel_data(
-            fov=fov, channels=chans, img_data=sample_img_data, seg_labels=None, pixel_thresh_val=1
+            fov=fov, channels=chans, img_data=sample_img_data, seg_labels=None
         )
 
         # assert the channel names are the same
@@ -106,28 +105,6 @@ def test_create_fov_pixel_data():
         # NOTE: need to account for rounding if multiplying by 0.1 leads to non-int
         assert round(sample_pixel_mat.shape[0] * 0.1) == sample_pixel_mat_subset.shape[0]
 
-        # TEST 3: run fov preprocessing with a pixel_thresh_val to ensure rows get removed
-        sample_pixel_mat, sample_pixel_mat_subset = pixie_preprocessing.create_fov_pixel_data(
-            fov=fov, channels=chans, img_data=sample_img_data / 1000, seg_labels=seg_labels,
-            pixel_thresh_val=0.5
-        )
-
-        # assert the channel names are the same
-        misc_utils.verify_same_elements(flowsom_chans=sample_pixel_mat.columns.values[:-4],
-                                        provided_chans=chans)
-        misc_utils.verify_same_elements(flowsom_chans=sample_pixel_mat_subset.columns.values[:-4],
-                                        provided_chans=chans)
-
-        # assert all rows sum to 1 (within tolerance because of floating-point errors)
-        assert np.all(np.allclose(sample_pixel_mat.loc[:, chans].sum(axis=1).values, 1))
-
-        # assert we successfully filtered out pixels below pixel_thresh_val
-        assert sample_pixel_mat.shape[0] < (sample_img_data.shape[0] * sample_img_data.shape[1])
-
-        # assert the size of the subsetted DataFrame is less than 0.1 of the preprocessed DataFrame
-        # NOTE: need to account for rounding if multiplying by 0.1 leads to non-int
-        assert round(sample_pixel_mat.shape[0] * 0.1) == sample_pixel_mat_subset.shape[0]
-
         # TODO: add a test where after Gaussian blurring one or more rows in sample_pixel_mat
         # are all 0 after, tested successfully via hard-coding values in create_fov_pixel_data
 
@@ -163,17 +140,12 @@ def test_preprocess_fov(mocker):
             file_name = fov + "_whole_cell.tiff"
             image_utils.save_image(os.path.join(seg_dir, file_name), rand_img)
 
-        channel_norm_df = pd.DataFrame(
-            np.expand_dims(np.repeat(10, repeats=len(chans)), axis=0),
-            columns=chans
-        )
-
         # run the preprocessing for fov0
         # NOTE: don't test the return value, leave that for test_create_pixel_matrix
         pixie_preprocessing.preprocess_fov(
             temp_dir, tiff_dir, 'pixel_mat_data', 'pixel_mat_subsetted',
             seg_dir, '_whole_cell.tiff', 'TIFs', False, ['chan0', 'chan1', 'chan2'],
-            2, 0.1, 1, 42, channel_norm_df, 'fov0'
+            2, 0.1, 42, 'fov0'
         )
 
         fov_data_path = os.path.join(
@@ -212,7 +184,7 @@ def test_preprocess_fov(mocker):
 
 
 def mocked_create_fov_pixel_data(fov, channels, img_data, seg_labels, blur_factor,
-                                 subset_proportion, pixel_thresh_val):
+                                 subset_proportion):
     # create fake data to be compatible with downstream functions
     data = np.random.rand(len(channels) * 5).reshape(5, len(channels))
     df = pd.DataFrame(data, columns=channels)
@@ -233,7 +205,7 @@ def mocked_create_fov_pixel_data(fov, channels, img_data, seg_labels, blur_facto
 
 def mocked_preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix,
                           img_sub_folder, is_mibitiff, channels, blur_factor,
-                          subset_proportion, pixel_thresh_val, seed, channel_norm_df, fov):
+                          subset_proportion, seed, fov):
     # load img_xr from MIBITiff or directory with the fov
     if is_mibitiff:
         img_xr = load_utils.load_imgs_from_mibitiff(
@@ -258,20 +230,13 @@ def mocked_preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg
     # subset for the channel data
     img_data = img_xr.loc[fov, :, :, channels].values.astype(np.float32)
 
-    # create vector for normalizing image data
-    norm_vect = channel_norm_df['norm_val'].values
-    norm_vect = np.array(norm_vect).reshape([1, 1, len(norm_vect)])
-
-    # normalize image data
-    img_data = img_data / norm_vect
-
     # set seed for subsetting
     np.random.seed(seed)
 
     # create the full and subsetted fov matrices
     pixel_mat, pixel_mat_subset = mocked_create_fov_pixel_data(
         fov=fov, channels=channels, img_data=img_data, seg_labels=seg_labels,
-        pixel_thresh_val=pixel_thresh_val, blur_factor=blur_factor,
+        blur_factor=blur_factor,
         subset_proportion=subset_proportion
     )
 
@@ -394,31 +359,6 @@ def test_create_pixel_matrix_base(fovs, chans, sub_dir, seg_dir_include,
                 seg_dir=seg_dir
             )
 
-        # make the channel_norm.feather file if the test requires it
-        # NOTE: pixel_mat_data already created in the previous validation tests
-        if channel_norm_include:
-            # helps test if channel_norm.feather contains a different set of channels
-            norm_chans = [chans[0]] if norm_diff_chan else chans
-            sample_channel_norm_df = pd.DataFrame(
-                np.expand_dims(np.random.rand(len(norm_chans)), axis=0),
-                columns=norm_chans
-            )
-
-            feather.write_dataframe(
-                sample_channel_norm_df,
-                os.path.join(temp_dir, sample_pixel_output_dir, 'channel_norm.feather'),
-                compression='uncompressed'
-            )
-
-        # make the pixel_thresh.feather file if the test requires it
-        if pixel_thresh_include:
-            sample_pixel_thresh_df = pd.DataFrame({'pixel_thresh_val': np.random.rand(1)})
-            feather.write_dataframe(
-                sample_pixel_thresh_df,
-                os.path.join(temp_dir, sample_pixel_output_dir, 'pixel_thresh.feather'),
-                compression='uncompressed'
-            )
-
         # create the pixel matrices
         pixie_preprocessing.create_pixel_matrix(
             fovs=fovs,
@@ -430,30 +370,12 @@ def test_create_pixel_matrix_base(fovs, chans, sub_dir, seg_dir_include,
             multiprocess=multiprocess
         )
 
-        # assert we overwrote the original channel_norm and pixel_thresh files
-        # if new set of channels provided
-        if norm_diff_chan:
-            output_capture = capsys.readouterr().out
-            assert 'New channels provided: overwriting whole cohort' in output_capture
-
         # check that we actually created a data directory
         assert os.path.exists(os.path.join(temp_dir, 'pixel_mat_data'))
 
         # check that we actually created a subsetted directory
         assert os.path.exists(os.path.join(temp_dir, 'pixel_mat_subsetted'))
 
-        # if there wasn't originally a channel_norm.feather or if overwritten, assert one created
-        if not channel_norm_include or norm_diff_chan:
-            assert os.path.exists(
-                os.path.join(temp_dir, sample_pixel_output_dir, 'channel_norm.feather')
-            )
-
-        # if there wasn't originally a pixel_thresh.feather or if overwritten, assert one created
-        if not pixel_thresh_include or norm_diff_chan:
-            assert os.path.exists(
-                os.path.join(temp_dir, sample_pixel_output_dir, 'pixel_thresh.feather')
-            )
-
         # check that we created a norm vals file
         assert os.path.exists(os.path.join(temp_dir, 'channel_norm_post_rowsum.feather'))
 
@@ -520,17 +442,6 @@ def test_create_pixel_matrix_base(fovs, chans, sub_dir, seg_dir_include,
         # generate the data
         mults = [(1 / 2) ** i for i in range(len(chans))]
 
-        sample_channel_norm_df = pd.DataFrame(
-            np.expand_dims(mults, axis=0),
-            columns=chans
-        )
-
-        feather.write_dataframe(
-            sample_channel_norm_df,
-            os.path.join(temp_dir, sample_pixel_output_dir, 'channel_norm.feather'),
-            compression='uncompressed'
-        )
-
         pixie_preprocessing.create_pixel_matrix(
             fovs=fovs,
             channels=chans,