Skip to content

Commit

Permalink
owtsne: Fix crash on data containing NaNs, add sparse
Browse files Browse the repository at this point in the history
  • Loading branch information
pavlin-policar committed Sep 22, 2023
1 parent fc33050 commit 6d9a3ca
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 26 deletions.
12 changes: 0 additions & 12 deletions Orange/projection/manifold.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from itertools import chain

import numpy as np
import scipy.sparse as sp
from scipy.linalg import eigh as lapack_eigh
from scipy.sparse.linalg import eigsh as arpack_eigh
import sklearn.manifold as skl_manifold
Expand Down Expand Up @@ -233,11 +232,6 @@ def proj_variable(i):
metas=table.domain.metas)

def transform(self, X: np.ndarray, learning_rate=1, **kwargs) -> openTSNE.PartialTSNEEmbedding:
if sp.issparse(X):
raise TypeError(
"A sparse matrix was passed, but dense data is required. Use "
"X.toarray() to convert to a dense numpy array."
)
if isinstance(self.embedding_.affinities, openTSNE.affinity.Multiscale):
perplexity = kwargs.pop("perplexity", False)
if perplexity:
Expand Down Expand Up @@ -415,12 +409,6 @@ def __init__(self, n_components=2, perplexity=30, learning_rate="auto",
self.random_state = random_state

def compute_affinities(self, X):
# Sparse data are not supported
if sp.issparse(X):
raise TypeError(
"A sparse matrix was passed, but dense data is required. Use "
"X.toarray() to convert to a dense numpy array."
)

# Build up the affinity matrix, using multiscale if needed
if self.multiscale:
Expand Down
41 changes: 31 additions & 10 deletions Orange/widgets/unsupervised/owtsne.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,10 @@ def error(msg):
"distance matrix is provided"
)

if self.data is not None and self.data.is_sparse():
if self.normalize:
error("Data normalization is not supported for sparse data")

return self


Expand Down Expand Up @@ -445,6 +449,23 @@ def __init__(self):
self.tsne_embedding = None # type: Optional[manifold.TSNEModel]
self.iterations_done = 0 # type: int

@property
def normalize_(self):
    """Return whether normalization should effectively be applied.

    The user-facing ``normalize`` setting is overridden to ``False`` when a
    distance matrix is present or when the data are sparse.
    """
    requested = self.normalize
    has_distances = self.distance_matrix is not None
    has_sparse_data = self.data is not None and self.data.is_sparse()
    if has_distances or has_sparse_data:
        return False
    return requested

@property
def use_pca_preprocessing_(self):
    """Return whether PCA preprocessing should effectively be applied.

    The user-facing ``use_pca_preprocessing`` setting is overridden to
    ``False`` when a distance matrix is present.
    """
    requested = self.use_pca_preprocessing
    return requested if self.distance_matrix is None else False

@property
def effective_data(self):
    """Return the data transformed into a domain of the effective variables."""
    to_domain = self.data.transform
    return to_domain(Domain(self.effective_variables))
Expand All @@ -457,7 +478,7 @@ def _add_controls_start_box(self):
self.preprocessing_box = gui.vBox(self.controlArea, box="Preprocessing")
self.normalize_cbx = gui.checkBox(
self.preprocessing_box, self, "normalize", "Normalize data",
callback=self._invalidate_normalized_data, stateWhenDisabled=False,
callback=self._normalize_data_changed, stateWhenDisabled=False,
)
self.pca_preprocessing_cbx = gui.checkBox(
self.preprocessing_box, self, "use_pca_preprocessing", "Apply PCA preprocessing",
Expand Down Expand Up @@ -519,10 +540,10 @@ def _add_controls_start_box(self):
# GUI control callbacks
def _normalize_data_changed(self):
    """Invalidate the normalized data when the normalize checkbox changes.

    The change is only acted upon when no distance matrix is provided and
    the data are not sparse.
    """
    # We only care about the normalization checkbox if there is no distance
    # matrix provided and if the data are not sparse. This is not user-
    # settable anyway, but is triggered when we programmatically
    # enable/disable the checkbox in `enable_controls`.
    # `self.data` may be None (the `normalize_` property guards for this as
    # well), so check it before asking whether the data are sparse.
    data_is_sparse = self.data is not None and self.data.is_sparse()
    if self.distance_matrix is None and not data_is_sparse:
        self._invalidate_normalized_data()

def _pca_preprocessing_changed(self):
Expand Down Expand Up @@ -856,7 +877,7 @@ def enable_controls(self):
)

# Disable slider parent, because we want to disable the labels too
self.pca_component_slider.parent().setEnabled(self.use_pca_preprocessing)
self.pca_component_slider.parent().setEnabled(self.use_pca_preprocessing_)

# Disable the perplexity spin box if multiscale is turned on
self.perplexity_spin.setDisabled(self.multiscale)
Expand Down Expand Up @@ -904,10 +925,10 @@ def run(self):
# Preprocessed data
preprocessed_data=self.preprocessed_data,
# Normalization
normalize=self.normalize,
normalize=self.normalize_,
normalized_data=self.normalized_data,
# PCA preprocessing
use_pca_preprocessing=self.use_pca_preprocessing,
use_pca_preprocessing=self.use_pca_preprocessing_,
pca_components=self.pca_components,
pca_projection=self.pca_projection,
# t-SNE parameters
Expand All @@ -931,14 +952,14 @@ def __ensure_task_same_for_preprocessing(self, task: Task):
len(task.preprocessed_data) == len(self.data)

def __ensure_task_same_for_normalization(self, task: Task):
    """Assert that the task's normalization state matches the widget's.

    The task is compared against the effective ``normalize_`` setting rather
    than the raw ``normalize`` checkbox value, because the two differ when
    normalization is overridden (distance matrix or sparse data).
    """
    assert task.normalize == self.normalize_
    # The normalized data are only checked when normalization is on and the
    # distances were not precomputed
    if task.normalize and task.distance_metric != "precomputed":
        assert task.data is self.data
        assert isinstance(task.normalized_data, Table) and \
            len(task.normalized_data) == len(self.data)

def __ensure_task_same_for_pca(self, task: Task):
assert task.use_pca_preprocessing == self.use_pca_preprocessing
assert task.use_pca_preprocessing == self.use_pca_preprocessing_
if task.use_pca_preprocessing and task.distance_metric != "precomputed":
assert task.data is self.data
assert task.pca_components == self.pca_components
Expand Down
110 changes: 106 additions & 4 deletions Orange/widgets/unsupervised/tests/test_owtsne.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,14 +350,23 @@ def test_invalidation_flow(self):
# set global structure "on" (after the embedding is computed)
w.controls.multiscale.setChecked(False)
self.send_signal(w.Inputs.data, self.data)

# By default, t-SNE is smart and disables PCA preprocessing if the
# number of features is too low. Since we are testing with the iris
# data set, we want to force t-SNE to use PCA preprocessing.
w.controls.use_pca_preprocessing.setChecked(True)
self.widget.run_button.click()

self.wait_until_finished()
self.assertFalse(self.widget.Information.modified.is_shown())
# All the embedding components should be computed
self.assertIsNotNone(w.preprocessed_data)
self.assertIsNotNone(w.normalized_data)
self.assertIsNotNone(w.pca_projection)
self.assertIsNotNone(w.affinities)
self.assertIsNotNone(w.tsne_embedding)
# All the invalidation flags should be set to false
self.assertFalse(w._invalidated.preprocessed_data)
self.assertFalse(w._invalidated.normalized_data)
self.assertFalse(w._invalidated.pca_projection)
self.assertFalse(w._invalidated.affinities)
Expand All @@ -368,13 +377,15 @@ def test_invalidation_flow(self):
self.assertTrue(self.widget.Information.modified.is_shown())
# Setting `multiscale` to true should set the invalidate flags for
# the affinities and embedding, but not the pca_projection
self.assertFalse(w._invalidated.preprocessed_data)
self.assertFalse(w._invalidated.normalized_data)
self.assertFalse(w._invalidated.pca_projection)
self.assertTrue(w._invalidated.affinities)
self.assertTrue(w._invalidated.tsne_embedding)

# The flags should now be set, but the embedding should still be
# available when selecting a subset of data and such
self.assertIsNotNone(w.preprocessed_data)
self.assertIsNotNone(w.normalized_data)
self.assertIsNotNone(w.pca_projection)
self.assertIsNotNone(w.affinities)
Expand Down Expand Up @@ -472,6 +483,9 @@ def test_distance_matrix_not_symmetric(self):
self.send_signal(w.Inputs.distances, DistMatrix([[1, 2, 3], [4, 5, 6]]))
self.assertTrue(w.Error.distance_matrix_not_symmetric.is_shown())

self.send_signal(w.Inputs.distances, DistMatrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))
self.assertTrue(w.Error.distance_matrix_not_symmetric.is_shown())

self.send_signal(w.Inputs.distances, None)
self.assertFalse(w.Error.distance_matrix_not_symmetric.is_shown())

Expand Down Expand Up @@ -813,6 +827,64 @@ def test_controls_ignored_by_distance_matrix_retain_values_on_table_signal(self)
self.assertTrue(w.perplexity_spin.isEnabled())
self.assertEqual(w.perplexity_spin.value(), 42)

def test_controls_are_properly_disabled_with_sparse_matrix(self):
    """Check which controls are enabled for sparse and for dense input."""
    w = self.widget

    # Normalizing a sparse matrix is disabled, since this would require
    # centering
    sparse_disabled = ["normalize"]
    # PCA preprocessing and supported distance metrics are enabled for
    # sparse matrices
    always_enabled = [
        "use_pca_preprocessing", "distance_metric_idx", "initialization_method_idx"
    ]

    self.send_signal(w.Inputs.data, self.iris.to_sparse())
    self.wait_until_finished()

    for name in sparse_disabled:
        control = getattr(w.controls, name)
        self.assertFalse(control.isEnabled())
    for name in always_enabled:
        control = getattr(w.controls, name)
        self.assertTrue(control.isEnabled())

    # Sending a dense table should enable the previously disabled fields
    self.send_signal(w.Inputs.data, self.iris)
    self.wait_until_finished()

    for name in sparse_disabled:
        control = getattr(w.controls, name)
        self.assertTrue(control.isEnabled())
    for name in always_enabled:
        control = getattr(w.controls, name)
        self.assertTrue(control.isEnabled())

def test_data_containing_nans(self):
    """Run the widget on data with NaNs for every normalize/PCA combination.

    The test makes no assertions on the result; it only exercises each
    combination of the two preprocessing checkboxes.
    """
    # Use a fixed seed so that the fixture, including the positions of the
    # NaNs, is the same on every run
    rng = np.random.RandomState(42)
    x = rng.normal(0, 1, size=(150, 50))
    # Randomly sprinkle a few NaNs into the matrix
    num_nans = 20
    x[rng.randint(0, 150, num_nans), rng.randint(0, 50, num_nans)] = np.nan

    nan_data = Table.from_numpy(Domain.from_numpy(x), x)

    w = self.widget

    self.send_signal(w.Inputs.data, nan_data)
    self.assertTrue(w.controls.normalize.isChecked())
    self.assertTrue(w.controls.use_pca_preprocessing.isChecked())
    w.run_button.click()
    self.wait_until_finished()

    # Disable only normalization
    w.controls.normalize.setChecked(False)
    w.run_button.click()
    self.wait_until_finished()

    # Disable only PCA preprocessing
    w.controls.normalize.setChecked(True)
    w.controls.use_pca_preprocessing.setChecked(False)
    w.run_button.click()
    self.wait_until_finished()

    # Disable both normalization and PCA preprocessing
    w.controls.normalize.setChecked(False)
    w.controls.use_pca_preprocessing.setChecked(False)
    w.run_button.click()
    self.wait_until_finished()


class TestTSNERunner(unittest.TestCase):
@classmethod
Expand All @@ -834,8 +906,9 @@ def test_run_with_normalization_and_pca_preprocessing(self):
)
task = TSNERunner.run(task, state)

self.assertEqual(len(state.set_status.mock_calls), 5)
self.assertEqual(len(state.set_status.mock_calls), 6)
state.set_status.assert_has_calls([
call("Preprocessing data..."),
call("Normalizing data..."),
call("Computing PCA..."),
call("Finding nearest neighbors..."),
Expand All @@ -862,8 +935,9 @@ def test_run_with_normalization(self):
)
task = TSNERunner.run(task, state)

self.assertEqual(len(state.set_status.mock_calls), 4)
self.assertEqual(len(state.set_status.mock_calls), 5)
state.set_status.assert_has_calls([
call("Preprocessing data..."),
call("Normalizing data..."),
call("Finding nearest neighbors..."),
call("Preparing initialization..."),
Expand All @@ -890,8 +964,9 @@ def test_run_with_pca_preprocessing(self):
)
task = TSNERunner.run(task, state)

self.assertEqual(len(state.set_status.mock_calls), 4)
self.assertEqual(len(state.set_status.mock_calls), 5)
state.set_status.assert_has_calls([
call("Preprocessing data..."),
call("Computing PCA..."),
call("Finding nearest neighbors..."),
call("Preparing initialization..."),
Expand Down Expand Up @@ -949,7 +1024,6 @@ def test_run_with_distance_matrix(self):
task = Task(
normalize=False,
use_pca_preprocessing=False,
# data=self.data,
distance_matrix=self.distances,
perplexity=30,
initialization_method="spectral",
Expand Down Expand Up @@ -1064,6 +1138,34 @@ def test_run_with_distance_matrix_ignores_preprocessing(self):
self.assertIsInstance(task.tsne, TSNE)
self.assertIsInstance(task.tsne_embedding, TSNEModel)

def test_run_with_sparse_matrix_ignores_normalization(self):
    """Run the t-SNE runner on sparse data with normalization turned off.

    Checks the reported status messages and that PCA, initialization and
    the embedding are produced while no normalized data are stored.
    """
    state = Mock(is_interruption_requested=Mock(return_value=False))

    sparse_task = Task(
        normalize=False,
        use_pca_preprocessing=True,
        data=self.data.to_sparse(),
        perplexity=30,
        initialization_method="spectral",
        distance_metric="cosine",
    )
    finished = TSNERunner.run(sparse_task, state)

    # No "Normalizing data..." step is expected among the status updates
    expected_statuses = [
        call("Preprocessing data..."),
        call("Computing PCA..."),
        call("Finding nearest neighbors..."),
        call("Preparing initialization..."),
        call("Running optimization..."),
    ]
    self.assertEqual(len(state.set_status.mock_calls), len(expected_statuses))
    state.set_status.assert_has_calls(expected_statuses)

    self.assertIsNone(finished.normalized_data)
    self.assertIsInstance(finished.pca_projection, Table)
    self.assertIsInstance(finished.initialization, np.ndarray)
    self.assertIsInstance(finished.tsne, TSNE)
    self.assertIsInstance(finished.tsne_embedding, TSNEModel)


if __name__ == "__main__":
unittest.main()

0 comments on commit 6d9a3ca

Please sign in to comment.